In [1]:
import pandas as pd
from statsmodels.stats.proportion import proportion_confint as ci
from natsort import natsorted
import numpy as np
from scipy.stats import percentileofscore


def get_clean_cape_dataframe(filepath):
    """
    Load the cleaned data from a CSV file.

    Args:
        filepath: str
        path of the CSV file to read
    Returns:
        dataframe with the file's contents, parsed by ``pd.read_csv`` using
        its default options
    """
    return pd.read_csv(filepath)


def get_depts_and_courses_dictionary(df):
    """
    Build a mapping from each department to the courses it offers.

    ``Course_ID`` is split on whitespace; the first token is taken as the
    department and the second as the course number. Departments and the
    courses inside each department are returned in natural sort order.

    Args:
        df: dataframe with a ``Course_ID`` column

    Returns:
        depts_courses: dict mapping department -> list of course numbers

    Examples:
    --------
        # >>> df = get_clean_cape_dataframe(filepath)
        # >>> get_depts_and_courses_dictionary(df)
        depts_courses = {'AAS': ['10', '11', '170', '190'], 'ANAR': ['100', '114', '115', '116', '135', '135S', '143']}

    """
    # One row per distinct (dept, course) pair.
    parts = df['Course_ID'].str.split(expand=True)
    parts = parts.rename(columns={0: 'dept', 1: 'course'}).drop_duplicates()

    dept_names = natsorted(parts.dept.unique())
    indexed = parts.set_index(['dept', 'course']).sort_index()

    depts_courses = {}
    for dept in dept_names:
        # Selecting on the first index level leaves the course numbers as index.
        depts_courses[dept] = natsorted(indexed.loc[dept].index)

    return depts_courses


def get_time_dicionary(df):
    """
    Summarise the time spent outside of class for every course.

    Rows are grouped by ``Course_ID`` and the ``time`` column is averaged over
    all terms. Each course average is then compared with the average of its
    department and with the average over all courses.

    Args:
        df: dataframe with at least the columns ``Course_ID`` (department code
            and course number separated by whitespace, e.g. ``'AAS 10'``) and
            ``time``. The dataframe passed in is not modified.

    Returns:
        tuple ``(time, gb)``

        time: dict mapping Course_ID -> ``{'expected', 'statement', 'color'}``
            where ``expected`` is the course's mean time as a string and
            ``statement``/``color`` classify how far the course lies from its
            department average (see below).
        gb: dataframe indexed by Course_ID with the columns ``time``,
            ``depart_avg_time``, ``total_avg_time``, ``depart_time_diff`` and
            ``total_time_diff``. ``time`` and ``depart_avg_time`` are rounded
            to 2 decimals.

    Notes:
        A course is flagged 'red' when its time exceeds the department average
        by more than one standard deviation of the per-course averages,
        'green' when it is below the department average by more than that
        amount, and 'black' otherwise (including the exact boundary and
        courses whose deviation cannot be computed).
        Missing ``time`` values are skipped when averaging.

    Examples:
    --------
        # >>> df = get_clean_cape_dataframe(filepath)
        # >>> get_time_dicionary(df)

        new_df:
                time  depart_avg_time  ...  depart_time_diff  total_time_diff
        Course_ID                         ...
        AAS 10     3.80             3.50  ...              0.30        -1.747582
        AAS 11     4.15             3.50  ...              0.65        -1.397582
        AAS 170    3.06             3.50  ...             -0.44        -2.487582
        AAS 190    2.18             3.50  ...             -1.32        -3.367582

        time:
        {'AAS 10': {'expected': '3.8', 'statement': 'This course will take an average amount of time outside of class.',
        'color': 'black'}, 'AAS 11': {'expected': '4.15',
        'statement': 'This course will take an average amount of time outside of class.', 'color': 'black'}
    """
    # Copy so that the helper columns added below never touch the caller's frame
    # (and do not trigger pandas' SettingWithCopyWarning).
    df = df[['Course_ID', 'time']].copy()
    df['department'] = df['Course_ID'].str.split().str[0]

    # Department average over all rows of that department. Grouping on the
    # department code counts every row exactly once; a prefix match such as
    # startswith('CSE 10') would also pick up 'CSE 100', 'CSE 101', ...
    dept_avg = df.groupby('department')['time'].mean()
    df['depart_avg_time'] = df['department'].map(dept_avg)

    # Average over terms. Only the numeric columns are aggregated because the
    # string 'department' column cannot be averaged.
    gb = df.groupby('Course_ID')[['time', 'depart_avg_time']].mean().round(2)

    # Mean and (sample) standard deviation of the per-course averages.
    total_average = float(gb['time'].mean())
    total_sd = float(gb['time'].std())

    gb['total_avg_time'] = total_average
    gb['depart_time_diff'] = gb['time'] - gb['depart_avg_time']
    gb['total_time_diff'] = gb['time'] - total_average

    # warning statements
    warning = (
        'This course will take more time outside of class than average.'
    )
    normal = (
        'This course will take an average amount of time outside of class.'
    )
    relax = (
        'This course might take less time outside of class than average.'
    )

    def get_statement_and_color(dev, sd):
        # Comparisons with NaN are always False, so an undefined deviation or
        # spread falls through to the neutral statement.
        if dev > sd:
            return warning, 'red'
        if dev < -sd:
            return relax, 'green'
        return normal, 'black'

    time = {}
    for course, row in gb.iterrows():
        statement, color = get_statement_and_color(row['depart_time_diff'], total_sd)
        time[course] = {'expected': str(float(row['time'])),
                        'statement': statement, 'color': color}

    return time, gb


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
get_time_dictionary = get_time_dicionary


def get_grade_dictionary(df):
    """
    Compare the expected and the actual GPA of every course.

    Rows are grouped by ``Course_ID``; ``expected_gpa`` and ``actual_gpa`` are
    averaged over all terms and rounded to 2 decimals. ``dev`` is the actual
    minus the expected average.

    Args:
        df: dataframe with at least the columns ``Course_ID``,
            ``expected_gpa`` and ``actual_gpa``

    Returns:
        tuple ``(grade, gb)``

        grade: dict mapping Course_ID -> ``{'expected', 'color', 'statement'}``.
            ``expected`` is the letter grade corresponding to the course's
            average *actual* GPA ('N/A' when that average is missing).
            ``color``/``statement`` are 'green' when ``dev > 0.4``, 'red' when
            ``dev < -0.4`` and 'black' otherwise.
        gb: dataframe indexed by Course_ID with the columns ``expected_gpa``,
            ``actual_gpa`` and ``dev``.

    Examples:
    --------
        # >>> df = get_clean_cape_dataframe(filepath)
        # >>> get_grade_dictionary(df)

        new_df:
                        expected_gpa  actual_gpa   dev
        Course_ID
        AAS 10             3.70        3.68 -0.02
        AAS 11             3.84        3.71 -0.13
        AAS 170            3.88        3.80 -0.08
        AAS 190            3.93        4.00  0.07

        grade:
        {'AAS 10': {'expected': 'B+', 'color': 'black', 'statement': 'Students tend to get the grade they expect for this course.'},
        'AAS 11': {'expected': 'A-', 'color': 'black', 'statement': 'Students tend to get the grade they expect for this course.'},
        'AAS 170': {'expected': 'A-', 'color': 'black', 'statement': 'Students tend to get the grade they expect for this course.'}

    """
    df = df[['Course_ID', 'expected_gpa', 'actual_gpa']]

    # groupby to get the mean grade and round to 2 decimal places
    gb = df.groupby('Course_ID').mean().round(2)
    # Round the difference as well: subtracting two 2-decimal floats leaves
    # binary noise (2.7 - 2.3 == 0.40000000000000036) that would otherwise
    # push a deviation of exactly 0.4 over the threshold.
    gb['dev'] = (gb['actual_gpa'] - gb['expected_gpa']).round(2)

    # warning statements
    warning = (
        'Students tend to get lower grades than they expect for this course.'
    )
    normal = (
        'Students tend to get the grade they expect for this course.'
    )
    relax = (
        'Students tend to get higher grades than they expect for this course.'
    )

    # Lower bound of each letter grade, highest first.
    grade_cutoffs = (
        (4.0, 'A'),
        (3.7, 'A-'),
        (3.3, 'B+'),
        (3.0, 'B'),
        (2.7, 'B-'),
        (2.3, 'C+'),
        (2.0, 'C'),
        (1.7, 'C-'),
        (1.0, 'D'),
    )

    def GPA_val_to_grade(val):
        # A course without any recorded GPA has no letter grade.
        if pd.isna(val):
            return 'N/A'
        for cutoff, letter in grade_cutoffs:
            if val >= cutoff:
                return letter
        # Anything below the lowest cutoff.
        return 'F'

    def get_statement_and_color(dev):
        if dev > 0.4:
            return relax, 'green'
        if dev < -0.4:
            return warning, 'red'
        return normal, 'black'

    grade = {}
    for course, row in gb.iterrows():
        statement, color = get_statement_and_color(row['dev'])
        grade[course] = {
            'expected': GPA_val_to_grade(row['actual_gpa']),
            'color': color,
            'statement': statement
        }

    return grade, gb


# def get_prof_ranking_dictionary(df):
#     """
#     get the professor ranking dictionary
#     Args:
#         df
#     Returns:
#         ranking: dict
#     """
#     df = (df[['instr', 'course', 'evals', 'instr_weighted_evals']])
#
#     gb = df.groupby(['course', 'instr']).sum()
#
#     gb.loc[:, 'lower'], gb.loc[:, 'upper'] = ci(gb.instr_weighted_evals,
#                                                 gb.evals, method='wilson')
#
#     # populate the dictionary
#     ranking = {}
#     for course, instr in gb.index:
#         professors_sorted = gb.loc[course].sort_values(by='lower',
#                                                        ascending=False)
#         ranking[course] = list(professors_sorted.index)
#
#     return ranking


# Load the cleaned dataset once into the module-level `df`; the filtering cells
# further down read from it. The path is relative to the working directory.
df = get_clean_cape_dataframe('data_clean.csv')
# depts_courses = get_depts_and_courses_dictionary(df)
# print(get_grade_dictionary(df))

# print(type(depts_courses))

# df.to_csv('ECE143_Project/recommend.csv', index=False)

In [2]:
df

Unnamed: 0,instr,term,enroll,evals,rmd_class,rmd_instr,time,Course_ID,Course_Name,expected_grade,expected_gpa,actual_grade,actual_gpa
0,"Butler, Elizabeth Annette",SP23,66,48,93.5,100.0,2.80,AAS 10,Intro/African-American Studies (A),A-,3.84,B+,3.67
1,"Butler, Elizabeth Annette",SP23,20,7,100.0,100.0,2.50,AAS 170,Legacies of Research (A),A-,3.86,A-,3.92
2,"Shtienberg, Gilad",SP23,26,6,100.0,83.3,3.83,ANAR 115,Coastal Geomorphology/Environ (A),B+,3.50,B,3.07
3,"Braswell, Geoffrey E.",SP23,22,9,100.0,100.0,5.17,ANAR 155,Stdy Abrd: Ancient Mesoamerica (A),A,4.00,A,4.00
4,"Hrvoj Mihic, Branka",SP23,22,4,100.0,100.0,2.50,ANBI 111,Human Evolution (A),A,4.00,B-,2.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15400,"Blomstedt, Elizabeth Ann",SP19,20,6,100.0,100.0,4.50,WCWP 100,Academic Writing (B),B+,3.60,B+,3.42
15401,"McCleary, Keith Long",SP19,20,19,84.2,94.4,8.39,WCWP 100,Academic Writing (C),B-,2.95,B,3.17
15402,"Young, Mark T",SP19,20,8,75.0,87.5,8.00,WCWP 100,Academic Writing (F),B,3.13,B+,3.34
15403,"Gagnon, Jeffrey C",SP19,150,126,76.9,91.5,5.21,WCWP 10A,The Writing Course A (0),B,3.11,B,3.21


## Input the weights

Please provide your preferences for each category on a scale of 1 to 10, separated by spaces

(GPA, Time Spent, Exp vs True GPA, % Rec Course, % Rec Prof):

For example, enter: 10 10 5 5 5

In [3]:
# Read the category ratings as one line of whitespace-separated integers.
user_input = input("Enter ratings separated by spaces: ")
ratings = [int(token) for token in user_input.split()]
def weights(ratings):
    '''
    Normalise ratings into weights that sum to 1.

    Args:
        ratings: sequence of numbers, one rating per category.

    Returns:
        list with ``rating / sum(ratings)`` for every rating, in input order;
        an empty list when ``ratings`` is empty.

    Raises:
        ZeroDivisionError: if the ratings are non-empty but sum to zero.
    '''
    # The total does not change while iterating, so compute it once.
    total = sum(ratings)
    try:
        return [rating / total for rating in ratings]
    except ZeroDivisionError:
        raise ZeroDivisionError(
            'ratings sum to zero, so they cannot be normalised into weights'
        ) from None
        

Enter ratings separated by spaces:  1


In [4]:
# NOTE(review): this rebinds the name `weights` from the function defined in the
# previous cell to the list it returns. Running this cell a second time therefore
# calls a list and raises TypeError until the defining cell is executed again.
weights = weights(ratings)
print(weights)

[1.0]


## Input the desired quarters

Please input a space-separated list of desired quarters using the corresponding numbers:

1. Fall
2. Winter
3. Spring
4. Summer Session 1
5. Summer Session 2

For example, entering `1 2 3` will select Fall, Winter, and Spring.

In [10]:
# Read the selected quarter numbers as one line of integers.
# split() without an argument splits on any run of whitespace and ignores
# leading/trailing blanks; split(' ') would yield empty tokens for input such
# as "1  2" or " 1 2" and make int('') raise ValueError.
quarter_input = input("Enter quarter numbers separated by spaces: ")
quarters = [int(token) for token in quarter_input.split()]
quarters

Enter quarter numbers separated by spaces:  1 2


[1, 2]

## Input the desired departments

Please input a space-separated list of desired department codes (for example, `ECE CSE`).

In [11]:
# Read the department codes as one line of text, e.g. "ECE CSE".
user_input = input("Enter department names separated by spaces: ")

# split() with no argument splits on any run of whitespace, so repeated or
# leading/trailing spaces do not produce empty entries.
depts = user_input.split()

# Bare expression: displayed as the cell's output.
depts

Enter department names separated by spaces:  ECE CSE


['ECE', 'CSE']

In [12]:
# Menu number -> two-letter quarter code. The `term` values shown in the data
# look like 'FA19' / 'WI23' / 'SP23', i.e. quarter code followed by the year.
# NOTE(review): the summer codes 'S1'/'S2' are assumed to follow the same
# prefix layout -- confirm against the data.
maps = {1: 'FA', 2: 'WI', 3: 'SP', 4: 'S1', 5: 'S2'}

selected_codes = [maps[q] for q in quarters]
quarter_str = '|'.join(selected_codes)

if selected_codes:
    # Compare the leading two characters of `term` with the selected codes.
    # Unlike an unanchored regex search this cannot match a code in the middle
    # of a term, and isin() yields False (not NaN) for missing terms, so the
    # mask is always a valid boolean indexer.
    df2 = df[df['term'].str[:2].isin(selected_codes)]
else:
    # No quarter selected: keep every row, as an empty pattern did before.
    df2 = df.copy()
df2

Unnamed: 0,instr,term,enroll,evals,rmd_class,rmd_instr,time,Course_ID,Course_Name,expected_grade,expected_gpa,actual_grade,actual_gpa
954,"Butler, Elizabeth Annette",WI23,65,46,93.5,93.3,4.15,AAS 11,Intro Black Diasporic Studies (A),A-,3.84,A-,3.71
955,"Shtienberg, Gilad",WI23,27,9,100.0,100.0,3.61,ANAR 116,Sea Level ChangeIsrael (A),A,4.00,B+,3.62
956,"Braswell, Geoffrey E.",WI23,24,15,100.0,92.9,3.42,ANAR 135,Ancient Mediterranean Civ (A),A,4.00,A-,3.99
957,"Rodriguez, Eric Andrew",WI23,41,23,100.0,100.0,4.07,ANAR 164,Underwater Archaeology (A),A-,3.80,B+,3.63
958,"Non, Amy L",WI23,71,45,84.4,81.8,4.28,ANBI 130,Biology of Inequality (A),B+,3.42,B+,3.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14007,"Cho, Erica",FA19,127,78,81.6,70.7,6.47,VIS 70N,Introduction to Media (A),B+,3.64,B+,3.59
14008,"Tonies, Joshua Vanjaymes",FA19,115,96,97.9,98.9,9.43,VIS 80,Introduction to Studio Major (A),B+,3.67,B+,3.65
14009,"McCleary, Keith Long",FA19,20,20,95.0,100.0,8.70,WCWP 100,Academic Writing (B),B,3.00,B,3.28
14010,"Gagnon, Jeffrey C",FA19,697,398,74.3,86.9,4.77,WCWP 10A,The Writing Course A (0),B+,3.39,B,3.28


In [13]:
# Keep only the rows whose department code (the first whitespace-separated token
# of Course_ID) is one of the requested departments. An exact comparison is used
# instead of a regex substring search so that a code such as 'SE' does not also
# select 'CSE ...' rows and special characters in the input are taken literally.
dept_codes = df2['Course_ID'].str.split().str[0]

# An empty selection keeps every row, as the empty pattern did before.
# .copy() makes df3 independent of df2 so columns can be added to it later
# without pandas' SettingWithCopyWarning.
df3 = (df2[dept_codes.isin(depts)] if depts else df2).copy()
df3

Unnamed: 0,instr,term,enroll,evals,rmd_class,rmd_instr,time,Course_ID,Course_Name,expected_grade,expected_gpa,actual_grade,actual_gpa
1231,"Moshiri, Alexander Niema",WI23,188,133,97.0,98.5,7.82,CSE 100,Advanced Data Structures (A),B+,3.60,B+,3.36
1232,"Moshiri, Alexander Niema",WI23,315,250,98.8,98.4,7.87,CSE 100R,Advanced Data Structures (A),B+,3.68,B+,3.47
1233,"Kane, Daniel Mertz",WI23,425,169,86.7,88.3,12.24,CSE 101,Design & Analysis of Algorithm (A),B,3.10,B,3.08
1234,"Jones, Miles E",WI23,340,142,84.2,94.9,8.51,CSE 105,Theory of Computation (A),B+,3.49,B,3.25
1235,"Bellare, Mihir",WI23,56,11,90.9,90.9,6.14,CSE 107,Intro to Modern Cryptography (A),A-,3.70,B-,2.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13343,"Eldon, John",FA19,163,81,97.1,96.9,6.76,ECE 5,Intro to ECE (A),A-,3.72,B+,3.58
13344,"Gilja, Vikash",FA19,163,81,97.1,100.0,6.76,ECE 5,Intro to ECE (A),A-,3.72,B+,3.58
13345,"Hall, Drew A.",FA19,163,81,97.1,98.4,6.76,ECE 5,Intro to ECE (A),A-,3.72,B+,3.58
13346,"Nguyen, Truong Quang",FA19,163,81,97.1,100.0,6.76,ECE 5,Intro to ECE (A),A-,3.72,B+,3.58


In [None]:
# df3 was produced by boolean indexing; take an explicit copy so the column
# assignments below modify an independent frame instead of a slice.
df3 = df3.copy()

# Non-numeric GPA entries become NaN.
df3['actual_gpa'] = pd.to_numeric(df3['actual_gpa'], errors='coerce')

# Percentile rank of each row's actual GPA among the non-missing GPAs, on a
# 0-100 scale. rank(pct=True) with the default method='average' gives tied
# values their mean rank, which is what percentileofscore(kind='rank') computes,
# but it does so in a single pass instead of re-scanning the whole column for
# every row. Rows with a missing GPA get NaN.
df3['actual_gpa_percentile'] = df3['actual_gpa'].rank(pct=True) * 100