# Import Libraries

In [1]:
import pandas as pd
import os 
from collections import defaultdict
from datetime import date

# Functions

In [2]:
def import_file(file_path):
    """Read an Excel workbook from the ``../data`` directory into a DataFrame.

    Side effect: the process working directory is switched to ``../data``
    (resolved against the current working directory) and is not restored.
    Relative paths used afterwards therefore resolve inside that directory.
    Because the move is relative, a second call in the same session resolves
    ``../data`` again starting from the new location.

    Parameters
    ----------
    file_path : str or path-like
        Workbook to read, resolved after the directory change.

    Returns
    -------
    pandas.DataFrame
        The workbook as parsed by ``pandas.read_excel`` with default options.
    """
    # Step into the data directory; file_path is resolved from there.
    os.chdir('../data')

    return pd.read_excel(file_path)


In [3]:
def cleanup(df_name):
    """Rename the source columns to the analysis naming scheme and drop 'Unnamed: 0'.

    The frame is modified in place and the same object is returned, so the
    variable passed in by the caller also sees the renamed columns.
    Running the function again on an already-cleaned frame is a no-op.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame holding the raw column names.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after renaming/dropping.
    """
    # Raw column name -> analysis column name
    column_renames = {
        'ACAD_YR': 'admit_acad_yr',
        'ACAD_YR3': 'enrl_acad_yr',
        'TERM_ID4': 'enrl_term_id',
        'TERM_DESC': 'enrl_term_desc',
        'Term_id': 'grad_term_id',
        'TERM_ID': 'admit_term_id',
        'ADMIT_TERM': 'admit_term_desc',
        'ADMIT_STRM': 'admit_strm',
        'College/School': 'grad_college',
        'Department': 'grad_department',
        'Major': 'grad_major',
    }

    # Presumably the index column written by an earlier export - confirm.
    leftover_columns = ['Unnamed: 0']

    df_name.rename(columns=column_renames, inplace=True)
    # errors='ignore': a repeated run (column already gone) must not raise KeyError.
    df_name.drop(columns=leftover_columns, errors='ignore', inplace=True)

    return df_name

In [4]:
# Lookup table: letter grade -> grade point value.
# C-, D+ and D- are listed so that historical terms where those grades
# exist (or might exist) can still be converted.
# "P" maps to a negative placeholder instead of a real point value.
grade_dict = {
    "A": 4.00,
    "A-": 3.67,
    "B+": 3.33,
    "B": 3.00,
    "B-": 2.67,
    "C+": 2.33,
    "C": 2.00,
    "C-": 1.67,
    "D+": 1.33,
    "D": 1.00,
    "D-": 0.67,
    "F": 0,
    "P": -1,
}


def grade_convert(grade):
    """Translate a letter grade into its grade point value.

    The value comes from ``grade_dict``; a grade with no entry there
    yields ``None``.
    """
    return grade_dict.get(grade)


def pf_flag(grade):
    """Classify a grade as 'pass' or 'fail'.

    Grades from A through C- plus 'AU' and 'P' count as 'pass';
    every other value is reported as 'fail'.
    """
    passing_grades = ('A', 'A-', 'AU', 'B', 'B+', 'B-', 'C', 'C+', 'C-', 'P')
    return 'pass' if grade in passing_grades else 'fail'


def remove_last_two(num):
    """Drop the last two digits of a term id to obtain the year (201405 -> 2014).

    Uses floor division by 100, which always rounds down.
    """
    year_part = num // 100
    return year_part


def difference(a, b):
    """Return ``a`` minus ``b``; used to compute the years elapsed between two columns."""
    elapsed = a - b
    return elapsed


def lab_identifier(name, prefix):
    """Return 'l' when a course name identifies a lab, otherwise ''.

    A course is flagged when its name contains 'lab' (case-insensitive
    substring match) and its prefix is not 'LBS' or 'EEC'.

    Parameters
    ----------
    name : str or missing value
        Course name. Anything that is not a string (for example None or a
        float NaN) is treated as "no name" and yields ''.
    prefix : str
        Course prefix.

    Returns
    -------
    str
        'l' for a lab course, '' otherwise.
    """
    # Guard first: missing names have no .lower() and would raise otherwise.
    if not isinstance(name, str):
        return ''

    if 'lab' in name.lower() and prefix not in ('LBS', 'EEC'):
        return 'l'
    return ''
  
    
def keep_latest(enrl_term, latest_term):
    """Flag whether a row's enrollment term is the latest term a course was taken.

    Returns True when ``enrl_term`` equals ``latest_term``, False otherwise.
    """
    return True if enrl_term == latest_term else False

def perc_courses_passed(passed, total):
    """Return the share of courses passed out of the total taken (a ratio, not scaled to 100)."""
    share_passed = passed / total
    return share_passed
 
def assign_level(crs_num):
    """Derive the course level from a course number.

    The level is the first character of the number's string form
    (for example 1107 -> '1').
    """
    course_text = str(crs_num)
    return course_text[0]

def total_points(credits, point_value):
    """Compute the total points earned for one course: credits times point value.

    A negative point value is counted as zero, so such a course
    contributes no points.
    """
    effective_value = 0 if point_value < 0 else point_value
    return credits * effective_value


def assign_level(crs_num):
    """Derive the course level: the first character of the course number as text.

    NOTE(review): an ``assign_level`` with the same behaviour is already
    defined earlier in this cell; this later definition is the one that
    remains bound after the cell runs. One of the two can be removed.
    """
    return str(crs_num)[0]

  
def calc_gpa(tot_pts_term, tot_creds):
    """Compute a term GPA as total points divided by total credits.

    Returns 0 when either the credits or the points are zero or negative.
    """
    if tot_creds <= 0 or tot_pts_term <= 0:
        return 0
    return tot_pts_term / tot_creds


def calc_avg_grade(grade_pts, students):
    """Compute the average grade for a course in a term: grade points over student count.

    Returns 0 when the grade points are zero or negative.
    """
    if grade_pts <= 0:
        return 0
    return grade_pts / students

In [5]:
def calc_term_gpa(df):
    """Attach each student's term GPA to every one of their course rows.

    For every (uuid, enrl_term_id) pair the GPA is
    ``sum(CREDITS * grade_point_value) / sum(CREDITS)``, computed only over
    rows whose ``grade_point_value`` is >= 0. Rows with a negative or missing
    value fail that comparison and are left out of both sums. The GPA is 0
    when either sum is <= 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'uuid', 'enrl_term_id', 'CREDITS' and
        'grade_point_value'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` plus 'term_gpa'. Rows whose
        (uuid, enrl_term_id) pair has no row counted toward GPA get NaN.
    """
    key_cols = ['uuid', 'enrl_term_id']

    # Keep only the rows that count toward GPA, then compute their points.
    graded = df.loc[df['grade_point_value'] >= 0,
                    key_cols + ['CREDITS', 'grade_point_value']]
    graded = graded.assign(crs_tot_pts=graded['CREDITS'] * graded['grade_point_value'])

    # Points earned and credits taken per student and term.
    per_term = (
        graded.groupby(key_cols, as_index=False)
        .agg({'crs_tot_pts': 'sum', 'CREDITS': 'sum'})
        .rename(columns={'CREDITS': 'tot_creds'})
    )

    # Same rule as calc_gpa, applied to whole columns: no credits or no points -> 0.
    no_gpa = (per_term['tot_creds'] <= 0) | (per_term['crs_tot_pts'] <= 0)
    per_term['term_gpa'] = (per_term['crs_tot_pts'] / per_term['tot_creds']).mask(no_gpa, 0)

    # Left merge keeps every original row, including those excluded from the GPA.
    return pd.merge(df, per_term[key_cols + ['term_gpa']], how='left',
                    on=key_cols, suffixes=('_og', '_stu'))

def calc_creds_attp(df, og_data):
    """Attach the credits attempted per student and term to ``df``.

    Credits are summed over every row of ``og_data`` for each
    (uuid, enrl_term_id) pair, regardless of grade, and joined onto ``df``
    as 'creds_attp_term' with a left merge.
    """
    term_keys = ['uuid', 'enrl_term_id']

    # Total credits per student and term, taken from the unfiltered data.
    attempted = (
        og_data.groupby(term_keys, as_index=False)
        .agg({'CREDITS': 'sum'})
        .rename(columns={'CREDITS': 'creds_attp_term'})
    )

    return df.merge(attempted[term_keys + ['creds_attp_term']], how='left',
                    on=term_keys, suffixes=('_og', '_stu'))

def calc_crs_avg_gpa(df):
    """Attach the average grade earned in each course per term to ``df``.

    For every (unique_course, enrl_term_id) pair, the grade point values of
    rows with ``grade_point_value`` >= 0 are summed and divided by the number
    of distinct ``uuid`` values among those rows. The average is 0 when the
    summed grade points are <= 0.

    NOTE(review): the divisor is distinct students, not rows. If a student
    can appear more than once for the same course and term, this differs
    from a plain mean - confirm that is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'unique_course', 'enrl_term_id', 'uuid' and
        'grade_point_value'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` plus 'crs_avg_grd_term'.
        Rows whose course/term pair has no counted row get NaN.
    """
    key_cols = ['unique_course', 'enrl_term_id']

    # Negative or missing grade point values are left out of the average.
    graded = df.loc[df['grade_point_value'] >= 0]

    # Summed grade points and number of distinct students per course and term.
    per_course = (
        graded.groupby(key_cols, as_index=False)
        .agg({'grade_point_value': 'sum', 'uuid': 'nunique'})
    )

    # Same rule as calc_avg_grade, applied to whole columns: no points -> 0.
    no_points = per_course['grade_point_value'] <= 0
    per_course['crs_avg_grd_term'] = (
        per_course['grade_point_value'] / per_course['uuid']
    ).mask(no_points, 0)

    # Left merge keeps every original row, including those left out above.
    return pd.merge(df, per_course[key_cols + ['crs_avg_grd_term']], how='left',
                    on=key_cols, suffixes=('_og', '_stu'))


def create_vars(df):
    """Build the derived course/grade variables and the per-term aggregates.

    Adds to ``df`` (in place): 'CRS_NUMBER' converted to str, 'lab_ind',
    'unique_course', 'crs_level', whitespace-stripped 'CRS_GRADE' and
    'grade_point_value'. It then runs calc_term_gpa, calc_creds_attp and
    calc_crs_avg_gpa in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CRS_NUMBER', 'COURSE_NAME', 'CRS_PREFIX' and 'CRS_GRADE',
        plus the columns required by the three calc_* functions.

    Returns
    -------
    pandas.DataFrame
        The frame returned by calc_crs_avg_gpa, i.e. the merged result with
        'term_gpa', 'creds_attp_term' and 'crs_avg_grd_term' added.
    """
    # Course numbers are handled as text from here on.
    df['CRS_NUMBER'] = df['CRS_NUMBER'].astype(str)

    # 'l' for lab courses, '' otherwise.
    df['lab_ind'] = df.apply(
        lambda row: lab_identifier(row['COURSE_NAME'], row['CRS_PREFIX']), axis=1)

    # prefix-number-lab flag; lab_ind is '' for non-labs, leaving a trailing '-'.
    df['unique_course'] = df[['CRS_PREFIX', 'CRS_NUMBER', 'lab_ind']].agg('-'.join, axis=1)

    df['crs_level'] = df['CRS_NUMBER'].apply(assign_level)

    # Strip surrounding whitespace so grades match the grade_dict keys.
    # The .str accessor passes missing grades through as NaN instead of raising.
    df['CRS_GRADE'] = df['CRS_GRADE'].str.strip()
    df['grade_point_value'] = df['CRS_GRADE'].apply(grade_convert)

    # Term GPA per student.
    with_term_gpa = calc_term_gpa(df)

    # Credits attempted per term, counted over all rows of df.
    with_credits = calc_creds_attp(with_term_gpa, df)

    # Average grade per course and term.
    return calc_crs_avg_gpa(with_credits)

# Import data and create variables

In [6]:
# Load the source workbook; import_file also switches the working directory to ../data.
full_file = import_file("anon_data_all.xlsx")

In [7]:
# cleanup() renames/drops columns on full_file itself and returns that same object;
# .copy() gives `data` its own frame so later column additions do not touch full_file.
data = cleanup(full_file).copy()

In [8]:
# Build the derived variables; create_vars also adds its intermediate columns to `data` in place.
final = create_vars(data)

In [9]:
# lab_ind was only needed to build unique_course.
# errors='ignore' lets this cell be re-run after the column is already gone.
final = final.drop(columns='lab_ind', errors='ignore')

## File description / shape

In [10]:
full_file.head(5)

Unnamed: 0,admit_term_id,admit_term_desc,admit_strm,admit_acad_yr,TOT_ACT,TOT_SAT,TOT_SAT_W,ETHNICITY,STUDENT_GENDER,ADMIT_COLLEGE,...,Modal_Group,grad_term_id,Degree Term,grad_college,grad_department,grad_major,Degree_Level_Granted,grad_strm,grad_strm_new,uuid
0,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,Green School of International and Public Affairs,...,Face to Face,201405.0,Summer 2014,Green School of International and Public Affairs,Politics & International Relations,Political Science & Government,B,1145,1145.0,72967827-6f53-48d6-9bef-9c4d73b2f1bd
1,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,Green School of International and Public Affairs,...,Face to Face,201405.0,Summer 2014,Green School of International and Public Affairs,Politics & International Relations,Political Science & Government,B,1145,1145.0,72967827-6f53-48d6-9bef-9c4d73b2f1bd
2,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,"College of Arts, Sciences and Education",...,Online 2.0,,,,,,,-1,,be1c3756-9437-4e3e-afd0-caf991de13c9
3,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,"College of Arts, Sciences and Education",...,Online 2.0,,,,,,,-1,,be1c3756-9437-4e3e-afd0-caf991de13c9
4,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,"College of Arts, Sciences and Education",...,Online 2.0,,,,,,,-1,,be1c3756-9437-4e3e-afd0-caf991de13c9


In [11]:
full_file.shape

(564787, 44)

In [12]:
 print("Count of unique students: " , full_file.uuid.nunique())

Count of unique students:  35677


In [13]:
full_file.columns

Index(['admit_term_id', 'admit_term_desc', 'admit_strm', 'admit_acad_yr',
       'TOT_ACT', 'TOT_SAT', 'TOT_SAT_W', 'ETHNICITY', 'STUDENT_GENDER',
       'ADMIT_COLLEGE', 'ADMIT_DEPARTMENT', 'FL_RESY', 'enrl_acad_yr',
       'enrl_term_id', 'enrl_term_desc', 'COURSE_NAME', 'CRS_PREFIX',
       'CRS_NUMBER', 'CRS_SECT_NUMBER', 'CREDITS', 'CRS_GRADE',
       'INST_MODE_DESCR', 'UCC_CRSE_FLG', 'PELL_IND', 'PELL_ELIGIBILITY',
       'INST_GPA', 'CUM_HRS_XFR', 'TERM_COLLEGE', 'TERM_DEPARTMENT',
       'ENROLLMENT_STATUS', 'ACAD_PLAN', 'ACAD_SUB_PLAN', 'STUDENT_ADM_TYPE',
       'GATEWAYCRSE_FLG', 'Modal_Group', 'grad_term_id', 'Degree Term',
       'grad_college', 'grad_department', 'grad_major', 'Degree_Level_Granted',
       'grad_strm', 'grad_strm_new', 'uuid'],
      dtype='object')

## Final dataframe description and shape

In [14]:
final.head(5)

Unnamed: 0,admit_term_id,admit_term_desc,admit_strm,admit_acad_yr,TOT_ACT,TOT_SAT,TOT_SAT_W,ETHNICITY,STUDENT_GENDER,ADMIT_COLLEGE,...,Degree_Level_Granted,grad_strm,grad_strm_new,uuid,unique_course,crs_level,grade_point_value,term_gpa,creds_attp_term,crs_avg_grd_term
0,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,Green School of International and Public Affairs,...,B,1145,1145.0,72967827-6f53-48d6-9bef-9c4d73b2f1bd,MGF-1107-,1,2.33,2.5,6,2.097561
1,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,Green School of International and Public Affairs,...,B,1145,1145.0,72967827-6f53-48d6-9bef-9c4d73b2f1bd,ECO-2013-,2,2.67,2.5,6,2.884382
2,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,"College of Arts, Sciences and Education",...,,-1,,be1c3756-9437-4e3e-afd0-caf991de13c9,HSA-3111-,3,4.0,2.5,6,3.204615
3,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,"College of Arts, Sciences and Education",...,,-1,,be1c3756-9437-4e3e-afd0-caf991de13c9,HSA-4170-,4,4.0,3.835,6,3.7
4,201105,Summer 2011,1115,2011-2012,,,,Hispanic/Latino,Female,"College of Arts, Sciences and Education",...,,-1,,be1c3756-9437-4e3e-afd0-caf991de13c9,EEC-3400-,3,4.0,3.835,6,3.514016


In [15]:
final.shape

(564787, 50)

In [16]:
 print("Count of unique students: " , final.uuid.nunique())

Count of unique students:  35677


In [17]:
final.columns

Index(['admit_term_id', 'admit_term_desc', 'admit_strm', 'admit_acad_yr',
       'TOT_ACT', 'TOT_SAT', 'TOT_SAT_W', 'ETHNICITY', 'STUDENT_GENDER',
       'ADMIT_COLLEGE', 'ADMIT_DEPARTMENT', 'FL_RESY', 'enrl_acad_yr',
       'enrl_term_id', 'enrl_term_desc', 'COURSE_NAME', 'CRS_PREFIX',
       'CRS_NUMBER', 'CRS_SECT_NUMBER', 'CREDITS', 'CRS_GRADE',
       'INST_MODE_DESCR', 'UCC_CRSE_FLG', 'PELL_IND', 'PELL_ELIGIBILITY',
       'INST_GPA', 'CUM_HRS_XFR', 'TERM_COLLEGE', 'TERM_DEPARTMENT',
       'ENROLLMENT_STATUS', 'ACAD_PLAN', 'ACAD_SUB_PLAN', 'STUDENT_ADM_TYPE',
       'GATEWAYCRSE_FLG', 'Modal_Group', 'grad_term_id', 'Degree Term',
       'grad_college', 'grad_department', 'grad_major', 'Degree_Level_Granted',
       'grad_strm', 'grad_strm_new', 'uuid', 'unique_course', 'crs_level',
       'grade_point_value', 'term_gpa', 'creds_attp_term', 'crs_avg_grd_term'],
      dtype='object')

# REVIEWING FILE

In [18]:
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 564787 entries, 0 to 564786
Data columns (total 50 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   admit_term_id         564787 non-null  int64  
 1   admit_term_desc       564787 non-null  object 
 2   admit_strm            564787 non-null  int64  
 3   admit_acad_yr         564787 non-null  object 
 4   TOT_ACT               114843 non-null  float64
 5   TOT_SAT               184958 non-null  float64
 6   TOT_SAT_W             183197 non-null  float64
 7   ETHNICITY             564787 non-null  object 
 8   STUDENT_GENDER        564787 non-null  object 
 9   ADMIT_COLLEGE         564787 non-null  object 
 10  ADMIT_DEPARTMENT      564787 non-null  object 
 11  FL_RESY               564787 non-null  object 
 12  enrl_acad_yr          564787 non-null  object 
 13  enrl_term_id          564787 non-null  int64  
 14  enrl_term_desc        564787 non-null  object 
 15  

In [19]:
final.loc[(final['uuid'] == 'bfe96db6-e979-4d8c-b837-6c9e7f143814')][['enrl_term_id', 'creds_attp_term','crs_avg_grd_term','grade_point_value','CRS_GRADE' , 'unique_course', 'CREDITS']]

Unnamed: 0,enrl_term_id,creds_attp_term,crs_avg_grd_term,grade_point_value,CRS_GRADE,unique_course,CREDITS
97032,201501,6,3.023034,3.0,B,CLP-4374-,3
97033,202008,8,3.445485,3.67,A-,PSY-3024-,2
97034,202008,8,3.347872,4.0,A,HUM-3306-,3
97035,201408,9,2.849662,2.67,B-,BSC-2023-,3
97036,202001,6,3.552024,3.33,B+,SOP-3004-,3
97037,201408,9,3.072863,3.0,B,PPE-3003-,3
97038,202008,8,3.744481,3.67,A-,PSY-4931-,3
97039,201501,6,2.998789,0.67,D-,CLP-4314-,3
97040,202001,6,3.494523,4.0,A,PSY-3211-,3
97041,201408,9,3.076667,1.67,C-,GEA-2000-,3


# SAVING FILE AS EXCEL

In [20]:
# Written relative to the current working directory (import_file switched it to ../data).
# The `encoding` argument was dropped: to_excel never used it and pandas 2.0 removed it.
final.to_excel('anon_engvars_02282022.xlsx')