# College Scorecard: Which college leads to better earnings - Data Cleaning
Jeevarani Radhakrishnan<br>
July 2019

### Import the necessary packages and set the style sheet for this notebook
Setting the stylesheet here to display the dataframe in a readable manner

In [147]:
import pandas as pd
import glob
import numpy as np
import re
%matplotlib inline
from IPython.core.display import HTML
css = open('data\style-table.css').read() 
HTML('<style>{}</style>'.format(css))

### Examine the data dictionary

In [148]:
data_dictionary = pd.read_excel("data/CollegeScorecardDataDictionary.xlsx",sheet_name='data_dictionary')
cohort_map = pd.read_excel("data/CollegeScorecardDataDictionary.xlsx",sheet_name='cohort_map')

data_Details = data_dictionary.dropna(subset=['VARIABLE NAME']).iloc[:,[0,1,2,3,4]]
data_Details.columns = ['Name','Category','DevName','DataType','VariableName']

cohort_check = pd.merge(data_Details, cohort_map, left_on='VariableName',right_on = 'Variable Name').drop(['Variable Name'],axis=1)
cols = [c for c in cohort_check.columns if c[0:9] != 'MERGED_19']
# eliminate earlier cohorts
cohort_2000 = cohort_check[cols]

The analysis below is done to check which data set is relevant for our study. Based on the numbers, 2014 has most values for almost all categories, 2016 and 2017 do not have a lot of completion data. 2015, 2016 and 2017 does not have earnings data. Also rows before 2000 have very low non-null values.
2014 have lesser school data but school data can also be taken from other data sets if needed. Hence 2014 is the dataset that will be considered for our analysis

In [149]:
count_by_cat = cohort_check.groupby('Category').agg('count')
#total of all columns to check total non null values
count_by_cat.sum(axis=0) #rows before 2000 have very low null values

#remove 19's from the count dataframe
cols = [c for c in count_by_cat.columns if c[0:9] != 'MERGED_19']
count_by_cat= count_by_cat[cols]

#maxvalues = count_by_cat.loc[:,~count_by_cat.columns.isin(['Name','DevName','DataType','VariableName'])].max(axis=1)
#sum across columns 
count_by_cat["max"] = count_by_cat.loc[:,'MERGED_2000-01 datafile':'MERGED_2017-18 datafile'].max(axis=1)

count_by_cat["diff_2014"] = count_by_cat['MERGED_2014-15 datafile'] - count_by_cat["max"]
#examine for everything other than repayment and aid categories
count_by_cat.iloc[[0,1,3,4,5,7,8,9]]

Unnamed: 0_level_0,Name,DevName,DataType,VariableName,MERGED_2000-01 datafile,MERGED_2001-02 datafile,MERGED_2002-03 datafile,MERGED_2003-04 datafile,MERGED_2004-05 datafile,MERGED_2005-06 datafile,MERGED_2006-07 datafile,MERGED_2007-08 datafile,MERGED_2008-09 datafile,MERGED_2009-10 datafile,MERGED_2010-11 datafile,MERGED_2011-12 datafile,MERGED_2012-13 datafile,MERGED_2013-14 datafile,MERGED_2014-15 datafile,MERGED_2015-16 datafile,MERGED_2016-17 datafile,MERGED_2017-18 datafile,max,diff_2014
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
academics,247,247,247,247,241,241,241,241,241,241,247,247,247,247,247,247,247,247,247,247,247,247,247,0
admissions,25,25,25,25,0,19,19,19,19,19,22,22,25,25,25,25,25,25,25,25,19,19,25,0
completion,1214,1214,1214,1214,622,624,820,820,1016,1016,1016,1016,1048,1050,1050,1034,1034,1034,1034,1082,112,216,1082,-48
cost,77,77,77,77,9,9,9,9,9,9,9,9,9,71,71,47,47,47,47,47,47,47,71,-24
earnings,76,76,76,76,0,0,0,26,5,36,10,62,10,62,10,62,52,52,52,0,0,0,62,-10
root,5,5,5,5,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,5,5,-2
school,44,44,44,44,16,17,18,18,18,18,19,19,19,19,19,20,20,20,20,20,19,42,42,-22
student,113,113,113,113,52,68,55,68,59,72,60,61,68,69,63,64,63,64,63,64,63,36,72,-9


In [150]:
#Examine Null values in 2014_2015 in detail

cohort_2000[(cohort_2000.Category == 'completion') & (cohort_2000['MERGED_2014-15 datafile'].isnull())].iloc[:,[0,2,4,19]] #has nulls but is not required
cohort_2000[(cohort_2000.Category == 'cost') & (cohort_2000['MERGED_2014-15 datafile'].isnull())].iloc[:,[0,2,4,19]] 
cohort_2000[(cohort_2000.Category == 'root') & (cohort_2000['MERGED_2014-15 datafile'].isnull())].iloc[:,[0,2,4,19]] # latitude and longitude is null
cohort_2000[(cohort_2000.Category == 'school') & (cohort_2000['MERGED_2014-15 datafile'].isnull())].iloc[:,[0,2,4,19]] # there are some null columns but this can be pulled from other cohorts
cohort_2000[(cohort_2000.Category == 'student') & (cohort_2000['MERGED_2014-15 datafile'].isnull())].iloc[:,[0,2,4,19]] # has null values but that will not affect our analysis
cohort_2000[(cohort_2000.Category == 'admissions') & (cohort_2000['MERGED_2014-15 datafile'].isnull())].iloc[:,[0,2,4,19]] #all non null values


Unnamed: 0,Name,DevName,VariableName,MERGED_2014-15 datafile


### Read the 2014_15 dataset

In [151]:
college_2014 = pd.read_csv('data\MERGED2014_15_PP.csv', na_values='PrivacySuppressed',low_memory=False)
college_2014.shape

(7703, 1977)

### Clean Up to remove unnecessary columns
1. Remove colleges with highest degree as associate or certificate as our analysis is focused on Bachelors and Graduate program
2. Remove columns will all null values

In [152]:
print("Before removing associate and certificate records:")
print(college_2014.shape)
print("After removing associate and certificate records:")
# use ICLevel and HIGHDEG
college_2014 = college_2014[(college_2014.ICLEVEL == 1) & (college_2014.HIGHDEG.isin([3,4]))]
college_2014.shape

Before removing associate and certificate records:
(7703, 1977)
After removing associate and certificate records:


(2969, 1977)

### Change the column data type
1. Update column data type to categorical wherever appropriate
2. Update column data type to numeric wherever appropriate

In [153]:
cat_index = ['MAIN','PREDDEG','HIGHDEG','CONTROL','ST_FIPS','REGION','LOCALE','LOCALE2','CCBASIC','CCUGPROF','CCSIZSET',
             'HBCU','PBI','ANNHI','TRIBAL','AANAPII','HSI','NANTI','MENONLY','WOMENONLY','RELAFFIL','DISTANCEONLY']

pattern = re.compile('CIP\d+.')

cip_columns = [col for col in college_2014.columns if ((bool(pattern.match(col)) == True)  | (col in cat_index))]

print("Before setting Categorical column type")
print(college_2014.info())

college_2014[cip_columns] = college_2014[cip_columns].astype('category')
print("After setting Categorical column type")

college_2014.info()

Before setting Categorical column type
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2969 entries, 0 to 7273
Columns: 1977 entries, UNITID to OMENRUP_PARTTIME_POOLED_SUPP
dtypes: float64(1953), int64(11), object(13)
memory usage: 44.8+ MB
None
After setting Categorical column type
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2969 entries, 0 to 7273
Columns: 1977 entries, UNITID to OMENRUP_PARTTIME_POOLED_SUPP
dtypes: category(212), float64(1747), int64(5), object(13)
memory usage: 40.6+ MB


In [154]:
print("The count of columns with all null values: ",college_2014.isna().all().value_counts().loc[True]) #1977 columns
#drop all the columns with all null values
college_2014 = college_2014.dropna(how='all',axis='columns')
print("Is there columns with all null values after removing them?",True in college_2014.isna().all().value_counts())


The count of columns with all null values:  364
Is there columns with all null values after removing them? False


### Set the UNITID as index

In [155]:
print("check if there are duplicate UNITIDs")
print((college_2014.duplicated(subset=['UNITID'], keep='first')).value_counts())
college_2014.set_index(['UNITID'],inplace=True)
college_2014.info()

check if there are duplicate UNITIDs
False    2969
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2969 entries, 100654 to 485290
Columns: 1612 entries, OPEID to MTHCMP6
dtypes: category(197), float64(1398), int64(4), object(13)
memory usage: 32.7+ MB


### Create Column Counts Dataframe

In [156]:
def return_counts(college):
    counts_c = college.count().sort_values().to_frame().reset_index()
    counts_c.columns = ['VariableName','NumValues']
    
    counts_c = pd.merge(data_Details,counts_c, left_on='VariableName',right_on='VariableName').iloc[:,[0,1,2,4,5]]
    return counts_c

### 1. REPAYMENT and AID

Repayment and aid categories are not required for our study. Hence these two categories are dropped

In [157]:
counts_check = return_counts(college_2014)
repaymentaid_col = list(counts_check[counts_check.Category.isin(['repayment','aid'])]['VariableName'])

college_2014 = college_2014.drop(repaymentaid_col,axis=1)
college_2014.shape


(2969, 1444)

### 2. EARNINGS
<b>Drawbacks with earnings data:</b>
1. Data is not yet available to produce program-level earnings data. Research suggests that the variation across programs within an institution may be even greater than aggregate earnings across institutions; for instance, STEM and health majors frequently earn more than students who study in other fields. 
2. Also the data include only Title IV-receiving students, so figures may not be representative of institutions with a low proportion of Title IV-eligible students.

<b>Data Cleaning:</b>
6yrs data and 10yrs data have most non null values. These measures will be retained from 6yr and 10yr for the analysis:
1. Mean earnings of male/female students working and also the count
2. Mean/Median/Std Deviation of students working and also the count
3. Mean earnings of students working and also the count
4. Share of students earning over 28k/year
5. number of students not working


In [158]:
print(college_2014.shape)
#examine the earnings col
counts_check[counts_check.Category == 'earnings'].sort_values(by='NumValues')
earnings_col = list(counts_check[(counts_check.Category == 'earnings') & (counts_check.VariableName.str.contains('P8|INDEP|PCT|MD|SD'))]['VariableName'])

(2969, 1444)


In [159]:
print(college_2014.shape)
#examine the earnings col
counts_check[counts_check.Category == 'earnings'].sort_values(by='NumValues')

earnings_col = list(counts_check[(counts_check.Category == 'earnings') & (counts_check.VariableName.str.contains('P8|INDEP|PCT|MD|SD'))]['VariableName'])

college_2014 = college_2014.drop(earnings_col,axis=1)



college_2014.columns = college_2014.columns.str.replace('MN_EARN_WNE','MN_EARN')
college_2014.columns = college_2014.columns.str.replace('COUNT_WNE','NUM_EARN')
college_2014.columns = college_2014.columns.str.replace('INC1','LOW')
college_2014.columns = college_2014.columns.str.replace('INC2','MED')
college_2014.columns = college_2014.columns.str.replace('INC3','HIGH')
college_2014.columns = college_2014.columns.str.replace('INC3','HIGH')
college_2014.columns = college_2014.columns.str.replace('MALE0','FEM')
college_2014.columns = college_2014.columns.str.replace('MALE1','MALE')
college_2014.columns = college_2014.columns.str.replace('COUNT_NWNE','NUM_NW')


data_Details.loc[data_Details.VariableName.str.contains('MN_EARN_WNE'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('MN_EARN_WNE'),"VariableName"].str.replace('MN_EARN_WNE','MN_EARN')
data_Details.loc[data_Details.VariableName.str.contains('COUNT_WNE'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('COUNT_WNE'),"VariableName"].str.replace('COUNT_WNE','NUM_EARN')
data_Details.loc[data_Details.VariableName.str.contains('INC1'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('INC1'),"VariableName"].str.replace('INC1','LOW')
data_Details.loc[data_Details.VariableName.str.contains('INC2'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('INC2'),"VariableName"].str.replace('INC2','MED')
data_Details.loc[data_Details.VariableName.str.contains('INC3'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('INC3'),"VariableName"].str.replace('INC3','HIGH')

data_Details.loc[data_Details.VariableName.str.contains('MALE0'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('MALE0'),"VariableName"].str.replace('MALE0','FEM')
data_Details.loc[data_Details.VariableName.str.contains('MALE1'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('MALE1'),"VariableName"].str.replace('MALE1','MALE')
data_Details.loc[data_Details.VariableName.str.contains('COUNT_NWNE'),"VariableName"] = data_Details.loc[data_Details.VariableName.str.contains('COUNT_NWNE'),"VariableName"].str.replace('COUNT_NWNE','NUM_NW')


(2969, 1444)


### 3. STUDENT
The students' self report race and gender data will be retained for the analysis. The number of degree seeking undergraduates, average age of entry and number of graduate students are also retained.

All the columns starting with UGDS are percentages. This needs to be multiplied by # of undergraduate degree seeking students. 

In [160]:
stud_col = list(counts_check[(counts_check.Category == 'student') & ~counts_check.VariableName.str.startswith('INC_PCT') & ~counts_check.DevName.str.contains('age_entry|size|grad_students|race_ethni|demographics.men|demographics.women')]['VariableName'])

college_2014 = college_2014.drop(stud_col,axis=1)
college_2014.shape

student_income_list = ['INC_PCT_H2','INC_PCT_H1','INC_PCT_LO','INC_PCT_M1','INC_PCT_M2']

# Change the UGDS data type
college_2014.loc[:,college_2014.columns.str.contains('UGDS_')] = (college_2014.loc[:,college_2014.columns.str.contains('UGDS_')]).mul(college_2014['UGDS'],axis=0).round(0)
#college_2014.loc[:,college_2014.columns.isin(student_income_list)] = (college_2014.loc[:,college_2014.columns.isin(student_income_list)]).mul(college_2014['UGDS'],axis=0).round(0)


### 4. ADMISSIONS
Only the SAT AVG OVERALL and SAT Averages by OPEID will be retained. The ACT scores have large null values.


In [161]:
counts_check[(counts_check.Category == 'admissions') & (counts_check.NumValues >1200)].sort_values("NumValues")

Unnamed: 0,Name,Category,DevName,VariableName,NumValues
25,25th percentile of the ACT cumulative score,admissions,act_scores.25th_percentile.cumulative,ACTCM25,1233
26,75th percentile of the ACT cumulative score,admissions,act_scores.75th_percentile.cumulative,ACTCM75,1233
33,Midpoint of the ACT cumulative score,admissions,act_scores.midpoint.cumulative,ACTCMMID,1233
37,Average SAT equivalent score of students admitted,admissions,sat_scores.average.overall,SAT_AVG,1277
38,Average SAT equivalent score of students admitted for all campuses rolled up to the 6-digit OPE ID,admissions,sat_scores.average.by_ope_id,SAT_AVG_ALL,1329
14,Admission rate,admissions,admission_rate.overall,ADM_RATE,1844
15,Admission rate for all campuses rolled up to the 6-digit OPE ID,admissions,admission_rate.by_ope_id,ADM_RATE_ALL,1912


In [162]:
pd.options.display.max_rows=1000

#remove all the admissions columns except SAT average and SAT avegerage overall and admission rates. Because ACT scores has more null values
admissions_col = list(counts_check[(counts_check.Category == 'admissions') & (~counts_check.VariableName.isin(['ACTCMMID','SAT_AVG','SAT_AVG_ALL','ADM_RATE','ADM_RATE_ALL']))]['VariableName'])
college_2014 = college_2014.drop(admissions_col,axis=1)

### 5. COMPLETION
Only Title IV completion numbers will be retained since the earnings are reported for only Title IV students

In [163]:
# Remove title iv columns that are not required
titleiv_col = list(counts_check[(counts_check.Category == 'completion') & counts_check.DevName.str.contains('title_iv') & counts_check.DevName.str.contains('transf|unknown|died|pell|first_gen|loan|depend|enrolled|withdrawn')]['VariableName'])

college_2014 = college_2014.drop(titleiv_col,axis=1)

#recrete the counts
counts_check = return_counts(college_2014)

#Remove completion columns that are not required
compl_col = list(counts_check[counts_check.Category.isin(['completion']) & counts_check.DevName.str.contains('dependent|pell|loan|first_gen|separation|transfer|2_yr|3_yr|2yrs|3yrs')]['VariableName'])

college_2014 = college_2014.drop(compl_col,axis=1)
print(college_2014.shape)


#recrete the counts
counts_check = return_counts(college_2014)


(2969, 428)


### 6. ACADEMICS
The CERTIFICATE and ASSOCIATE information can be removed as the analysis is focussed on 

In [164]:
#Exploring using RE for removing columns
pattern = re.compile(r'CIP\d+[CERTASSOC].')
#academics_col = list(counts_check[(counts_check.Category == 'academics') & (counts_check.VariableName.str.contains('CERT|ASSOC'))]['VariableName'])
cip_columns = [col for col in college_2014.columns if bool(pattern.match(col)) == True]

college_2014 = college_2014.drop(cip_columns, axis=1)
college_2014.shape

(2969, 276)

### 6. COST
Merge the public and private columns as they are mutually exclusive. Remove private columns and then rename public columns to a generic column name.

In [165]:
def privpub_combine(pubpattern, privpattern):
    pattern = re.compile(pubpattern)
    global college_2014
    npt_pubcolumns = [col for col in college_2014.columns if bool(pattern.match(col)) == True]
    pattern = re.compile(privpattern)
    npt_privcolumns = [col for col in college_2014.columns if bool(pattern.match(col)) == True]

    college_2014["sum_pub"] = college_2014[npt_pubcolumns].sum(axis=1)
    college_2014["sum_priv"] = college_2014[npt_privcolumns].sum(axis=1)

    #private columns are not present for public controls
    college_2014[(college_2014.sum_priv !=0) & (college_2014.CONTROL == 1)]
    #public columns are not present for private controls
    college_2014[(college_2014.sum_pub !=0) & ((college_2014.CONTROL == 3) | (college_2014.CONTROL == 2))]
    
    #so copy public count to private columns
    for pubcolumn,privcolumn  in zip(npt_pubcolumns, npt_privcolumns):
        college_2014[pubcolumn].fillna(college_2014[privcolumn],inplace=True)# = college_2014[privcolumn] + college_2014[pubcolumn]
        college_2014.rename(columns={pubcolumn: pubcolumn[0:-4]},inplace=True)

    college_2014 = college_2014.drop(["sum_priv","sum_pub"],axis=1)    
    college_2014 = college_2014.drop(npt_privcolumns, axis=1)
    print(college_2014.shape)
    data_Details.loc[data_Details.VariableName.isin(npt_pubcolumns),"DevName"] = data_Details.loc[data_Details.VariableName.isin(npt_pubcolumns),"DevName"].str.replace("public","pubpriv")
    data_Details.loc[data_Details.VariableName.isin(npt_pubcolumns),"Name"] = data_Details.loc[data_Details.VariableName.isin(npt_pubcolumns),"Name"].str.replace("public","pubpriv")

    data_Details.loc[data_Details.VariableName.isin(npt_pubcolumns),"VariableName"] = data_Details.loc[data_Details.VariableName.isin(npt_pubcolumns),"VariableName"].str[0:-4]

In [166]:
college_2014["COSTT4_A"] = college_2014["COSTT4_A"].fillna(0) + college_2014["COSTT4_P"].fillna(0)
college_2014["TUITIONFEE_IN"] = college_2014["TUITIONFEE_IN"].fillna(0) + college_2014["TUITIONFEE_PROG"].fillna(0)
college_2014.rename(columns={'COSTT4_A': 'AVGCOST'},inplace=True)

data_Details.loc[data_Details.VariableName=='COSTT4_A',"VariableName"] = 'AVGCOST'
data_Details.loc[data_Details.VariableName=='AVGCOST',"Name"] = 'Average cost of attendance'
data_Details.loc[data_Details.VariableName=='AVGCOST',"DevName"] = 'attendance.cost'

data_Details.loc[data_Details.VariableName=='PREDDEG',"Name"] = '0 Not classified, 1 Pred certificate, 2 Pred associate, 3 Pred bachelor, 4 Entirely graduate'


In [167]:
privpub_combine('NPT4.*PUB','NPT4.*PRIV')
privpub_combine('NUM4.*PUB','NUM4.*PRIV')

(2969, 267)
(2969, 261)


### 7. Analyze other categories
<b>Root: </b> This will be needed<br>
<b>School: </b> This will be needed<br>

In [168]:
pd.set_option('max_colwidth', 230)

counts_check[counts_check.Category == 'root'].sort_values(by='NumValues') #root column has two non-null values
counts_check[counts_check.Category == 'school'].sort_values(by='NumValues') #school has several non null columns that will provide information about the school. Hence that will be retained.
counts_check.shape

(428, 5)

### 8. Perform other cleanups
1. Clean up zip to contain only the first five numbers and then convert to numeric
2. Remove all columns which has less than 500 non null values


In [169]:
# Extract only first 5 letters of zip and convert the file to numeric.
college_2014["ZIP"] = college_2014["ZIP"].str[0:5]
college_2014["ZIP"] = college_2014["ZIP"].apply(pd.to_numeric, errors = 'coerce')
college_2014 = college_2014.drop(['SCH_DEG','ST_FIPS'],axis=1)
college_2014.shape

(2969, 259)

In [170]:
print(college_2014.shape)

counts_check = return_counts(college_2014)
   
more_null_columns = ((list(counts_check[counts_check.NumValues <500]['VariableName'])))
college_2014 = college_2014.drop(more_null_columns,axis=1)
print(college_2014.shape)

#recalculate counts after dropping
counts_check = return_counts(college_2014)

(2969, 259)
(2969, 225)


### UNDERSTAND THE CORRELATIONS

<b>COST:</b> <br>
It is evident that all of the demographic (income wise) are weakly correlated with earnings. Hence we will only retain average cost and in-state tuition fees

In [171]:
earnings_column = list(counts_check.loc[(counts_check.Category=='earnings') & (counts_check.VariableName.str.contains('MAL|FEM')) ,"VariableName"])
completion_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('MAL|FEM')) ,"VariableName"])
student_column = list(counts_check.loc[(counts_check.Category=='student') & (counts_check.VariableName.str.contains('INC')) ,"VariableName"])

In [172]:
pd.options.display.max_columns=50
correlation_table = college_2014.corr(method='pearson').round(2).abs()
earnings_column = list(counts_check.loc[(counts_check.Category=='earnings')  ,"VariableName"])
cost_column =list(counts_check.loc[(counts_check.Category=='cost') ,"VariableName"])

corr_col = correlation_table.loc[cost_column, earnings_column]
corr_col[~((corr_col<.3).all(axis=1))]
cost_column_remove = list(counts_check.loc[(counts_check.Category=='cost') & ~(counts_check.VariableName.isin(['AVGCOST','NPT4','NUM4','TUITIONFEE_IN','TUITIONFEE_OUT'])) ,"VariableName"]) 
college_2014= college_2014.drop(cost_column_remove,axis=1)
college_2014.shape

(2969, 212)

<b>COMPLETION:</b><br>
These are the completion data available:
1. Completion Rate by race for 150% completion and overall completion rate for 100%, 150% and 200% completion. <font color= blue><i>This is not correlated with student information and also with earnings. Hence these will be <b>removed</b>. This is all the C150_ columns. Only overall completion rate at 100%, 150% and 200% will be retained</i></font>
2. Adjusted cohort count by race for 150% completion, and overall cohort count for 100%, 150% and 200% completion. <font color= blue><i>Although this is not correlated with earnings, this is correlated with student demography by race. Hence this will be retained to study the completion rate by race. This is all the D100/D150/D200 columns</i></font>
3. The number of students completed by gender/income(high/medium/low) and overall number of students. This is provided for 4yr completion, 6 year completion and 8 year completion. <font color= blue><i>This is strongly correlated with number of earnings by gender/income(high/medium/low). Hence this will be retained. </i></font>
4. Title IV overall percent completed within 4 yr/6yr and 8 yr. Title IV percent completed by gender, income for 4yr6yr and 8yr completion.<font color= blue><i>The title IV completions are moderately correlated to the earnings themselves than the number of earnings</i></font>

In [173]:
#1 & 2. Completion rate by race correlation and adjusted cohort count by race correlation with earnings
earnings_column = list(counts_check.loc[(counts_check.Category=='earnings') ,"VariableName"])
completion_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('C150|C200|C100|D150|D100|D200')) ,"VariableName"])
corr_col = correlation_table.loc[completion_column, earnings_column]
corr_col[~((corr_col<.6).all(axis=1))]

Unnamed: 0,NUM_NW_P10,NUM_EARN_P10,MN_EARN_P10,NUM_EARN_LOW_P10,NUM_EARN_MED_P10,NUM_EARN_HIGH_P10,NUM_EARN_FEM_P10,NUM_EARN_MALE_P10,GT_28K_P10,MN_EARN_LOW_P10,MN_EARN_MED_P10,MN_EARN_HIGH_P10,MN_EARN_FEM_P10,MN_EARN_MALE_P10,NUM_NW_P6,NUM_EARN_P6,MN_EARN_P6,NUM_EARN_LOW_P6,NUM_EARN_MED_P6,NUM_EARN_HIGH_P6,NUM_EARN_FEM_P6,NUM_EARN_MALE_P6,GT_28K_P6,MN_EARN_LOW_P6,MN_EARN_MED_P6,MN_EARN_HIGH_P6,MN_EARN_FEM_P6,MN_EARN_MALE_P6
C150_4,0.28,0.25,0.53,0.29,0.27,0.19,0.25,0.25,0.46,0.6,0.5,0.36,0.56,0.53,0.26,0.25,0.49,0.26,0.26,0.22,0.24,0.25,0.48,0.59,0.4,0.19,0.53,0.46
C200_4,0.28,0.25,0.54,0.28,0.26,0.19,0.24,0.26,0.49,0.61,0.49,0.38,0.58,0.55,0.25,0.25,0.51,0.25,0.26,0.21,0.24,0.25,0.51,0.61,0.39,0.21,0.55,0.48


In [174]:
#1 & 2. Completion rate by race correlation and adjusted cohort count by race correlation with student demography by race
completion_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('D150|C150')) ,"VariableName"])
student_column = list(counts_check.loc[(counts_check.Category=='student') & (counts_check.VariableName.str.contains('UGDS')) ,"VariableName"])
corr_col = correlation_table.loc[completion_column, student_column]
corr_col[~((corr_col<.6).all(axis=1))]
# It is evident here that only the cohort count by race is correlated with student information and not the completion rate by race.


Unnamed: 0,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,UGDS_MEN,UGDS_WOMEN
D150_4,0.71,0.56,0.52,0.33,0.28,0.2,0.31,0.92,0.38,0.84,0.62,0.76
D150_4_WHITE,0.78,0.75,0.47,0.25,0.28,0.22,0.27,0.87,0.47,0.71,0.72,0.79
D150_4_BLACK,0.53,0.32,0.61,0.2,0.1,0.14,0.27,0.81,0.16,0.83,0.41,0.6
D150_4_HISP,0.61,0.3,0.39,0.87,0.36,0.13,0.22,0.54,0.36,0.45,0.56,0.63
D150_4_ASIAN,0.53,0.38,0.17,0.39,0.96,0.08,0.24,0.43,0.68,0.14,0.57,0.49
D150_4_AIAN,0.47,0.35,0.34,0.17,0.11,0.66,0.24,0.74,0.19,0.68,0.39,0.52
D150_4_NHPI,0.38,0.21,0.29,0.16,0.19,0.11,0.7,0.71,0.14,0.65,0.29,0.44
D150_4_NRA,0.66,0.54,0.37,0.32,0.48,0.14,0.24,0.67,0.77,0.54,0.64,0.65
D150_4_UNKN,0.45,0.27,0.41,0.14,0.06,0.14,0.27,0.82,0.13,0.88,0.33,0.53


In [175]:
#1 & 2. Based on analysis in above two cells, the completion rate by race columns are removed as they are neither correlated with earnings nor correlated with student information
compl_rate_race = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('C150_4_')) ,"VariableName"])
print(compl_rate_race)
college_2014= college_2014.drop(compl_rate_race,axis=1) 

['C150_4_WHITE', 'C150_4_BLACK', 'C150_4_HISP', 'C150_4_ASIAN', 'C150_4_AIAN', 'C150_4_2MOR', 'C150_4_NRA', 'C150_4_UNKN']


In [176]:
#3 & #4 Number of students completed by income and percent Title IV completed by income - Correlation with earnings by income

earnings_column = list(counts_check.loc[(counts_check.Category=='earnings') & (counts_check.VariableName.str.contains('HIGH|MED|LOW')) ,"VariableName"])
completion_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('INC')) ,"VariableName"])
corr_col = correlation_table.loc[completion_column, earnings_column]
corr_col[~((corr_col<.6).all(axis=1))]
# It can be seen that TitleIV percent are moderately correlated with mean earnings and number of students completed is strongly correlated with number of earnings 

Unnamed: 0,NUM_EARN_LOW_P10,NUM_EARN_MED_P10,NUM_EARN_HIGH_P10,MN_EARN_LOW_P10,MN_EARN_MED_P10,MN_EARN_HIGH_P10,NUM_EARN_LOW_P6,NUM_EARN_MED_P6,NUM_EARN_HIGH_P6,MN_EARN_LOW_P6,MN_EARN_MED_P6,MN_EARN_HIGH_P6
LO_INC_COMP_ORIG_YR4_RT,0.26,0.23,0.15,0.63,0.54,0.35,0.22,0.22,0.17,0.66,0.47,0.23
LO_INC_COMP_ORIG_YR6_RT,0.28,0.24,0.15,0.62,0.52,0.35,0.23,0.23,0.18,0.66,0.45,0.21
LO_INC_COMP_ORIG_YR8_RT,0.26,0.21,0.13,0.62,0.52,0.36,0.2,0.2,0.15,0.64,0.45,0.24
LO_INC_YR4_N,0.74,0.73,0.66,0.11,0.01,0.13,0.74,0.74,0.69,0.2,0.02,0.19
MD_INC_YR4_N,0.74,0.74,0.69,0.09,0.03,0.15,0.73,0.74,0.71,0.18,0.04,0.21
HI_INC_YR4_N,0.69,0.69,0.77,0.02,0.13,0.22,0.64,0.66,0.74,0.05,0.11,0.24
LO_INC_YR6_N,0.95,0.98,0.92,0.05,0.05,0.23,1.0,1.0,0.96,0.14,0.0,0.22
MD_INC_YR6_N,0.96,0.99,0.94,0.05,0.06,0.25,1.0,1.0,0.97,0.14,0.02,0.23
HI_INC_YR6_N,0.93,0.97,1.0,0.02,0.12,0.29,0.94,0.95,1.0,0.06,0.06,0.26
LO_INC_YR8_N,0.97,0.99,0.93,0.05,0.05,0.23,1.0,1.0,0.96,0.14,0.0,0.22


In [177]:
#3 & #4 Number of students completed by gender and percent Title IV completed by gender - Correlation with earnings by gender

earnings_column = list(counts_check.loc[(counts_check.Category=='earnings') & (counts_check.VariableName.str.contains('MAL|FEM')) ,"VariableName"])
completion_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('MAL|FEM')) ,"VariableName"])
#correlation_table = correlation_table.reindex(sorted(correlation_table.columns), axis=1)
corr_col = correlation_table.loc[sorted(completion_column), sorted(earnings_column)]
corr_col[~((corr_col<.5).all(axis=1))]
# Again, it can be seen that TitleIV percent are moderately correlated with mean earnings and number of students completed is strongly correlated with number of earnings 

Unnamed: 0,MN_EARN_FEM_P10,MN_EARN_FEM_P6,MN_EARN_MALE_P10,MN_EARN_MALE_P6,NUM_EARN_FEM_P10,NUM_EARN_FEM_P6,NUM_EARN_MALE_P10,NUM_EARN_MALE_P6
FEMALE_COMP_ORIG_YR4_RT,0.57,0.55,0.51,0.47,0.21,0.2,0.21,0.21
FEMALE_COMP_ORIG_YR6_RT,0.59,0.57,0.53,0.47,0.2,0.21,0.22,0.22
FEMALE_COMP_ORIG_YR8_RT,0.59,0.56,0.53,0.48,0.17,0.17,0.2,0.19
FEMALE_YR4_N,0.04,0.13,0.01,0.02,0.91,0.92,0.88,0.93
FEMALE_YR6_N,0.03,0.07,0.07,0.01,0.98,1.0,0.9,0.98
FEMALE_YR8_N,0.03,0.07,0.06,0.01,0.99,1.0,0.91,0.98
MALE_COMP_ORIG_YR4_RT,0.6,0.57,0.56,0.52,0.18,0.17,0.17,0.17
MALE_COMP_ORIG_YR6_RT,0.61,0.59,0.58,0.54,0.18,0.19,0.18,0.19
MALE_COMP_ORIG_YR8_RT,0.6,0.57,0.57,0.54,0.14,0.13,0.15,0.15
MALE_YR6_N,0.03,0.07,0.07,0.01,0.98,0.98,0.96,1.0


In [178]:
# This is correlation of all the rest of the completion columns
earnings_column = list(counts_check.loc[(counts_check.Category=='earnings') &  (counts_check.VariableName.str.contains('MN|NUM')),"VariableName"])
completion_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.isin(['COMP_ORIG_YR8_RT','COMP_ORIG_YR6_RT','COMP_ORIG_YR4_RT','OVERALL_YR8_N','OVERALL_YR6_N','OVERALL_YR4_N'])) ,"VariableName"])
corr_col = correlation_table.loc[sorted(completion_column), sorted(earnings_column)]
corr_col[~((corr_col<.6).all(axis=1))]
#Again overall numbers are correlated with number of earnings and overall percentage is correlated with mean earnings

Unnamed: 0,MN_EARN_FEM_P10,MN_EARN_FEM_P6,MN_EARN_HIGH_P10,MN_EARN_HIGH_P6,MN_EARN_LOW_P10,MN_EARN_LOW_P6,MN_EARN_MALE_P10,MN_EARN_MALE_P6,MN_EARN_MED_P10,MN_EARN_MED_P6,MN_EARN_P10,MN_EARN_P6,NUM_EARN_FEM_P10,NUM_EARN_FEM_P6,NUM_EARN_HIGH_P10,NUM_EARN_HIGH_P6,NUM_EARN_LOW_P10,NUM_EARN_LOW_P6,NUM_EARN_MALE_P10,NUM_EARN_MALE_P6,NUM_EARN_MED_P10,NUM_EARN_MED_P6,NUM_EARN_P10,NUM_EARN_P6,NUM_NW_P10,NUM_NW_P6
COMP_ORIG_YR4_RT,0.59,0.58,0.39,0.24,0.62,0.65,0.55,0.51,0.54,0.46,0.57,0.56,0.19,0.19,0.13,0.15,0.25,0.2,0.19,0.19,0.21,0.21,0.2,0.19,0.24,0.2
COMP_ORIG_YR6_RT,0.6,0.59,0.38,0.23,0.61,0.65,0.54,0.52,0.53,0.45,0.56,0.56,0.19,0.19,0.12,0.15,0.26,0.21,0.2,0.21,0.22,0.22,0.2,0.2,0.25,0.21
COMP_ORIG_YR8_RT,0.6,0.58,0.39,0.26,0.6,0.63,0.55,0.53,0.52,0.45,0.57,0.55,0.15,0.15,0.09,0.11,0.22,0.17,0.17,0.17,0.18,0.17,0.16,0.15,0.21,0.17
OVERALL_YR4_N,0.07,0.16,0.14,0.2,0.1,0.19,0.02,0.04,0.02,0.03,0.02,0.08,0.71,0.72,0.68,0.71,0.74,0.74,0.75,0.77,0.74,0.74,0.73,0.74,0.73,0.73
OVERALL_YR6_N,0.03,0.07,0.24,0.23,0.05,0.13,0.07,0.01,0.06,0.01,0.04,0.05,0.99,1.0,0.94,0.97,0.95,1.0,0.93,0.99,0.99,1.0,0.98,1.0,0.97,1.0
OVERALL_YR8_N,0.03,0.07,0.25,0.23,0.04,0.13,0.06,0.01,0.06,0.01,0.03,0.04,0.99,1.0,0.95,0.98,0.96,1.0,0.94,0.99,0.99,1.0,0.99,1.0,0.98,0.99


In [179]:
#Creating a dataframe of all look up columns so that can be added to the cleaned scorecard
pd.set_option('mode.chained_assignment', None)
lookup_value = data_dictionary.iloc[:,[4,5,6]]
lookup_value.columns = ['VariableName','Value','Label']
lookup_value.loc[:,"VariableName"] = lookup_value.loc[:,"VariableName"].fillna(method='ffill')
category_col = ['MAIN','PREDDEG','HIGHDEG','CONTROL','ST_FIPS','REGION','DISTANCEONLY','ICLEVEL','OPENADMP']


pd.reset_option('mode.chained_assignment')

lookups = lookup_value[lookup_value['VariableName'].isin(category_col)]

In [180]:
writer = pd.ExcelWriter('data\Scorecard_2014_15.xlsx',engine = 'xlsxwriter')
college_2014 = college_2014.reset_index()
counts_check = return_counts(college_2014).sort_values(["Category","DevName"])

college_2014.to_excel(writer,sheet_name='Cleaned Data',index=False)
counts_check.to_excel(writer,sheet_name='Data Dictionary', index=False)
lookups.to_excel(writer,sheet_name='Look Up',index=False)
writer.save()
writer.close()

In [181]:
# Order columns in order of groups, change categorical numbers to values

In [182]:
college_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2969 entries, 0 to 2968
Columns: 205 entries, UNITID to GT_28K_P6
dtypes: category(44), float64(151), int64(6), object(4)
memory usage: 3.8+ MB


In [208]:
pd.options.display.max_columns=50
earnings_column = list(counts_check.loc[(counts_check.Category=='completion') & (counts_check.VariableName.str.contains('INC')) ,"VariableName"])
cost_column =list(counts_check.loc[(counts_check.Category=='student') & (counts_check.VariableName.str.contains('INC'))  ,"VariableName"])

correlation_table.fillna(0,inplace=True)
corr_col = correlation_table.loc[cost_column, sorted(earnings_column)]

corr_col = corr_col[~((corr_col<.6).all(axis=1))]
corr_t = corr_col.T
corr_t[~((corr_t<.6).all(axis=1))]
#STUDENT + COMPLETION
#D100_4, D150_4, D250_4 are correlated with UGDS_ in student body
#besides D150_race is strongly correlated with corresponding UGDS_race in student body, race correlation
# male/female in completion is not correlated with male female in student body
#there is no meaningful correlation between income in completion and income in earnings

Unnamed: 0,INC_PCT_H2
HI_INC_COMP_ORIG_YR4_RT,0.62
HI_INC_COMP_ORIG_YR6_RT,0.63
LO_INC_COMP_ORIG_YR4_RT,0.64
LO_INC_COMP_ORIG_YR6_RT,0.66
LO_INC_COMP_ORIG_YR8_RT,0.62
MD_INC_COMP_ORIG_YR4_RT,0.62
MD_INC_COMP_ORIG_YR6_RT,0.65
MD_INC_COMP_ORIG_YR8_RT,0.6
