In [300]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [303]:
columns = ['BYHOMLIT','BYRISKFC','BYOCC30',
          'BYTXMSTD','BYTXRSTD','BYMATHSE','BYENGLSE','BYWRTNGA','BYHMWRK',
          'BYTVVIGM','BYWRKHRS', 'BYNSPORT','BYXTRACU','F1STEXP','F1OCC30',
           'F1TXMSTD','F1HIMATH','F1PSEPLN','F1RGPP2','F1XTRACU','F1WRKHRS',
           'F1TVVIGM','F1MATHSE','F1RHTUNP','F3EVRATT','F3PS1LVL','F3PSLCRED',
           'F3PS2BA','F3ATTAINMENT','F3F1EDEXPFF','F3ERN2011']


These are the columns from the initial study I am keeping for this analysis.

In [304]:
df = pd.read_csv('../els_02_12_byf3pststu_v1_0.csv',
                 usecols = columns)

In [305]:
df.head(1)

Unnamed: 0,BYHOMLIT,BYRISKFC,BYOCC30,BYTXMSTD,BYTXRSTD,BYMATHSE,BYENGLSE,BYWRTNGA,BYNSPORT,BYXTRACU,BYHMWRK,BYTVVIGM,BYWRKHRS,F1STEXP,F1OCC30,F1TXMSTD,F1HIMATH,F1PSEPLN,F1XTRACU,F1WRKHRS,F1TVVIGM,F1MATHSE,F1RHTUNP,F1RGPP2,F3EVRATT,F3PS1LVL,F3PSLCRED,F3PS2BA,F3ATTAINMENT,F3F1EDEXPFF,F3ERN2011
0,0,2,-1,52.11,59.53,-1.118,-0.633,1.191,0,1,7,99,6,3,7,49.6,5,2,1,6,3,-0.258,31,2,1,2,0,-3,3,1,4000


This function renames columns to more descriptive names. 

In [306]:
def rename_cols(df):
    rename_list = ['literacy_home','risk_factors','aspired_occ_b','math_b',
                    'reading_b','math_conf_b','verbal_confidence','writing','sports','by_xcurr',
                    'homework','hedonics_b','hours_working_b','edu_confidence',
                    'aspired_occ_1','math_1','math_status_1','ps_step_1','f1_xcurr','hours_working_1',
                    'hedonics_1','math_conf_1','hs_ac_load','hs_gpa','any_ps','ps_level','graduated_Y',
                    'time_to_grad','edu_achievment','expected_edu','wages_yr']
    df.columns = rename_list
    return df

In [307]:
rename_cols(df)

Unnamed: 0,literacy_home,risk_factors,aspired_occ_b,math_b,reading_b,math_conf_b,verbal_confidence,writing,sports,by_xcurr,homework,hedonics_b,hours_working_b,edu_confidence,aspired_occ_1,math_1,math_status_1,ps_step_1,f1_xcurr,hours_working_1,hedonics_1,math_conf_1,hs_ac_load,hs_gpa,any_ps,ps_level,graduated_Y,time_to_grad,edu_achievment,expected_edu,wages_yr
0,0,2,-1,52.11,59.53,-1.118,-0.633,1.191,0,1,7,99,6,3,7,49.60,5,2,1,6,3,-0.258,31,2,1,2,0,-3,3,1,4000
1,3,0,9,57.65,56.70,1.533,0.331,1.191,0,3,5,4,0,8,10,60.64,6,5,2,1,2,0.528,31,4,1,1,1,44,10,2,3000
2,2,-9,10,66.44,64.46,-0.154,-0.933,0.996,0,2,-9,1,0,6,-9,64.26,6,5,2,0,2,-0.215,30,4,1,1,1,45,6,2,37000
3,1,-4,10,44.68,48.69,1.030,1.146,-0.137,1,0,11,99,0,6,-1,45.59,4,4,1,5,99,0.238,32,4,1,2,0,-3,4,3,1500
4,1,2,16,40.57,33.53,0.121,0.143,-0.435,0,0,10,4,3,7,16,38.79,5,5,0,6,1,1.024,32,4,1,2,0,-3,4,3,48000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16192,1,-4,15,32.98,44.62,-0.118,-0.372,-4.000,1,0,4,2,7,6,-1,-8.00,6,5,2,7,99,-8.000,24,2,-4,-4,-4,-4,-4,-9,-4
16193,1,5,-1,35.91,33.78,-0.381,-0.207,-4.000,1,1,6,3,0,4,-9,34.57,5,3,3,8,5,0.577,16,3,0,-3,-3,-3,1,3,7000
16194,3,3,-1,36.85,40.56,-0.355,-1.122,-4.000,0,0,4,99,0,8,-1,-8.00,6,-8,-8,8,1,-8.000,6,2,-4,-4,-4,-4,-4,-9,-4
16195,2,4,-1,48.87,43.51,0.068,-0.118,-4.000,0,1,25,5,4,5,-1,-8.00,5,6,1,5,4,-8.000,23,5,1,2,0,-3,4,5,20000


Subsetting dataframe for and initial exploration to justify why this analysis will neglect a portion of the data.

In [308]:
lost = df[df.ps_level < 0]
post = df[(df.math_1 > 0)
        &(df.math_b > 0)
        &(df.reading_b > 0)
        & (df.ps_level >= 0)
        &(df.graduated_Y >=0)]

In [309]:
post.shape

(9901, 31)

Less than zero responses are null values.  Significant amounts of data is null in the'lost' subset.  

In [310]:
print(lost[lost.wages_yr < 0].shape)

(2947, 31)


In [311]:
print(f'base year reading score mean of whole dataset:  {round(df.reading_b.mean())}')
print(f'base year math score mean of whole dateset:  {round(df.math_b.mean())} \n')

print(f'standard deviation of base year reading score for whole dataset:  {round(df.reading_b.std())}')
print(f'standard deviation of base year math score for whole dataset:  {round(df.math_b.std())} \n')

print(f'base year reading score mean of lost:  {round(lost.reading_b.mean())}')
print(f'base year math score mean of lost:  {round(lost.math_b.mean())} \n')

print(f'base year reading score mean of ps:  {round(post.reading_b.mean())}')
print(f'base year math score mean of ps:  {round(post.math_b.mean())}\n')


print(f'null wages of lost: {lost[lost.wages_yr < 0].shape}')
print(f'mean wages of lost: {round(lost.wages_yr.mean())}\n')

print(f'null wages of ps: {post[post.wages_yr < 0].shape}')
print(f'mean wages of ps:  {round(post.wages_yr.mean())}')

base year reading score mean of whole dataset:  49.0
base year math score mean of whole dateset:  50.0 

standard deviation of base year reading score for whole dataset:  13.0
standard deviation of base year math score for whole dataset:  13.0 

base year reading score mean of lost:  44.0
base year math score mean of lost:  44.0 

base year reading score mean of ps:  53.0
base year math score mean of ps:  53.0

null wages of lost: (2947, 31)
mean wages of lost: 7933.0

null wages of ps: (0, 31)
mean wages of ps:  27801.0


There is significant null value in the 'lost'  subset.  Also what is probably not a coincidence these indiviudals did not pursue any form of post seconadary training as defined by the study.  As the analysis goes further we focus on this subset that pursued post secondary training in the form of 4 yr college, community college, or trade school.  

## post secondary

In [312]:
df = pd.read_csv('../els_02_12_byf3pststu_v1_0.csv',
                 usecols = columns,
                 na_values = [-9,-8,-4, -7,-3, 97, 98, 99])

'na_values'  are defined in the study above and I read them in as null values.  Below the columns are renamed as above.

In [313]:
rename_cols(df)

Unnamed: 0,literacy_home,risk_factors,aspired_occ_b,math_b,reading_b,math_conf_b,verbal_confidence,writing,sports,by_xcurr,homework,hedonics_b,hours_working_b,edu_confidence,aspired_occ_1,math_1,math_status_1,ps_step_1,f1_xcurr,hours_working_1,hedonics_1,math_conf_1,hs_ac_load,hs_gpa,any_ps,ps_level,graduated_Y,time_to_grad,edu_achievment,expected_edu,wages_yr
0,0.0,2.0,-1.0,52.11,59.53,-1.118,-0.633,1.191,0.0,1.0,7.0,,6.0,3.0,7.0,49.60,5.0,2.0,1.0,6.0,3.0,-0.258,31.0,2.0,1.0,2.0,0.0,,3.0,1.0,4000.0
1,3.0,0.0,9.0,57.65,56.70,1.533,0.331,1.191,0.0,3.0,5.0,4.0,0.0,8.0,10.0,60.64,6.0,5.0,2.0,1.0,2.0,0.528,31.0,4.0,1.0,1.0,1.0,44.0,10.0,2.0,3000.0
2,2.0,,10.0,66.44,64.46,-0.154,-0.933,0.996,0.0,2.0,,1.0,0.0,6.0,,64.26,6.0,5.0,2.0,0.0,2.0,-0.215,30.0,4.0,1.0,1.0,1.0,45.0,6.0,2.0,37000.0
3,1.0,,10.0,44.68,48.69,1.030,1.146,-0.137,1.0,0.0,11.0,,0.0,6.0,-1.0,45.59,4.0,4.0,1.0,5.0,,0.238,32.0,4.0,1.0,2.0,0.0,,4.0,3.0,1500.0
4,1.0,2.0,16.0,40.57,33.53,0.121,0.143,-0.435,0.0,0.0,10.0,4.0,3.0,7.0,16.0,38.79,5.0,5.0,0.0,6.0,1.0,1.024,32.0,4.0,1.0,2.0,0.0,,4.0,3.0,48000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16192,1.0,,15.0,32.98,44.62,-0.118,-0.372,,1.0,0.0,4.0,2.0,7.0,6.0,-1.0,,6.0,5.0,2.0,7.0,,,24.0,2.0,,,,,,,
16193,1.0,5.0,-1.0,35.91,33.78,-0.381,-0.207,,1.0,1.0,6.0,3.0,0.0,4.0,,34.57,5.0,3.0,3.0,8.0,5.0,0.577,16.0,3.0,0.0,,,,1.0,3.0,7000.0
16194,3.0,3.0,-1.0,36.85,40.56,-0.355,-1.122,,0.0,0.0,4.0,,0.0,8.0,-1.0,,6.0,,,8.0,1.0,,6.0,2.0,,,,,,,
16195,2.0,4.0,-1.0,48.87,43.51,0.068,-0.118,,0.0,1.0,25.0,5.0,4.0,5.0,-1.0,,5.0,6.0,1.0,5.0,4.0,,23.0,5.0,1.0,2.0,0.0,,4.0,5.0,20000.0


Subsetting the data to observations values critical to the analysis are present.

In [314]:
ps = df[(df.math_1 > 0) 
        & (df.math_b > 0)
        & (df.reading_b > 0)
        &  (df.ps_level >= 0)
        & (df.graduated_Y >=0)]

From these manipulations I have lost about 7000 observations from the original approx. 16000. 

In [315]:
ps.shape

(9901, 31)

adjusting NaN value in 'hs_gpa' separately because some of its NaN value are positive integers above '6'.  These value are not NaN in other features, so it must be handled individually.

In [316]:
ps.hs_gpa.value_counts()

5.0    2473
6.0    2286
4.0    2113
3.0    1532
2.0     671
1.0     161
0.0      33
Name: hs_gpa, dtype: int64

In [317]:
ps.loc[ps.hs_gpa > 6] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [318]:
ps.hs_gpa.isnull().sum()

632

Adding features to enhance interpretability and hopefully enhace meaning and allow for future dimensinoality reduction.  

In [319]:
def add_cool_features(df):
    df.loc[:,'effort'] = (df.homework*df.hs_ac_load)
    df.loc[:,'testing'] = df.math_b+df.reading_b+df.math_1
    df.loc[:,'v_facility'] = df.reading_b*(df.literacy_home + 1)
    df.loc[:,'m_facility'] = df.math_status_1*df.math_1
    df.loc[:,'academic_p'] = (df.effort)*df.hs_gpa
    df.loc[:,'iq_by_concientiousness'] = (df.v_facility+df.m_facility)*(df.homework+df.hs_ac_load)
    df.loc[:,'delinquency'] = df.risk_factors*(df.hedonics_b + df.hedonics_1)
    #df['effort'] = (df.homework+df.hs_ac_load)
    df.loc[:,'wages_cont'] = df.testing*df.wages_yr
    return df

In [320]:
add_cool_features(ps)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,literacy_home,risk_factors,aspired_occ_b,math_b,reading_b,math_conf_b,verbal_confidence,writing,sports,by_xcurr,homework,hedonics_b,hours_working_b,edu_confidence,aspired_occ_1,math_1,math_status_1,ps_step_1,f1_xcurr,hours_working_1,hedonics_1,math_conf_1,hs_ac_load,hs_gpa,any_ps,ps_level,graduated_Y,time_to_grad,edu_achievment,expected_edu,wages_yr,effort,testing,v_facility,m_facility,academic_p,iq_by_concientiousness,delinquency,wages_cont
0,0.0,2.0,-1.0,52.11,59.53,-1.118,-0.633,1.191,0.0,1.0,7.0,,6.0,3.0,7.0,49.60,5.0,2.0,1.0,6.0,3.0,-0.258,31.0,2.0,1.0,2.0,0.0,,3.0,1.0,4000.0,217.0,161.24,59.53,248.00,434.0,11686.14,,644960.0
1,3.0,0.0,9.0,57.65,56.70,1.533,0.331,1.191,0.0,3.0,5.0,4.0,0.0,8.0,10.0,60.64,6.0,5.0,2.0,1.0,2.0,0.528,31.0,4.0,1.0,1.0,1.0,44.0,10.0,2.0,3000.0,155.0,174.99,226.80,363.84,620.0,21263.04,0.0,524970.0
2,2.0,,10.0,66.44,64.46,-0.154,-0.933,0.996,0.0,2.0,,1.0,0.0,6.0,,64.26,6.0,5.0,2.0,0.0,2.0,-0.215,30.0,4.0,1.0,1.0,1.0,45.0,6.0,2.0,37000.0,,195.16,193.38,385.56,,,,7220920.0
3,1.0,,10.0,44.68,48.69,1.030,1.146,-0.137,1.0,0.0,11.0,,0.0,6.0,-1.0,45.59,4.0,4.0,1.0,5.0,,0.238,32.0,4.0,1.0,2.0,0.0,,4.0,3.0,1500.0,352.0,138.96,97.38,182.36,1408.0,12028.82,,208440.0
4,1.0,2.0,16.0,40.57,33.53,0.121,0.143,-0.435,0.0,0.0,10.0,4.0,3.0,7.0,16.0,38.79,5.0,5.0,0.0,6.0,1.0,1.024,32.0,4.0,1.0,2.0,0.0,,4.0,3.0,48000.0,320.0,112.89,67.06,193.95,1280.0,10962.42,10.0,5418720.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16169,3.0,,-1.0,48.94,59.41,,,0.052,2.0,0.0,19.0,4.0,0.0,6.0,-1.0,52.52,6.0,5.0,2.0,5.0,3.0,0.264,33.0,4.0,1.0,1.0,1.0,45.0,6.0,2.0,40000.0,627.0,160.87,237.64,315.12,2508.0,28743.52,,6434800.0
16172,1.0,,-1.0,74.90,64.83,,,0.052,1.0,1.0,9.0,0.0,0.0,6.0,9.0,74.48,6.0,5.0,4.0,0.0,0.0,0.528,28.0,6.0,1.0,1.0,1.0,59.0,6.0,2.0,90000.0,252.0,214.21,129.66,446.88,1512.0,21331.98,,19278900.0
16174,3.0,1.0,-1.0,54.31,59.29,0.571,,,0.0,0.0,19.0,0.0,0.0,6.0,9.0,50.56,6.0,5.0,1.0,8.0,5.0,-0.755,29.0,4.0,1.0,1.0,1.0,88.0,6.0,2.0,8000.0,551.0,164.16,237.16,303.36,2204.0,25944.96,5.0,1313280.0
16175,2.0,0.0,7.0,69.18,69.76,0.808,0.404,0.618,2.0,0.0,26.0,2.0,0.0,7.0,7.0,69.14,6.0,5.0,4.0,9.0,2.0,1.811,29.0,5.0,1.0,1.0,0.0,,3.0,3.0,15000.0,754.0,208.08,209.28,414.84,3770.0,34326.60,0.0,3121200.0


The following csv is saved here to have a pre-imputation subsetted dataset available.  

In [321]:
ps.to_csv('post_secondary.csv')

Dropping these features because they may leak data in modelling.  I have kept them because they may be useful in a future iteration of the project.   This set will also be useful in some of the exploratory analysis.  

In [322]:
drop_list = ['any_ps','ps_step_1', 'wages_yr',
            'time_to_grad','edu_achievment','expected_edu']
def drop_stuff(df, l):
    for i in l:
        df.drop(i, axis = 1, inplace = True)
    return df

In [323]:
ps_dropped = drop_stuff(ps,drop_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Function to split and into train/test and rejoin features and target for convenience.  

In [324]:
def x_y_split(df):    
    #splits target from data
    x = df.iloc[:,(df.columns!='graduated_Y')]
    y = df.graduated_Y
    
    #creates test train split
    X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=4, test_size=0.2)
    
    #rejoins target to dataset 
    test = pd.concat([X_test, y_test], axis = 1)
    train = pd.concat([X_train, y_train], axis = 1)
    
    return train, test

In [325]:
college_train, college_test = x_y_split(ps_dropped)

Inspecting sizes of the train and test sets.  

In [326]:
print("Training set shape: ", college_train.shape)
print("Test set shape: ", college_test.shape)

Training set shape:  (7920, 33)
Test set shape:  (1981, 33)


Creating a file of train set before a series of preprocessing steps.

In [327]:
college_train.to_csv('college_train.csv')

In [328]:
def KNNimpute_DF(df):
    #filling in missing values with knn imputer
    imputer_knn = KNNImputer(n_neighbors = 10)
    imputer_knn.fit(df)

    x = imputer_knn.transform(df)
    
    #casting the numpy array to dataframe
    df = pd.DataFrame(x)
    return df

Imputation of train set.

In [329]:
imputed_coll_train = KNNimpute_DF(college_train)

In [330]:
def rename_cols_x(df):
    rename_list = college_train.columns
    df.columns = rename_list
    return df

Renaming columns.

In [331]:
train_college_imp = rename_cols_x(imputed_coll_train)

In [332]:
train_college_imp.head()

Unnamed: 0,literacy_home,risk_factors,aspired_occ_b,math_b,reading_b,math_conf_b,verbal_confidence,writing,sports,by_xcurr,homework,hedonics_b,hours_working_b,edu_confidence,aspired_occ_1,math_1,math_status_1,f1_xcurr,hours_working_1,hedonics_1,math_conf_1,hs_ac_load,hs_gpa,ps_level,effort,testing,v_facility,m_facility,academic_p,iq_by_concientiousness,delinquency,wages_cont,graduated_Y
0,3.0,1.0,10.0,64.65,65.7,0.7487,0.5229,0.052,0.0,1.0,10.0,1.0,0.0,7.0,-1.0,65.65,6.0,3.0,3.0,1.0,0.9824,32.0,6.0,1.0,320.0,196.0,262.8,393.9,1920.0,27581.4,2.0,0.0,1.0
1,3.0,0.4,9.0,63.74,64.71,0.36,0.666,1.001,2.0,2.0,27.0,5.0,0.0,7.0,9.0,60.59,5.0,3.0,1.0,1.0,0.746,24.0,6.0,1.0,648.0,189.04,258.84,302.95,3888.0,28651.29,3.1,8884880.0,1.0
2,2.0,0.0,-1.0,40.5,46.23,-0.183,0.078,-0.137,2.0,0.0,11.0,2.0,0.0,6.0,-1.0,42.39,5.0,2.0,0.0,3.0,0.495,24.0,4.0,1.0,264.0,129.12,138.69,211.95,1056.0,12272.4,0.0,90384.0,1.0
3,2.0,1.0,10.0,59.68,51.03,0.781,-1.194,0.052,0.0,2.0,8.0,1.0,0.0,8.0,10.0,63.61,6.0,3.0,4.0,2.0,0.495,28.0,6.0,1.0,224.0,174.32,153.09,381.66,1344.0,19251.0,3.0,14817200.0,0.0
4,2.1,2.0,10.0,55.44,61.51,0.0139,0.1298,0.052,0.0,1.0,13.0,2.0,1.0,9.0,10.0,46.34,4.0,6.0,2.0,3.0,-1.28,21.0,3.0,1.0,273.0,163.29,162.56,185.36,819.0,14068.011,10.0,4245540.0,0.0


In [333]:
def round_ordinals(x):
    ordinal_feature_list = ['literacy_home','risk_factors','aspired_occ_b','sports','by_xcurr',
                    'homework','hedonics_b','hours_working_b','edu_confidence',
                    'aspired_occ_1','math_status_1','f1_xcurr','hours_working_1',
                    'hedonics_1','hs_ac_load','hs_gpa','academic_p','graduated_Y']
    
    for k in ordinal_feature_list:
        x[k] = x.loc[:,k].apply(round)
    return x

during imputation ordinal and categorical values were made continious.  Reasserting the nature of these features by rounding.

In [334]:
round_ordinals(train_college_imp)

Unnamed: 0,literacy_home,risk_factors,aspired_occ_b,math_b,reading_b,math_conf_b,verbal_confidence,writing,sports,by_xcurr,homework,hedonics_b,hours_working_b,edu_confidence,aspired_occ_1,math_1,math_status_1,f1_xcurr,hours_working_1,hedonics_1,math_conf_1,hs_ac_load,hs_gpa,ps_level,effort,testing,v_facility,m_facility,academic_p,iq_by_concientiousness,delinquency,wages_cont,graduated_Y
0,3,1,10,64.65,65.70,0.7487,0.5229,0.0520,0,1,10,1,0,7,-1,65.65,6,3,3,1,0.9824,32,6,1.0,320.0,196.00,262.80,393.90,1920,27581.400,2.0,0.0,1
1,3,0,9,63.74,64.71,0.3600,0.6660,1.0010,2,2,27,5,0,7,9,60.59,5,3,1,1,0.7460,24,6,1.0,648.0,189.04,258.84,302.95,3888,28651.290,3.1,8884880.0,1
2,2,0,-1,40.50,46.23,-0.1830,0.0780,-0.1370,2,0,11,2,0,6,-1,42.39,5,2,0,3,0.4950,24,4,1.0,264.0,129.12,138.69,211.95,1056,12272.400,0.0,90384.0,1
3,2,1,10,59.68,51.03,0.7810,-1.1940,0.0520,0,2,8,1,0,8,10,63.61,6,3,4,2,0.4950,28,6,1.0,224.0,174.32,153.09,381.66,1344,19251.000,3.0,14817200.0,0
4,2,2,10,55.44,61.51,0.0139,0.1298,0.0520,0,1,13,2,1,9,10,46.34,4,6,2,3,-1.2800,21,3,1.0,273.0,163.29,162.56,185.36,819,14068.011,10.0,4245540.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7915,0,2,10,48.97,39.82,-1.8310,-1.9050,-0.8980,0,0,2,3,0,6,-1,48.69,4,1,5,1,-0.8084,24,4,2.0,48.0,137.48,39.82,194.76,192,6099.080,12.5,13748.0,1
7916,2,0,9,62.47,61.26,1.3220,0.3310,0.3100,0,1,8,8,0,7,11,56.32,5,3,1,2,1.0240,15,4,1.0,120.0,180.05,183.78,281.60,480,10703.740,2.6,5221450.0,0
7917,1,1,2,60.06,55.22,-0.6300,-0.1260,0.1600,1,1,9,3,1,7,9,49.98,5,2,2,5,-0.5100,27,5,1.0,243.0,165.26,110.44,249.90,1215,12972.240,6.5,8758780.0,0
7918,2,0,-1,50.40,48.81,-0.1540,-0.9330,0.3531,1,1,12,4,0,6,9,44.59,6,5,1,3,0.0014,28,5,1.0,336.0,143.80,146.43,267.54,1680,16558.800,5.2,3595000.0,1


Creating dummy variable for these two categorical features.

In [335]:
train_college_imp = pd.get_dummies(train_college_imp,
            columns = ['aspired_occ_b','aspired_occ_1'])

Saving a copy of train set after these manipulation.

In [336]:
train_college_imp.to_csv('train_college_imp.csv')

Doing the process of saving, imputing, renaming, and rounding to the test set.

In [337]:
college_test.to_csv('college_test.csv')

imputed_coll_test = KNNimpute_DF(college_test)

test_college_imp = rename_cols_x(imputed_coll_test)

round_ordinals(test_college_imp)

Unnamed: 0,literacy_home,risk_factors,aspired_occ_b,math_b,reading_b,math_conf_b,verbal_confidence,writing,sports,by_xcurr,homework,hedonics_b,hours_working_b,edu_confidence,aspired_occ_1,math_1,math_status_1,f1_xcurr,hours_working_1,hedonics_1,math_conf_1,hs_ac_load,hs_gpa,ps_level,effort,testing,v_facility,m_facility,academic_p,iq_by_concientiousness,delinquency,wages_cont,graduated_Y
0,3,1,10,61.29,66.44,1.7720,1.5960,1.951,2,3,2,1,0,8,10,63.02,6,5,0,1,1.3020,26,6,1.0,52.0,190.75,265.760,378.12,312,18028.640,4.7,2670500.0,1
1,2,1,10,52.98,56.70,-1.1060,0.3310,1.001,1,1,13,3,0,6,9,51.00,5,6,8,2,-0.2590,28,5,1.0,364.0,160.68,170.100,255.00,1820,17429.100,5.0,0.0,0
2,2,1,9,41.14,41.79,0.3330,-1.1940,-0.703,0,0,2,4,0,3,-1,43.86,5,1,0,2,-0.3042,21,4,1.0,42.0,126.79,125.370,219.30,168,7927.410,6.0,0.0,0
3,3,0,-1,69.05,67.17,1.7720,1.5960,1.951,1,1,10,3,0,6,-1,71.56,6,4,0,2,1.8110,26,6,1.0,260.0,207.78,268.680,429.36,1560,25129.440,0.0,4155600.0,1
4,3,0,10,65.82,65.33,0.5710,0.8930,0.539,1,0,2,3,2,8,10,65.92,6,3,1,3,-0.5100,26,5,1.0,52.0,197.07,261.320,395.52,260,18391.520,0.0,2364840.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976,2,0,10,64.42,61.01,-0.1708,0.0458,0.812,4,7,12,0,0,7,-1,63.07,6,6,6,1,1.3030,28,6,1.0,336.0,188.50,161.835,378.42,2016,14170.718,0.0,1790750.0,0
1977,1,0,-1,57.94,46.72,-0.6300,-0.7450,0.515,2,0,15,4,0,6,9,58.57,6,3,5,3,0.5280,25,4,1.0,375.0,163.23,93.440,351.42,1500,17794.400,0.0,8161500.0,1
1978,2,0,11,71.53,58.79,1.5330,-0.2300,0.733,0,1,34,3,1,7,11,70.78,6,2,4,1,0.8791,27,5,1.0,445.7,201.10,176.370,424.68,2492,25206.352,0.0,15082500.0,1
1979,1,1,10,48.41,46.23,0.1748,-0.3639,0.247,2,5,8,4,4,8,12,51.03,5,3,4,1,-0.0130,31,4,1.0,178.3,145.67,92.460,255.15,602,11150.495,5.0,0.0,0


Generating dummies from categoricals in the test set.

In [338]:
test_college_imp = pd.get_dummies(test_college_imp, 
            columns = ['aspired_occ_b','aspired_occ_1'])


Verifying the two sets have the same number of features.

In [339]:
print(train_college_imp.shape)
print(test_college_imp.shape)

(7920, 67)
(1981, 67)


Saving the pre-processed test set. 

In [340]:
test_college_imp.to_csv('test_college_imp.csv')

With these steps I have created the foundational datasets that will be used in the rest of the analysis.  