#### Analyst: Dhruv Singh <br> Report Name: Success Classifier, Phase 1 <br> Report Quarter, Year: FY 2011-2021 <br> Date Updated: 10/22/2021

# Phase I: Exploratory Data Analysis

In [1]:
# libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re

### Reading in Data

In [2]:
# Load the raw services extract.
# A forward slash keeps the relative path portable across operating systems and
# avoids the invalid "\S" escape sequence that a backslash inside a normal
# (non-raw) string literal produces.
df = pd.read_csv('1_readonly/Services.csv', low_memory=False)

In [3]:
# (rows, columns) of the frame as read from the CSV
df.shape

(165367, 78)

In [4]:
# Columns kept for the rest of the notebook (order is preserved in the frame).
subset = [
    'StateId',
    'ProgramTitle',
    'Age_AtReg',
    'Disability',
    'Gender',
    'CitizenStatus',
    'EducationLevel',
    'Race_Hispanic',
    'Race_NativeAmerican',
    'Race_Asian',
    'Race_AfricanAmerican',
    'Race_PacificIslander',
    'Race_White',
    'ActualstartDate',
    'ActualEndDate',
    'EmpName',
]

### Subsetting Data to Relevant Columns

In [5]:
# keep only the columns named in `subset`, in that order
df = df[subset]

In [6]:
# dtype of each retained column
df.dtypes

StateId                   int64
ProgramTitle             object
Age_AtReg                 int64
Disability              float64
Gender                    int64
CitizenStatus            object
EducationLevel          float64
Race_Hispanic             int64
Race_NativeAmerican       int64
Race_Asian                int64
Race_AfricanAmerican      int64
Race_PacificIslander      int64
Race_White                int64
ActualstartDate          object
ActualEndDate            object
EmpName                  object
dtype: object

### Missing Values

In [7]:
# number of missing entries in each column
df.isna().sum()

StateId                      0
ProgramTitle                 0
Age_AtReg                    0
Disability                 179
Gender                       0
CitizenStatus                0
EducationLevel            3181
Race_Hispanic                0
Race_NativeAmerican          0
Race_Asian                   0
Race_AfricanAmerican         0
Race_PacificIslander         0
Race_White                   0
ActualstartDate              0
ActualEndDate              834
EmpName                 110359
dtype: int64

In [8]:
# Check whether EducationLevel appears to be missing at random by comparing
# column means between rows with (False) and without (True) a recorded value.
# numeric_only=True restricts the aggregation to numeric columns: the frame also
# holds text columns, which pandas >= 2.0 refuses to average instead of
# silently dropping them.
df.groupby(df['EducationLevel'].isnull()).mean(numeric_only=True)

Unnamed: 0_level_0,StateId,Age_AtReg,Disability,Gender,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
EducationLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,265387.576166,36.957573,0.255174,1.480103,54.220161,1.589798,1.053352,1.043074,1.888831,1.039325,1.068199
True,287496.201823,34.092109,0.505501,1.453002,,2.340773,0.856334,0.849733,1.722414,0.847218,0.862622


In [9]:
# Replace every missing value, in every column, with 0.
# NOTE(review): per the null counts shown earlier this touches Disability,
# EducationLevel, ActualEndDate and EmpName. The Employed flag is later derived
# by comparing EmpName with 0, so it depends on this fill. 0 is also the
# "No schooling" code on the recoded education scale -- confirm that treating
# a missing EducationLevel as 0 is intended.
df = df.fillna(0)

### Adding Target Column: Employed

In [10]:
# Target flag: 1 when EmpName holds anything other than the 0 placeholder,
# otherwise 0.
df['Employed'] = (df['EmpName'] != 0).astype('int64')

In [11]:
# Class balance of the target: number of rows with Employed == 0 and == 1
df['Employed'].value_counts()

0    110359
1     55008
Name: Employed, dtype: int64

### Datetime Manipulations

##### Dropping Missing End Dates

In [12]:
# Parse both date columns to datetime64.
# End dates that cannot be parsed become NaT (errors='coerce'); start dates are
# parsed strictly and raise on bad input.
df['ActualEndDate'] = df['ActualEndDate'].pipe(pd.to_datetime, errors='coerce')
df['ActualstartDate'] = df['ActualstartDate'].pipe(pd.to_datetime)

In [13]:
# Keep only rows whose start date is on or before the end date.
# A comparison against NaT evaluates to False, so rows whose end date did not
# parse are removed here as well.
starts_before_end = df['ActualstartDate'] <= df['ActualEndDate']
df = df[starts_before_end]

##### Creating Length Variable

In [14]:
# (rows, columns) after the start/end date filter
df.shape

(164530, 17)

In [15]:
# Length = whole days elapsed between the start date and the end date
df['Length'] = df['ActualEndDate'].sub(df['ActualstartDate']).dt.days

In [16]:
# summary statistics of Length (days between start and end date)
df['Length'].describe()

count    164530.000000
mean         20.424306
std          72.866674
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        2975.000000
Name: Length, dtype: float64

##### Creating month and year variables

In [17]:
# Derive calendar month and year columns from the start and end dates:
# StartMonth, StartYear, EndMonth, EndYear (created in that order).
for prefix, date_col in (('Start', 'ActualstartDate'), ('End', 'ActualEndDate')):
    df[prefix + 'Month'] = df[date_col].dt.month
    df[prefix + 'Year'] = df[date_col].dt.year

### De-Duplicating Data

In [18]:
# column dtypes after the date parsing and derived date columns
df.dtypes

StateId                          int64
ProgramTitle                    object
Age_AtReg                        int64
Disability                     float64
Gender                           int64
CitizenStatus                   object
EducationLevel                 float64
Race_Hispanic                    int64
Race_NativeAmerican              int64
Race_Asian                       int64
Race_AfricanAmerican             int64
Race_PacificIslander             int64
Race_White                       int64
ActualstartDate         datetime64[ns]
ActualEndDate           datetime64[ns]
EmpName                         object
Employed                         int64
Length                           int64
StartMonth                       int64
StartYear                        int64
EndMonth                         int64
EndYear                          int64
dtype: object

In [19]:
# (rows, columns) before de-duplication
df.shape

(164530, 22)

In [20]:
# Service counts are inflated: the same service is often recorded on several
# rows. Order the frame by the columns that identify a service record.
sort_keys = ["StateId", "Age_AtReg", "ProgramTitle", "EmpName", "StartYear", "StartMonth"]
df = df.sort_values(by=sort_keys)

In [21]:
# Rows that agree on all of these columns count as the same service;
# only the first occurrence of each combination is kept.
dedup_keys = ["StateId", "ProgramTitle", "Age_AtReg", "EmpName", "StartYear", "StartMonth"]
is_repeat = df.duplicated(subset=dedup_keys, keep='first')
df = df[~is_repeat]

In [22]:
# (rows, columns) after de-duplication
df.shape

(85456, 22)

### Recoding Variables

Citizenship Variable (original):

	1: Citizen of U.S. or Territory
	2: Alien/Refugee Lawfully Admitted to US
	3: US Permanent Resident
	4: None of the Above

In [23]:
# distinct raw codes present in CitizenStatus
df.CitizenStatus.unique()

# 0's and blanks are not part of the data dictionary, so they are recoded to 4
# ("None of the Above") in a later cell

array(['1', '  ', '3', '2', '4', '0'], dtype=object)

In [24]:
# frequency of each raw CitizenStatus code
df.CitizenStatus.value_counts()

1     82534
3      1780
2       845
        227
4        57
0        13
Name: CitizenStatus, dtype: int64

In [25]:
# Citizenship Status: exchange codes "2" and "3"; every other value is kept.
# Each value is looked up from its original code, so no interim placeholder
# is needed to perform the swap.
swap_2_and_3 = {"2": "3", "3": "2"}
df["CitizenStatus"] = df["CitizenStatus"].map(lambda code: swap_2_and_3.get(code, code))

In [26]:
# frequencies after swapping codes 2 and 3
df.CitizenStatus.value_counts()

1     82534
2      1780
3       845
        227
4        57
0        13
Name: CitizenStatus, dtype: int64

In [27]:
# Every value other than "1", "2" or "3" (the 0's, the blanks, and "4" itself)
# is set to "4".
known_codes = ["1", "2", "3"]
df.loc[~df["CitizenStatus"].isin(known_codes), "CitizenStatus"] = "4"

In [28]:
# frequencies after folding 0's and blanks into code 4
df.CitizenStatus.value_counts()

1    82534
2     1780
3      845
4      297
Name: CitizenStatus, dtype: int64

In [29]:
# CitizenStatus now holds only the strings "1".."4"; store it as a number
df['CitizenStatus'] = df['CitizenStatus'].pipe(pd.to_numeric)

In [30]:
# confirm the column dtypes after converting CitizenStatus to numeric
df.dtypes

StateId                          int64
ProgramTitle                    object
Age_AtReg                        int64
Disability                     float64
Gender                           int64
CitizenStatus                    int64
EducationLevel                 float64
Race_Hispanic                    int64
Race_NativeAmerican              int64
Race_Asian                       int64
Race_AfricanAmerican             int64
Race_PacificIslander             int64
Race_White                       int64
ActualstartDate         datetime64[ns]
ActualEndDate           datetime64[ns]
EmpName                         object
Employed                         int64
Length                           int64
StartMonth                       int64
StartYear                        int64
EndMonth                         int64
EndYear                          int64
dtype: object

In [31]:
# Frequency of each code, column by column, for disability, gender and the
# race flags (NaN where a code never occurs in that column)
df[[
    "Disability",
    "Gender",
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
]].apply(lambda column: column.value_counts())

Unnamed: 0,Disability,Gender,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
0.0,79413.0,,68726.0,74274.0,75161.0,2648.0,75499.0,73060.0
1.0,3974.0,44563.0,1517.0,1371.0,484.0,73019.0,146.0,2585.0
2.0,,40795.0,,,,,,
9.0,2069.0,98.0,15213.0,9811.0,9811.0,9789.0,9811.0,9811.0


For race variables:

	1: Yes
	0: No
	9: Prefer not to disclose


For gender variable: 

	1: Male
	2: Female
	9: Prefer not to disclose

In [32]:
# Code 9 means "prefer not to disclose".
# For the race flags and disability it is folded into 0 (no); for gender,
# whose documented codes are 1 and 2, 0 becomes the undisclosed code.
undisclosed_to_zero = [
    # race
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
    # disability
    "Disability",
    # gender
    "Gender",
]
for column in undisclosed_to_zero:
    df.loc[df[column] == 9, column] = 0

In [33]:
# Same per-column code frequencies, after recoding 9 to 0
df[[
    "Disability",
    "Gender",
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
]].apply(lambda column: column.value_counts())

Unnamed: 0,Disability,Gender,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
0.0,81482.0,98,83939.0,84085.0,84972.0,12437.0,85310.0,82871.0
1.0,3974.0,44563,1517.0,1371.0,484.0,73019.0,146.0,2585.0
2.0,,40795,,,,,,


In [34]:
# frequency of each original EducationLevel code
df.EducationLevel.value_counts()

87.0    31815
16.0     9644
13.0     6530
14.0     6259
88.0     6194
17.0     5668
90.0     4845
15.0     2683
11.0     2600
91.0     2541
10.0     1938
0.0      1628
12.0     1215
9.0      1073
8.0       431
7.0       221
6.0        60
89.0       27
3.0        23
2.0        20
4.0        16
1.0        13
5.0        12
Name: EducationLevel, dtype: int64

Education Level (original):

	9: 9th Grade
	10: 10th Grade
	11: 11th grade
	12: 12th grade no Diploma
	13: HS Grad and 1 year of College or a Technical or Vocational School
	14: HS Grad and 2 years of College or a Technical or Vocational School
	15: HS Grad and 3 years of College or a Technical or Vocational School
	16: Bachelor’s Degree or equivalent
	17: Attained degree beyond a Bachelor’s degree
	87: Attained High School Diploma
	88: Attained GED or Equivalent
	90: Attained a postsecondary technical or vocational certificate (non-degree)
	91: Associate’s Degree

In [35]:
# Recode EducationLevel so that the codes are ordinal.
#
#   original -> recoded
#         88 -> 13
#         89 -> 13
#         87 -> 14
#         13 -> 15
#         14 -> 16
#         15 -> 17
#         90 -> 18
#         91 -> 19
#         16 -> 20
#         17 -> 21
#
# Codes that are not listed keep their value.
# NOTE(review): 89 does not appear in the data dictionary listed for this
# column; it is mapped to 13 together with 88 -- confirm its meaning.
#
# Each row is looked up from its original code in a single pass, so the
# overlapping source and target codes (13-17) cannot overwrite one another.
education_recode = {
    88: 13.0,
    89: 13.0,
    87: 14.0,
    13: 15.0,
    14: 16.0,
    15: 17.0,
    90: 18.0,
    91: 19.0,
    16: 20.0,
    17: 21.0,
}
df["EducationLevel"] = (
    df["EducationLevel"]
    .map(lambda code: education_recode.get(code, code))
    .astype("float64")
)

Education Level (recoded):

	0: No schooling
	1: 1st Grade
	2: 2nd Grade
	3: 3rd Grade
	4: 4th Grade
	5: 5th Grade
	6: 6th Grade
	7: 7th Grade
	8: 8th Grade
	9: 9th Grade
	10: 10th Grade
	11: 11th grade
	12: 12th grade no Diploma
	13: Attained GED or Equivalent
	14: Attained High School Diploma
	15: HS Grad and 1 year of College or a Technical or Vocational School
	16: HS Grad and 2 years of College or a Technical or Vocational School
	17: HS Grad and 3 years of College or a Technical or Vocational School
	18: Attained a postsecondary technical or vocational certificate (non-degree)
	19: Associate’s Degree
	20: Bachelor’s Degree or equivalent
	21: Attained degree beyond a Bachelor’s degree

In [36]:
# frequency of each EducationLevel code after the ordinal recode
df.EducationLevel.value_counts()

14.0    31815
20.0     9644
15.0     6530
16.0     6259
13.0     6221
21.0     5668
18.0     4845
17.0     2683
11.0     2600
19.0     2541
10.0     1938
0.0      1628
12.0     1215
9.0      1073
8.0       431
7.0       221
6.0        60
3.0        23
2.0        20
4.0        16
1.0        13
5.0        12
Name: EducationLevel, dtype: int64

### Describing Data

In [37]:
# summary statistics for the numeric columns of the cleaned frame
df.describe()

Unnamed: 0,StateId,Age_AtReg,Disability,Gender,CitizenStatus,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White,Employed,Length,StartMonth,StartYear,EndMonth,EndYear
count,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0,85456.0
mean,264793.393103,36.959979,0.046503,1.476233,1.051032,15.156373,0.017752,0.016043,0.005664,0.854463,0.001708,0.030249,0.332358,25.537001,6.298458,2016.64047,6.356078,2016.704304
std,163727.066018,13.15701,0.210574,0.501729,0.298426,3.683426,0.132049,0.125643,0.075045,0.352643,0.041299,0.171274,0.471061,82.082859,3.387316,2.844043,3.357473,2.827862
min,20.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2011.0,1.0,2011.0
25%,115430.0,25.0,0.0,1.0,1.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,2015.0,3.0,2015.0
50%,257591.0,35.0,0.0,1.0,1.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,2017.0,6.0,2017.0
75%,420994.75,48.0,0.0,2.0,1.0,18.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,9.0,2019.0,9.0,2019.0
max,646834.0,83.0,1.0,2.0,4.0,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2975.0,12.0,2021.0,12.0,2021.0


In [38]:
# final (rows, columns) of the cleaned frame
df.shape

(85456, 22)

### Summarizing

In [39]:
# Pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly so that the text and datetime
# columns stay out of the calculation: pandas >= 2.0 no longer drops them
# silently and raises when it meets a non-numeric column.
corr = df.select_dtypes(include='number').corr()
corr

Unnamed: 0,StateId,Age_AtReg,Disability,Gender,CitizenStatus,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White,Employed,Length,StartMonth,StartYear,EndMonth,EndYear
StateId,1.0,-0.216006,-0.010709,-0.033312,0.079597,0.013884,0.072346,0.00931,0.046276,-0.217824,0.025423,0.100618,0.027109,-0.004758,-0.000818,0.28753,-0.006517,0.289331
Age_AtReg,-0.216006,1.0,0.118714,-0.012475,0.010701,0.224762,0.009111,0.005683,0.001889,-0.061487,-0.016177,0.051361,0.003485,-0.065247,-0.001033,-0.074473,-0.004566,-0.079677
Disability,-0.010709,0.118714,1.0,-0.017118,-0.024544,0.041423,-0.002755,0.021338,0.021099,-0.018223,0.00432,0.068069,-0.032418,-0.018222,0.005413,0.024889,0.004319,0.023702
Gender,-0.033312,-0.012475,-0.017118,1.0,0.001496,0.119207,0.013345,0.010597,0.011345,-0.042004,-0.001429,0.010205,0.03367,-0.033526,0.012206,0.015211,0.005055,0.01346
CitizenStatus,0.079597,0.010701,-0.024544,0.001496,1.0,0.038048,0.099654,-0.020587,0.02158,-0.095997,-0.002327,0.041229,0.005876,-0.02396,-0.00745,-0.000663,-0.009458,-0.002378
EducationLevel,0.013884,0.224762,0.041423,0.119207,0.038048,1.0,0.031753,0.025857,0.043406,-0.144803,-0.004064,0.110214,0.091261,-0.097973,0.008691,0.071933,0.00208,0.065274
Race_Hispanic,0.072346,0.009111,-0.002755,0.013345,0.099654,0.031753,1.0,0.043492,0.054803,-0.203106,0.018042,0.149072,-0.006055,-0.006482,-0.001642,-0.001016,-0.00275,-0.001424
Race_NativeAmerican,0.00931,0.005683,0.021338,0.010597,-0.020587,0.025857,0.043492,1.0,0.077239,-0.045551,0.107478,0.171582,0.008964,-0.009839,0.001067,0.008053,-0.001503,0.007555
Race_Asian,0.046276,0.001889,0.021099,0.011345,0.02158,0.043406,0.054803,0.077239,1.0,-0.090896,0.106375,0.077714,0.00468,-0.009833,-6.7e-05,0.00806,-0.000852,0.007451
Race_AfricanAmerican,-0.217824,-0.061487,-0.018223,-0.042004,-0.095997,-0.144803,-0.203106,-0.045551,-0.090896,1.0,-0.025513,-0.310535,-0.028492,0.046731,0.00622,-0.059383,0.007784,-0.056239


In [40]:
# write the cleaned frame to CSV; the row index is not written
df.to_csv('2_cleaned_data/df.csv', index=False)