#### Analyst: Dhruv Singh <br> Report Name: Success Classifier, Phase 1.1: EDA <br> Report Quarter, Year: FY 2011-2021 <br> Date Updated: 10/22/2021

# Phase I. A: Exploratory Data Analysis

In [1]:
# libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re

### Reading in Data

In [2]:
# Load the raw services extract. The separator is a forward slash: the original
# backslash formed the invalid escape sequence "\S" and only resolved as a path on Windows.
df = pd.read_csv('1_readonly/Services.csv', low_memory=False)

In [3]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

(165367, 78)

In [4]:
# columns kept for the analysis, in the order they should appear in the frame
subset = [
    # identifiers / program
    'StateId',
    'ProgramTitle',
    # demographics
    'Age_AtReg',
    'Disability',
    'Gender',
    'CitizenStatus',
    'EducationLevel',
    # race indicators
    'Race_Hispanic',
    'Race_NativeAmerican',
    'Race_Asian',
    'Race_AfricanAmerican',
    'Race_PacificIslander',
    'Race_White',
    # service dates and employer
    'ActualstartDate',
    'ActualEndDate',
    'EmpName',
]

### Subsetting Data to Relevant Columns

In [5]:
# keep every row, but only the columns named in `subset` (in that order)
df = df.loc[:, subset]

In [6]:
# dtype of each retained column
df.dtypes

StateId                   int64
ProgramTitle             object
Age_AtReg                 int64
Disability              float64
Gender                    int64
CitizenStatus            object
EducationLevel          float64
Race_Hispanic             int64
Race_NativeAmerican       int64
Race_Asian                int64
Race_AfricanAmerican      int64
Race_PacificIslander      int64
Race_White                int64
ActualstartDate          object
ActualEndDate            object
EmpName                  object
dtype: object

### Missing Values

In [7]:
# number of missing entries in each column
df.isna().sum()

StateId                      0
ProgramTitle                 0
Age_AtReg                    0
Disability                 179
Gender                       0
CitizenStatus                0
EducationLevel            3181
Race_Hispanic                0
Race_NativeAmerican          0
Race_Asian                   0
Race_AfricanAmerican         0
Race_PacificIslander         0
Race_White                   0
ActualstartDate              0
ActualEndDate              834
EmpName                 110359
dtype: int64

In [8]:
# Check if EducationLevel appears to be missing at random:
# compare column means between rows where it is present (False) and missing (True).
# numeric_only=True restricts the means to numeric columns; without it, newer pandas
# raises on the text columns instead of silently dropping them.
df.groupby(df['EducationLevel'].isnull()).mean(numeric_only=True)

Unnamed: 0_level_0,StateId,Age_AtReg,Disability,Gender,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
EducationLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,265387.576166,36.957573,0.255174,1.480103,54.220161,1.589798,1.053352,1.043074,1.888831,1.039325,1.068199
True,287496.201823,34.092109,0.505501,1.453002,,2.340773,0.856334,0.849733,1.722414,0.847218,0.862622


In [9]:
# filling missing values
# Only the columns that need a 0 placeholder are filled:
#   Disability / EducationLevel -> 0 so they can be converted to integers later
#   EmpName -> 0, the "no employer" marker checked when the Employed target is built
# ActualEndDate is deliberately left missing so no integer 0 is written into a date column;
# it becomes NaT on datetime conversion and is dropped by the start <= end filter.
# NOTE(review): a filled 0 in EducationLevel is indistinguishable from a genuine 0 code — confirm this is intended.
df = df.fillna({'Disability': 0, 'EducationLevel': 0, 'EmpName': 0})

### Adding Target Column: Employed

In [10]:
# Employed = 1 when an employer name is recorded, otherwise 0.
# A missing employer may appear either as NaN or as the 0 placeholder; NaN != 0
# evaluates True, so NaN is excluded explicitly to avoid labelling it as employed.
has_employer = df["EmpName"].notna() & (df["EmpName"] != 0)
df['Employed'] = has_employer.astype('int64')

In [11]:
# class balance of the target variable (row count per Employed value)
df['Employed'].value_counts()

0    110359
1     55008
Name: Employed, dtype: int64

### Datetime Manipulations

##### Dropping Missing End Dates

In [12]:
# parse both date columns as datetimes;
# unparseable end dates become NaT (errors='coerce'), unparseable start dates raise
df = df.assign(
    ActualEndDate=pd.to_datetime(df['ActualEndDate'], errors='coerce'),
    ActualstartDate=pd.to_datetime(df['ActualstartDate']),
)

In [13]:
# keep only services whose start date is on or before the end date;
# rows whose end date is NaT compare as False and are therefore dropped as well
starts_before_end = df['ActualstartDate'] <= df['ActualEndDate']
df = df[starts_before_end]

##### Creating Length Variable

In [14]:
# (rows, columns) remaining after the start <= end date filter
df.shape

(164530, 17)

In [15]:
# service length in whole days: end date minus start date
duration = df['ActualEndDate'] - df['ActualstartDate']
df['Length'] = duration.dt.days

In [16]:
# summary statistics of service length (days)
df['Length'].describe()

count    164530.000000
mean         20.424306
std          72.866674
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        2975.000000
Name: Length, dtype: float64

##### Creating month and year variables

In [17]:
# calendar month and year of the service start and end dates
start_parts = df['ActualstartDate'].dt
end_parts = df['ActualEndDate'].dt
df['StartMonth'] = start_parts.month
df['StartYear'] = start_parts.year
df['EndMonth'] = end_parts.month
df['EndYear'] = end_parts.year

#### De-Duplicating Data

In [18]:
# column dtypes after adding Employed, Length and the month/year columns
df.dtypes

StateId                          int64
ProgramTitle                    object
Age_AtReg                        int64
Disability                     float64
Gender                           int64
CitizenStatus                   object
EducationLevel                 float64
Race_Hispanic                    int64
Race_NativeAmerican              int64
Race_Asian                       int64
Race_AfricanAmerican             int64
Race_PacificIslander             int64
Race_White                       int64
ActualstartDate         datetime64[ns]
ActualEndDate           datetime64[ns]
EmpName                         object
Employed                         int64
Length                           int64
StartMonth                       int64
StartYear                        int64
EndMonth                         int64
EndYear                          int64
dtype: object

In [19]:
# (rows, columns) before de-duplication
df.shape

(164530, 22)

In [20]:
# the same service is often recorded on several rows, which inflates the counts;
# order the rows by participant, program, employer and start year
sort_keys = ["StateId", "ProgramTitle", "EmpName", "StartYear"]
df = df.sort_values(by=sort_keys)

In [21]:
# keep only the first row for each (StateId, ProgramTitle, EmpName, StartYear) combination
repeat_rows = df.duplicated(subset=["StateId", "ProgramTitle", "EmpName", "StartYear"], keep='first')
df = df[~repeat_rows]

In [22]:
# (rows, columns) after de-duplication
df.shape

(38263, 22)

##### Converting float to int: disability, education

In [23]:
# Disability and EducationLevel are float columns at this point; store them as 64-bit integers
df = df.astype({"Disability": np.int64, "EducationLevel": np.int64})

### Recoding Variables

Citizenship Variable (original):

	1: Citizen of U.S. or Territory
	2: Alien/Refugee Lawfully Admitted to US
	3: US Permanent Resident
	4: None of the Above

In [24]:
# distinct raw codes present in CitizenStatus
df.CitizenStatus.unique()

# the data dictionary only defines codes 1-4, so '0' and blank values are recoded to '4' ("None of the Above") further down

array(['1', '  ', '3', '2', '4', '0'], dtype=object)

In [25]:
# row count per raw CitizenStatus code
df.CitizenStatus.value_counts()

1     37117
3       653
2       318
        141
4        29
0         5
Name: CitizenStatus, dtype: int64

In [26]:
# Citizenship Status: swap codes "2" and "3"
# both masks are taken from the original values first, so the two assignments cannot interfere
was_two = df["CitizenStatus"] == "2"
was_three = df["CitizenStatus"] == "3"
df.loc[was_three, "CitizenStatus"] = "2"
df.loc[was_two, "CitizenStatus"] = "3"

In [27]:
# row count per CitizenStatus code after swapping codes 2 and 3
df.CitizenStatus.value_counts()

1     37117
2       653
3       318
        141
4        29
0         5
Name: CitizenStatus, dtype: int64

In [28]:
# anything that is not "1", "2" or "3" (including "0" and blanks) becomes "4"
valid_codes = ["1", "2", "3"]
df.loc[~df["CitizenStatus"].isin(valid_codes), "CitizenStatus"] = "4"

In [29]:
# row count per CitizenStatus code after recoding everything outside 1-3 to 4
df.CitizenStatus.value_counts()

1    37117
2      653
3      318
4      175
Name: CitizenStatus, dtype: int64

In [30]:
# CitizenStatus holds digit strings; parse them into a numeric column
citizen_codes = pd.to_numeric(df['CitizenStatus'])
df['CitizenStatus'] = citizen_codes

In [31]:
# column dtypes after the CitizenStatus conversion
df.dtypes

StateId                          int64
ProgramTitle                    object
Age_AtReg                        int64
Disability                       int64
Gender                           int64
CitizenStatus                    int64
EducationLevel                   int64
Race_Hispanic                    int64
Race_NativeAmerican              int64
Race_Asian                       int64
Race_AfricanAmerican             int64
Race_PacificIslander             int64
Race_White                       int64
ActualstartDate         datetime64[ns]
ActualEndDate           datetime64[ns]
EmpName                         object
Employed                         int64
Length                           int64
StartMonth                       int64
StartYear                        int64
EndMonth                         int64
EndYear                          int64
dtype: object

In [32]:
# frequency of every code, one column of counts per coded variable
coded_columns = [
    "Disability",
    "Gender",
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
]
df[coded_columns].apply(pd.Series.value_counts)

Unnamed: 0,Disability,Gender,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
0,35695.0,,30866.0,33710.0,34092.0,1079.0,34194.0,33243.0
1,1665.0,20964.0,677.0,564.0,182.0,33202.0,80.0,1031.0
2,,17263.0,,,,,,
9,903.0,36.0,6720.0,3989.0,3989.0,3982.0,3989.0,3989.0


For race variables:

	1: Yes
	0: No
	9: Prefer not to disclose


For gender variable: 

	1: Male
	2: Female
	9: Prefer not to disclose

In [33]:
# Code 9 ("prefer not to disclose") is replaced by 0 in every coded column.
#   race and disability: 0 means "no", so undisclosed answers are counted as "no"
#   gender: 0 becomes the "undisclosed" code
undisclosed_columns = [
    # race
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
    # disability
    "Disability",
    # gender
    "Gender",
]
for column in undisclosed_columns:
    df.loc[df[column] == 9, column] = 0

In [34]:
# frequency of every code per coded variable, after recoding 9 to 0
recoded_columns = [
    "Disability",
    "Gender",
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
]
df[recoded_columns].apply(pd.Series.value_counts)

Unnamed: 0,Disability,Gender,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
0,36598.0,36,37586.0,37699.0,38081.0,5061.0,38183.0,37232.0
1,1665.0,20964,677.0,564.0,182.0,33202.0,80.0,1031.0
2,,17263,,,,,,


In [35]:
# row count per original EducationLevel code
df.EducationLevel.value_counts()

87    15128
16     3498
13     3012
88     2931
14     2714
90     2017
17     1846
11     1555
15     1155
10     1097
91      912
0       787
9       593
12      534
8       264
7       126
6        39
3        11
89       11
2        10
1         8
4         8
5         7
Name: EducationLevel, dtype: int64

Education Level (original):

	9: 9th Grade
	10: 10th Grade
	11: 11th grade
	12: 12th grade no Diploma
	13: HS Grad and 1 year of College or a Technical or Vocational School
	14: HS Grad and 2 years of College or a Technical or Vocational School
	15: HS Grad and 3 years of College or a Technical or Vocational School
	16: Bachelor’s Degree or equivalent
	17: Attained degree beyond a Bachelor’s degree
	87: Attained High School Diploma
	88: Attained GED or Equivalent
	90: Attained a postsecondary technical or vocational certificate (non-degree)
	91: Associate’s Degree

In [36]:
# recoding education level to make it ordinal
#
# original code -> ordinal code (codes 0-12 are left unchanged)
EDUCATION_RECODE = {
    88: 13,  # attained GED or equivalent
    89: 13,  # grouped with 88; NOTE(review): 89 is not in the data dictionary — confirm its meaning
    87: 14,  # attained high school diploma
    13: 15,  # HS grad and 1 year of college / technical / vocational school
    14: 16,  # HS grad and 2 years
    15: 17,  # HS grad and 3 years
    90: 18,  # postsecondary technical or vocational certificate (non-degree)
    91: 19,  # associate's degree
    16: 20,  # bachelor's degree or equivalent
    17: 21,  # degree beyond a bachelor's
}

# Each value is looked up once against its ORIGINAL code, so overlapping source and
# target codes (e.g. 13 -> 15 while 15 -> 17) cannot chain, and no interim placeholder
# codes (113-117, 187-191) are needed that could collide with values in the data.
df["EducationLevel"] = df["EducationLevel"].map(lambda code: EDUCATION_RECODE.get(code, code))

Education Level (recoded):

	0: No schooling
	1: 1st Grade
	2: 2nd Grade
	3: 3rd Grade
	4: 4th Grade
	5: 5th Grade
	6: 6th Grade
	7: 7th Grade
	8: 8th Grade
	9: 9th Grade
	10: 10th Grade
	11: 11th grade
	12: 12th grade no Diploma
	13: Attained GED or Equivalent
	14: Attained High School Diploma
	15: HS Grad and 1 year of College or a Technical or Vocational School
	16: HS Grad and 2 years of College or a Technical or Vocational School
	17: HS Grad and 3 years of College or a Technical or Vocational School
	18: Attained a postsecondary technical or vocational certificate (non-degree)
	19: Associate’s Degree
	20: Bachelor’s Degree or equivalent
	21: Attained degree beyond a Bachelor’s degree

In [37]:
# row count per EducationLevel code after the ordinal recode
df.EducationLevel.value_counts()

14    15128
20     3498
15     3012
13     2942
16     2714
18     2017
21     1846
11     1555
17     1155
10     1097
19      912
0       787
9       593
12      534
8       264
7       126
6        39
3        11
2        10
1         8
4         8
5         7
Name: EducationLevel, dtype: int64

### Describing Data

In [38]:
# summary statistics of the cleaned frame
df.describe()

Unnamed: 0,StateId,Age_AtReg,Disability,Gender,CitizenStatus,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White,Employed,Length,StartMonth,StartYear,EndMonth,EndYear
count,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0,38263.0
mean,269411.827457,35.537647,0.043515,1.450226,1.047409,14.735201,0.017693,0.01474,0.004757,0.867731,0.002091,0.026945,0.28095,26.379322,6.074406,2016.488671,6.17466,2016.551551
std,163979.894088,13.070649,0.204015,0.49941,0.298709,3.630815,0.131836,0.120512,0.068804,0.338787,0.045678,0.161925,0.449469,85.667889,3.479703,2.896886,3.423544,2.886643
min,20.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2011.0,1.0,2011.0
25%,124618.0,24.0,0.0,1.0,1.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,2014.0,3.0,2015.0
50%,264403.0,33.0,0.0,1.0,1.0,14.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.0,2017.0,6.0,2017.0
75%,423754.0,47.0,0.0,2.0,1.0,16.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,9.0,2019.0,9.0,2019.0
max,646834.0,83.0,1.0,2.0,4.0,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2975.0,12.0,2021.0,12.0,2021.0


In [39]:
# (rows, columns) of the cleaned frame
df.shape

(38263, 22)

### Summarizing

In [40]:
# Look at the correlation matrix
# Only numeric columns are correlated. Selecting them explicitly keeps this cell working
# on pandas versions where DataFrame.corr() raises on text columns instead of dropping them;
# the datetime columns are excluded as well.
corr = df.select_dtypes(include='number').corr()
corr
# corr.to_csv('corr.csv')

Unnamed: 0,StateId,Age_AtReg,Disability,Gender,CitizenStatus,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White,Employed,Length,StartMonth,StartYear,EndMonth,EndYear
StateId,1.0,-0.236235,-0.007725,-0.025417,0.038824,-0.004842,0.069946,0.010288,0.035239,-0.202039,0.027061,0.082317,0.024101,0.009546,0.016978,0.305266,0.007147,0.308097
Age_AtReg,-0.236235,1.0,0.113534,-0.043074,0.009524,0.208703,0.003549,0.000428,0.003928,-0.059192,-0.017292,0.050119,0.020021,-0.085002,-0.026471,-0.104088,-0.029082,-0.111085
Disability,-0.007725,0.113534,1.0,-0.020425,-0.018842,0.036655,-0.00239,0.017495,0.009459,-0.005208,0.009869,0.041247,-0.029295,-0.020109,0.009981,0.02823,0.002065,0.0275
Gender,-0.025417,-0.043074,-0.020425,1.0,-0.005731,0.110807,0.012384,0.011322,0.010693,-0.037754,0.004562,0.013515,0.041109,-0.001771,0.01996,0.00451,0.01431,0.005066
CitizenStatus,0.038824,0.009524,-0.018842,-0.005731,1.0,0.012081,0.094841,-0.016509,0.025906,-0.097122,0.002313,0.035728,0.004352,-0.018882,-0.00528,0.001376,-0.008276,0.000106
EducationLevel,-0.004842,0.208703,0.036655,0.110807,0.012081,1.0,0.016668,0.023973,0.037683,-0.126084,-0.003911,0.105979,0.093169,-0.078607,-0.007125,0.062653,-0.01441,0.057173
Race_Hispanic,0.069946,0.003549,-0.00239,0.012384,0.094841,0.016668,1.0,0.036225,0.065634,-0.216773,0.032917,0.139273,0.006967,0.001882,-0.004066,-0.004642,-0.008295,-0.004148
Race_NativeAmerican,0.010288,0.000428,0.017495,0.011322,-0.016509,0.023973,0.036225,1.0,0.051432,-0.06171,0.094106,0.163134,0.006053,-0.006754,2e-06,0.012007,-0.002503,0.011715
Race_Asian,0.035239,0.003928,0.009459,0.010693,0.025906,0.037683,0.065634,0.051432,1.0,-0.097464,0.104942,0.051834,0.002423,-0.008106,-0.001697,0.000664,-0.00042,-5e-05
Race_AfricanAmerican,-0.202039,-0.059192,-0.005208,-0.037754,-0.097122,-0.126084,-0.216773,-0.06171,-0.097464,1.0,-0.034484,-0.308069,-0.040868,0.039547,0.011674,-0.055227,0.016223,-0.052664


In [41]:
# write the cleaned frame to disk without the index column
cleaned_path = '2_data/1_cleaned_data.csv'
df.to_csv(cleaned_path, index=False)