#### Analyst: Dhruv Singh <br> Report Name: Success Classifier, Phase 1 <br> Report Quarter, Year: FY 2011-2021 <br> Date Updated: 10/22/2021

# Phase I: Exploratory Data Analysis

In [1]:
# libraries
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

%matplotlib inline

### Reading in Data

In [3]:
# Load the raw services extract. A forward slash is used as the separator so the
# path resolves on every OS and contains no backslash escape sequence.
# low_memory=False makes pandas read the file in one pass when inferring dtypes.
df = pd.read_csv('1_readonly/Services.csv', low_memory=False)

In [4]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

(165367, 78)

In [5]:
# Columns carried forward into the analysis, grouped by role.
subset = [
    # identifier and program
    'StateId',
    'ProgramTitle',
    # demographics
    'Age_AtReg',
    'Disability',
    'Gender',
    'CitizenStatus',
    'EducationLevel',
    # race indicators
    'Race_Hispanic',
    'Race_NativeAmerican',
    'Race_Asian',
    'Race_AfricanAmerican',
    'Race_PacificIslander',
    'Race_White',
    # service dates
    'ActualstartDate',
    'ActualEndDate',
    # employer name (used later to derive the target)
    'EmpName',
]

### Subsetting Data to Relevant Columns

In [6]:
# Keep only the columns named in `subset`, in that order
df = df.loc[:, subset]

In [8]:
# dtype of each retained column
df.dtypes

StateId                   int64
ProgramTitle             object
Age_AtReg                 int64
Disability              float64
Gender                    int64
CitizenStatus            object
EducationLevel          float64
Race_Hispanic             int64
Race_NativeAmerican       int64
Race_Asian                int64
Race_AfricanAmerican      int64
Race_PacificIslander      int64
Race_White                int64
ActualstartDate          object
ActualEndDate            object
EmpName                  object
dtype: object

### Missing Values

In [9]:
# Number of missing (NaN) entries in each column
df.isna().sum()

StateId                      0
ProgramTitle                 0
Age_AtReg                    0
Disability                 179
Gender                       0
CitizenStatus                0
EducationLevel            3181
Race_Hispanic                0
Race_NativeAmerican          0
Race_Asian                   0
Race_AfricanAmerican         0
Race_PacificIslander         0
Race_White                   0
ActualstartDate              0
ActualEndDate              834
EmpName                 110359
dtype: int64

In [10]:
# Check if EducationLevel appears to be missing at random: compare the column
# means of rows with a missing EducationLevel (True) against the rest (False).
# numeric_only=True restricts the average to numeric columns; the frame still
# holds text columns here (e.g. ProgramTitle, EmpName), which pandas >= 2.0
# refuses to average instead of silently skipping.
df.groupby(df['EducationLevel'].isnull()).mean(numeric_only=True)

Unnamed: 0_level_0,StateId,Age_AtReg,Disability,Gender,EducationLevel,Race_Hispanic,Race_NativeAmerican,Race_Asian,Race_AfricanAmerican,Race_PacificIslander,Race_White
EducationLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
False,265387.576166,36.957573,0.255174,1.480103,54.220161,1.589798,1.053352,1.043074,1.888831,1.039325,1.068199
True,287496.201823,34.092109,0.505501,1.453002,,2.340773,0.856334,0.849733,1.722414,0.847218,0.862622


In [11]:
# Replace every missing value, in every column, with 0.
# The 0 written into EmpName is what the Employed target is derived from below.
df = df.fillna(value=0)

### Adding Target Column: Employed

In [12]:
# Employed is 1 when EmpName holds anything other than 0 and 0 otherwise
# (missing employer names were filled with 0 above).
df['Employed'] = (df["EmpName"] != 0).astype('int64')

In [13]:
# See the distribution of our target variable:
# row counts for Employed == 0 and Employed == 1 (class balance)
df['Employed'].value_counts()

0    110359
1     55008
Name: Employed, dtype: int64

### Datetime Manipulations

##### Dropping Missing End Dates

In [14]:
# Parse both date columns into datetime64.
# errors='coerce' on the end date turns values that cannot be parsed into NaT
# instead of raising; the start date is parsed strictly.
for date_col, parse_opts in (('ActualEndDate', {'errors': 'coerce'}),
                             ('ActualstartDate', {})):
    df[date_col] = pd.to_datetime(df[date_col], **parse_opts)

In [15]:
# Keep only rows whose start date is on or before the end date.
# A NaT end date compares as False, so rows without a usable end date are
# dropped by the same filter.
has_valid_range = df['ActualstartDate'] <= df['ActualEndDate']
# .copy() yields an independent frame, so the column assignments in later cells
# do not raise SettingWithCopyWarning.
df = df[has_valid_range].copy()

##### Creating Length Variable

In [16]:
# (rows, columns) remaining after the start <= end date filter
df.shape

(164530, 17)

In [17]:
# Service length in whole days: end date minus start date
df['Length'] = df['ActualEndDate'].sub(df['ActualstartDate']).dt.days

In [19]:
# Summary statistics of the service length in days
df['Length'].describe()

count    164530.000000
mean         20.424306
std          72.866674
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max        2975.000000
Name: Length, dtype: float64

##### Creating month and year variables

In [21]:
# Calendar month and year of the start and end dates, one column each
start_parts = df['ActualstartDate'].dt
end_parts = df['ActualEndDate'].dt
df['StartMonth'] = start_parts.month
df['StartYear'] = start_parts.year
df['EndMonth'] = end_parts.month
df['EndYear'] = end_parts.year

#### De-Duplicating Data

In [22]:
# dtypes after the datetime conversion and the derived Length/month/year columns
df.dtypes

StateId                          int64
ProgramTitle                    object
Age_AtReg                        int64
Disability                     float64
Gender                           int64
CitizenStatus                   object
EducationLevel                 float64
Race_Hispanic                    int64
Race_NativeAmerican              int64
Race_Asian                       int64
Race_AfricanAmerican             int64
Race_PacificIslander             int64
Race_White                       int64
ActualstartDate         datetime64[ns]
ActualEndDate           datetime64[ns]
EmpName                         object
Employed                         int64
Length                           int64
StartMonth                       int64
StartYear                        int64
EndMonth                         int64
EndYear                          int64
dtype: object

In [23]:
# (rows, columns) before de-duplication
df.shape

(164530, 22)

In [24]:
# The same service is often recorded on several rows, which inflates counts.
# Order the rows by the fields that identify a service before de-duplicating.
service_sort_keys = [
    "StateId",
    "Age_AtReg",
    "ProgramTitle",
    "EmpName",
    "StartYear",
    "StartMonth",
]
df = df.sort_values(service_sort_keys)

In [25]:
# Treat rows sharing person, program, age, employer and start month/year as one
# service and keep only the first of each group.
service_identity = [
    "StateId",
    "ProgramTitle",
    "Age_AtReg",
    "EmpName",
    "StartYear",
    "StartMonth",
]
df = df.drop_duplicates(subset=service_identity, keep='first')

In [26]:
# (rows, columns) after de-duplication
df.shape

(85456, 22)

### Dropping StateId, Date, and EmpName Columns

In [None]:
# Remove the identifier, the two raw date columns and the employer name
df = df.drop(columns=['StateId', 'ActualstartDate', 'ActualEndDate', 'EmpName'])

In [None]:
# (rows, columns) after removing StateId, the date columns and EmpName
df.shape

In [None]:
# remaining columns and their dtypes
df.dtypes

#### One-Hot Encoding Program Title

In [None]:
# One indicator column per distinct ProgramTitle value
one_hot = pd.get_dummies(df['ProgramTitle'])
# Swap the text column for its indicator columns (joined on the row index)
df = df.drop(columns='ProgramTitle').join(one_hot)

In [None]:
# dtypes after one-hot encoding ProgramTitle
df.dtypes

In [None]:
# Short column names for the program indicator columns
program_short_names = {
    "Back to Work 50+ DC": "BTW",
    "DC Career Connections (DCCC)": "DCCC",
    "DC Infrastructure Academy": "DCIA",
    "L.E.A.P. (Learn, Earn, Advance, Prosper)": "LEAP",
    "Local Training Account": "LTA",
    "Pre-Apprenticeship Program": "Pre-App",
    "Title I - Workforce Development (WIOA)": "WIOA",
    "Transitional Employment Program (Way to Work)": "TEP",
}
df = df.rename(columns=program_short_names)

In [None]:
# column names and dtypes after the rename
df.dtypes

### Recoding Variables

In [None]:
# Citizenship Status: collapse to three string codes.
#   "1" stays "1", "3" becomes "2", every other value becomes "3".
# A temporary "2_interim" marker keeps the rows moved out of "3" apart from the
# catch-all "3" bucket while the recode is in progress.
citizen = df["CitizenStatus"]
citizen = citizen.mask(citizen == "3", "2_interim")
citizen = citizen.where(citizen.isin(["1", "2_interim"]), "3")
citizen = citizen.mask(citizen == "2_interim", "2")
df["CitizenStatus"] = citizen

In [None]:
# Turn the string codes in CitizenStatus into numbers
df['CitizenStatus'] = df['CitizenStatus'].pipe(pd.to_numeric)

In [None]:
# dtypes after converting CitizenStatus with pd.to_numeric
df.dtypes

In [None]:
# Recode the value 9 to 0 in the disability, gender and race columns.
# NOTE(review): 9 is presumably an "unknown / not disclosed" code — confirm
# against the data dictionary.
nine_to_zero_columns = [
    "Disability",
    "Gender",
    "Race_Hispanic",
    "Race_NativeAmerican",
    "Race_Asian",
    "Race_AfricanAmerican",
    "Race_PacificIslander",
    "Race_White",
]
for column in nine_to_zero_columns:
    df.loc[df[column] == 9, column] = 0

In [None]:
# Recode EducationLevel so the codes form an ordinal scale.
# Each raw code is mapped straight to its final rank in a single pass, so no
# temporary sentinel codes are needed and a code cannot be remapped twice
# (e.g. raw 13 -> 15 is not subsequently picked up by the 15 -> 17 rule).
education_rank = {
    88: 13,
    89: 13,
    87: 14,
    13: 15,
    14: 16,
    15: 17,
    90: 18,
    91: 19,
    16: 20,
    17: 21,
}
raw_education = df["EducationLevel"]
# Codes without an entry in the table map to NaN; fillna restores their raw value.
df["EducationLevel"] = raw_education.map(education_rank).fillna(raw_education)

### Describing Data

In [None]:
# summary statistics of the recoded frame
df.describe()

In [None]:
# (rows, columns) of the recoded frame
df.shape

### Summarizing

In [None]:
# Look at the correlation matrix: pairwise correlations between the columns of df
corr = df.corr()
# bare name so the notebook renders the matrix
corr
# optional export of the matrix, left disabled:
# corr.to_csv('corr.csv')

In [None]:
# Summary statistics of Employed within each EducationLevel code
df['Employed'].groupby(df['EducationLevel']).describe()

#### T Tests

In [None]:
def describe_cont_feature(feature):
    """Print a per-class summary and a t-test report for one column of ``df``.

    Parameters
    ----------
    feature : str
        Name of the column in the notebook-level ``df`` to summarise.

    Prints a header, the ``describe()`` table of ``feature`` grouped by
    ``Employed``, and the t-test line produced by ``ttest``. Returns None.
    """
    print('\n*** Results for {} ***'.format(feature))
    print(df.groupby('Employed')[feature].describe())
    # ttest() prints its own report and returns None, so it is called directly;
    # wrapping it in print() would emit an extra "None" line.
    ttest(feature)
    
def ttest(feature):
    """Print a two-sample t-test of ``feature`` between employed and other rows.

    Rows of the notebook-level ``df`` are split on ``Employed == 1`` versus
    ``Employed == 0``. The test is run with ``equal_var=False``, i.e. without
    assuming equal variances. The statistic and p-value are printed; nothing is
    returned.
    """
    is_employed = df['Employed'] == 1
    is_not_employed = df['Employed'] == 0
    result = stats.ttest_ind(
        df[is_employed][feature],
        df[is_not_employed][feature],
        equal_var=False,
    )
    tstat, pval = result
    print('t-statistic: {:.1f}, p-value: {:.3}'.format(tstat, pval))

In [None]:
# Summarise each feature separately for the two levels of the target variable
features_to_describe = ('Age_AtReg', 'Disability', 'Gender', 'CitizenStatus', 'EducationLevel')
for feature in features_to_describe:
    describe_cont_feature(feature)

In [None]:
# Look at the average value of each feature based on whether Age_AtReg is missing.
# NOTE(review): missing values were replaced by fillna(0) earlier in the notebook,
# so this presumably yields a single "False" group — confirm the check is still wanted.
df.groupby(df['Age_AtReg'].isnull()).mean()

### Plotting Features

In [None]:
# Plot overlaid histograms for continuous features: one figure per feature,
# not-employed rows in red and employed rows in green on shared bins.
for i in ['Age_AtReg', 'EducationLevel']:
    unemployed = df.loc[df['Employed'] == 0, i].dropna()
    employed = df.loc[df['Employed'] == 1, i].dropna()
    xmin = min(unemployed.min(), employed.min())
    xmax = max(unemployed.max(), employed.max())
    # 41 evenly spaced edges give 40 bins whose last edge is exactly xmax, so
    # the largest values are counted (np.arange would stop short of xmax).
    bin_edges = np.linspace(xmin, xmax, 41)
    fig, ax = plt.subplots()
    # Drawn with matplotlib directly; alpha keeps both groups visible where they overlap.
    ax.hist(unemployed, bins=bin_edges, color='r', alpha=0.4, label='Did not gain employment')
    ax.hist(employed, bins=bin_edges, color='g', alpha=0.4, label='Employed')
    ax.legend()
    ax.set_title('Overlaid histogram for {}'.format(i))
    ax.set_xlabel(i)
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Point plot of the mean of Employed at each level of the ordinal features,
# with the y axis fixed to the 0-1 range
for col in ('CitizenStatus', 'EducationLevel'):
    sns.catplot(data=df, x=col, y='Employed', kind='point', aspect=2)
    plt.ylim(0, 1)

#### Number of Unique Values

In [None]:
# Print how many distinct values each column holds
line_template = '{}: {} unique values'
for col in df.columns:
    distinct_count = df[col].nunique()
    print(line_template.format(col, distinct_count))

In [None]:
# Employment rate (mean of Employed) for each Gender code
df.groupby('Gender')[['Employed']].mean()

In [None]:
# Employment rate (mean of Employed) for each CitizenStatus code
df.groupby('CitizenStatus')[['Employed']].mean()

In [None]:
# run this line when you have the employed variable
# and have not yet filled missing values

# Is education level missing at random?
# df[['EducationLevel', 'Employed']].groupby(df['EducationLevel'].isnull()).mean()

In [None]:
# Row count and employment rate for every CitizenStatus x Gender combination
df.pivot_table(
    values='Employed',
    index=['CitizenStatus', 'Gender'],
    aggfunc=['count', 'mean'],
)

In [None]:
# Write the cleaned frame to cleaned_data/df.csv without the row index.
output_path = Path('cleaned_data') / 'df.csv'
# to_csv does not create missing folders, so make sure the target exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)