In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Leitura dos dados

Os dados estão disponíveis em uma planilha de Excel.

Com o Pandas, podemos ler o arquivo diretamente fornecendo a URL para a função `read_excel` ou `read_csv`

In [2]:
df = pd.read_excel('https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-HR-Employee-Attrition.xlsx?cm_mc_uid=16995342974615132859040&cm_mc_sid_50200000=1513285904&cm_mc_sid_52640000=1513285904')

In [4]:
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [5]:
all_columns = df.columns
numerical_columns = df._get_numeric_data().columns
categorical_columns = list(set(all_columns) - set(numerical_columns))
categorical_columns

['BusinessTravel',
 'EducationField',
 'Gender',
 'MaritalStatus',
 'OverTime',
 'Department',
 'JobRole',
 'Over18',
 'Attrition']

In [6]:
for c in categorical_columns:
    print("%s : %s" % (c, df[c].unique()))

BusinessTravel : ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
EducationField : ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Gender : ['Female' 'Male']
MaritalStatus : ['Single' 'Married' 'Divorced']
OverTime : ['Yes' 'No']
Department : ['Sales' 'Research & Development' 'Human Resources']
JobRole : ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
Over18 : ['Y']
Attrition : ['Yes' 'No']


In [7]:
label_encoders = {}
df_hr = df.copy()
for c in categorical_columns:
    le = LabelEncoder()
    le.fit(df_hr[c])
    label_encoders[c] = le
    df_hr[c] = le.transform(df_hr[c])

In [8]:
df_hr.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,7,...,4,80,1,6,3,3,2,2,2,2


In [9]:
df_hr['Attrition'].value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
tree = DecisionTreeClassifier(max_depth=3)

In [12]:
y = df_hr['Attrition']

In [13]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Attrition, dtype: int64

In [14]:
X = df_hr[list(set(df_hr.columns) - set(['Attrition', 'Attrition2']))]

In [15]:
X.head()

Unnamed: 0,EducationField,StandardHours,JobSatisfaction,EmployeeCount,Age,TotalWorkingYears,DailyRate,Department,PercentSalaryHike,RelationshipSatisfaction,...,MonthlyRate,EmployeeNumber,YearsWithCurrManager,EnvironmentSatisfaction,JobLevel,BusinessTravel,JobInvolvement,WorkLifeBalance,Gender,OverTime
0,1,80,4,1,41,8,1102,2,11,1,...,19479,1,5,2,2,2,3,1,0,1
1,1,80,2,1,49,10,279,1,23,4,...,24907,2,7,3,2,1,2,3,1,0
2,4,80,3,1,37,7,1373,1,15,2,...,2396,4,0,4,1,2,2,3,1,1
3,1,80,3,1,33,8,1392,1,11,3,...,23159,5,0,4,1,1,3,3,0,1
4,3,80,2,1,27,6,591,1,12,4,...,16632,7,2,1,1,2,3,3,1,0


In [16]:
X.columns

Index(['EducationField', 'StandardHours', 'JobSatisfaction', 'EmployeeCount',
       'Age', 'TotalWorkingYears', 'DailyRate', 'Department',
       'PercentSalaryHike', 'RelationshipSatisfaction', 'MaritalStatus',
       'Education', 'MonthlyIncome', 'PerformanceRating', 'JobRole', 'Over18',
       'HourlyRate', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'StockOptionLevel', 'DistanceFromHome', 'YearsInCurrentRole',
       'TrainingTimesLastYear', 'NumCompaniesWorked', 'MonthlyRate',
       'EmployeeNumber', 'YearsWithCurrManager', 'EnvironmentSatisfaction',
       'JobLevel', 'BusinessTravel', 'JobInvolvement', 'WorkLifeBalance',
       'Gender', 'OverTime'],
      dtype='object')

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [18]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
y_pred = tree.predict(X_test)

In [20]:
from sklearn.metrics import precision_score, accuracy_score, confusion_matrix

In [21]:
accuracy_score(y_test, y_pred)

0.85260770975056688

In [22]:
precision_score(y_test, y_pred)

0.61904761904761907

In [23]:
confusion_matrix(y_test, y_pred)

array([[363,   8],
       [ 57,  13]])

In [24]:
from sklearn.tree import export_graphviz

In [25]:
export_graphviz(tree, out_file='rh_tree.dot', feature_names = X.columns)

In [26]:
!dot -Tpng rh_tree.dot -o rh_tree.png

![Tree](rh_tree.png)

In [27]:
from sklearn.utils import resample

In [28]:
df_majority = df_hr[df_hr.Attrition==0]
df_minority = df_hr[df_hr.Attrition==1]

In [29]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=1233,    # to match majority class
                                 random_state=123) # reproducible results

In [30]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [31]:
df_upsampled.Attrition.value_counts()

1    1233
0    1233
Name: Attrition, dtype: int64

In [32]:
y = df_upsampled['Attrition']

In [33]:
df_upsampled.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [34]:
X = df_upsampled[list(set(df_upsampled.columns) - set(['Attrition', 'Attrition2']))]

In [35]:
X.columns

Index(['EducationField', 'StandardHours', 'JobSatisfaction', 'EmployeeCount',
       'Age', 'TotalWorkingYears', 'DailyRate', 'Department',
       'PercentSalaryHike', 'RelationshipSatisfaction', 'MaritalStatus',
       'Education', 'MonthlyIncome', 'PerformanceRating', 'JobRole', 'Over18',
       'HourlyRate', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'StockOptionLevel', 'DistanceFromHome', 'YearsInCurrentRole',
       'TrainingTimesLastYear', 'NumCompaniesWorked', 'MonthlyRate',
       'EmployeeNumber', 'YearsWithCurrManager', 'EnvironmentSatisfaction',
       'JobLevel', 'BusinessTravel', 'JobInvolvement', 'WorkLifeBalance',
       'Gender', 'OverTime'],
      dtype='object')

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [37]:
tree = DecisionTreeClassifier()

In [38]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [39]:
y_pred = tree.predict(X_test)

In [40]:
accuracy_score(y_test, y_pred)

0.91756756756756752

In [41]:
precision_score(y_test, y_pred)

0.86854460093896713

In [42]:
confusion_matrix(y_test, y_pred)

array([[309,  56],
       [  5, 370]])

In [43]:
from sklearn.ensemble import RandomForestClassifier


In [44]:
rf = RandomForestClassifier(n_estimators=500)

In [45]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
y_pred = rf.predict(X_test)

In [47]:
accuracy_score(y_test, y_pred)

0.97027027027027024

In [48]:
precision_score(y_test, y_pred)

0.95607235142118863

In [49]:
confusion_matrix(y_test, y_pred)

array([[348,  17],
       [  5, 370]])

In [50]:
from sklearn.svm import SVC

In [51]:
svc = SVC(kernel='rbf', C=1, gamma=1)

In [52]:
from sklearn.decomposition import PCA

In [53]:
pca = PCA(9)

In [54]:
X_transformed = pca.fit_transform(X)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y)

In [56]:
svc.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
y_pred = svc.predict(X_test)

In [58]:
accuracy_score(y_test, y_pred)

0.98055105348460292

In [59]:
precision_score(y_test, y_pred)

1.0

In [60]:
confusion_matrix(y_test, y_pred)

array([[336,   0],
       [ 12, 269]])

In [61]:
df_majority = df_hr[df_hr.Attrition==0]
df_minority = df_hr[df_hr.Attrition==1]

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
logist = LogisticRegression()

In [64]:
logist.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
y_pred = logist.predict(X_test)

In [66]:
accuracy_score(y_test, y_pred)

0.63209076175040524

In [67]:
precision_score(y_test, y_pred)

0.58437499999999998

In [68]:
confusion_matrix(y_test, y_pred)

array([[203, 133],
       [ 94, 187]])

In [216]:
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=237,     # to match minority class
                                 random_state=123) # reproducible results

In [217]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [219]:
df_downsampled.Attrition.value_counts()

1    237
0    237
Name: Attrition, dtype: int64

In [220]:
X = df_downsampled[list(set(df_downsampled.columns) - set(['Attrition', 'Attrition2']))]

In [223]:
y = df_downsampled['Attrition']

In [225]:
rf = RandomForestClassifier(n_estimators=100)

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [227]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [228]:
y_pred = rf.predict(X_test)

In [229]:
accuracy_score(y_test, y_pred)

0.67832167832167833

In [230]:
precision_score(y_test, y_pred)

0.71641791044776115

In [231]:
confusion_matrix(y_test, y_pred)

array([[49, 19],
       [27, 48]])

In [283]:
svc

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [284]:
import pickle

In [285]:
pickle.dump(svc, open('modelo_svc.out', 'wb'))

In [286]:
!ls -lrhat

total 4248
drwxr-xr-x  14 Sato  staff   476B 14 Dez 19:14 [34m..[m[m
drwxr-xr-x   3 Sato  staff   102B 14 Dez 19:15 [34m.ipynb_checkpoints[m[m
-rw-r--r--   1 Sato  staff    27K 14 Dez 20:43 rh_tree.out
-rw-r--r--   1 Sato  staff   1,5M 14 Dez 20:44 tree.png
-rw-r--r--   1 Sato  staff   1,3K 14 Dez 20:54 rh_tree.dot
-rw-r--r--@  1 Sato  staff    93K 14 Dez 20:54 rh_tree.png
-rw-r--r--   1 Sato  staff    54K 14 Dez 21:28 Exercício.ipynb
-rw-r--r--   1 Sato  staff   371K 14 Dez 21:28 modelo_svc.out
drwxr-xr-x   9 Sato  staff   306B 14 Dez 21:28 [34m.[m[m
