### HR Analytics

In [10]:
# Lectura de datos de csv
import pandas as pd
filename = 'HR_comma_sep.csv'
df = pd.read_csv(filename)

In [11]:
df.describe(include='all')

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999,14999
unique,,,,,,,,,10,3
top,,,,,,,,,sales,low
freq,,,,,,,,,4140,7316
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268,,
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281,,
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,,
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,,
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0,,
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0,,


#### Pequeño Análisis Exploratorio

In [12]:
df['left'].describe()

count    14999.000000
mean         0.238083
std          0.425924
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: left, dtype: float64

In [13]:
# Desbalanceo?
df['left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

#### Posibles categóricas

In [14]:
df['promotion_last_5years'].value_counts()

0    14680
1      319
Name: promotion_last_5years, dtype: int64

In [15]:
df['Work_accident'].value_counts()

0    12830
1     2169
Name: Work_accident, dtype: int64

In [16]:
df['sales'].value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: sales, dtype: int64

In [17]:
df['salary'].value_counts()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

#### Ponemos one hot en las categóricas

In [18]:

cols = ['sales', 'salary']
df = pd.get_dummies(df, columns = cols )

In [19]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.8,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4,0.37,0.52,2,159,3,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0


#### Split estándar Train/Test 

In [20]:
import numpy as np
def get_train_test(df, y_col, ratio):
    mask = np.random.rand(len(df)) < ratio
    df_train = df[mask]
    df_test = df[~mask]
    
    Y_train = df_train[y_col].values
    Y_test = df_test[y_col].values
    del df_train[y_col]
    del df_test[y_col]
 
    X_train = df_train.values
    X_test = df_test.values
    return X_train, Y_train, X_test, Y_test

RANDOM_SEED = 111
np.random.seed (RANDOM_SEED)
y_col = 'left'
train_test_ratio = 0.7
X_train, Y_train, X_test, Y_test = get_train_test(df, y_col, train_test_ratio)

In [21]:
df.shape

(14999, 21)

In [22]:
X_train.shape

(10442, 20)

### Entrenamos el clasificador: un Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification

clf = RandomForestClassifier(criterion='gini', random_state=RANDOM_SEED)
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=111, verbose=0, warm_start=False)

El clasificador ya tiene la importancia directamente incorporada

In [24]:
clf.feature_importances_

array([ 0.30492223,  0.13714603,  0.15632288,  0.12943454,  0.22106429,
        0.01270287,  0.00155311,  0.00134647,  0.00164479,  0.00170478,
        0.00131802,  0.00127354,  0.00117367,  0.00104622,  0.00293155,
        0.00313983,  0.00356503,  0.0060701 ,  0.00704517,  0.00459487])

In [25]:
threshold = 0.05
importantFeatures = [index for (index, importance) in enumerate (clf.feature_importances_)
                     if importance > threshold]
importantFeatures

[0, 1, 2, 3, 4]

### Columnas importantes

In [26]:
selection = df.columns[importantFeatures]
selection

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company'],
      dtype='object')

#### Evaluamos performance
¿Tendrá diferencias si eliminamos las columnas menos importantes?

In [27]:
clf.score(X_test, Y_test)

0.98661400043888525

In [28]:
from sklearn import metrics

pred = clf.predict(X_test)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, pred)
metrics.auc(fpr, tpr)

0.97543566612793053

In [29]:
selection = selection.insert(len (selection), 'left')
dfSelection = df[selection]
dfSelection

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,left
0,0.38,0.53,2,157,3,1
1,0.80,0.86,5,262,6,1
2,0.11,0.88,7,272,4,1
3,0.72,0.87,5,223,5,1
4,0.37,0.52,2,159,3,1
5,0.41,0.50,2,153,3,1
6,0.10,0.77,6,247,4,1
7,0.92,0.85,5,259,5,1
8,0.89,1.00,5,224,5,1
9,0.42,0.53,2,142,3,1


In [30]:
X_train1, Y_train1, X_test1, Y_test1 = get_train_test(dfSelection, y_col, train_test_ratio)
clfred = RandomForestClassifier(criterion='gini', random_state=RANDOM_SEED)
clfred.fit(X_train1, Y_train1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=111, verbose=0, warm_start=False)

In [31]:
clfred.score(X_test1, Y_test1)

0.98712737127371275

In [32]:
pred = clfred.predict(X_test1)
fpr, tpr, thresholds = metrics.roc_curve(Y_test1, pred)
metrics.auc(fpr, tpr)

0.97655405338385393

## Insertamos ruido

In [33]:
filename = 'HR_comma_sep.csv'
df = pd.read_csv(filename)

In [34]:
cols = ['sales', 'salary']
dfRuido = pd.get_dummies(df, columns = cols )

In [35]:
numRandomExtra = 20
mu = 0
sigma = 1
for j in range(numRandomExtra):
    name = "randomCol" + str(j +1)
    dfRuido[name] = np.random.normal(mu, sigma, dfRuido.shape[0])


dfRuido.head(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,...,randomCol11,randomCol12,randomCol13,randomCol14,randomCol15,randomCol16,randomCol17,randomCol18,randomCol19,randomCol20
0,0.38,0.53,2,157,3,0,1,0,0,0,...,0.045001,-2.110347,0.581363,-0.935612,0.233159,-0.231303,0.521065,-1.008143,1.521878,-0.488156
1,0.8,0.86,5,262,6,0,1,0,0,0,...,-1.167138,0.601079,-0.122554,0.326476,-2.987942,-0.468813,-1.481487,1.556378,-1.359711,-0.913871
2,0.11,0.88,7,272,4,0,1,0,0,0,...,-0.863938,0.737363,-0.067015,0.207785,0.819725,-0.28827,-1.098499,0.587468,0.260434,-0.419024
3,0.72,0.87,5,223,5,0,1,0,0,0,...,0.334084,0.26599,-1.406164,1.987137,0.267195,0.759235,-0.604976,-0.223552,-0.560891,0.439037
4,0.37,0.52,2,159,3,0,1,0,0,0,...,-1.930425,-0.414509,0.69083,-0.077957,1.118692,-1.137772,-0.611787,1.406306,0.102402,-0.92735


In [36]:
X_trainRuido, Y_trainRuido, X_testRuido, Y_testRuido = get_train_test(dfRuido, y_col, train_test_ratio)

In [37]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import make_classification
clf = RandomForestClassifier(criterion='gini', random_state=RANDOM_SEED)
clf.fit(X_trainRuido, Y_trainRuido)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=111, verbose=0, warm_start=False)

In [38]:
clf.feature_importances_

array([ 0.25843692,  0.10854137,  0.17960657,  0.11999687,  0.14415972,
        0.00830299,  0.00105259,  0.00081368,  0.00103844,  0.00056485,
        0.00057861,  0.00036273,  0.0006333 ,  0.0003327 ,  0.00127643,
        0.00067668,  0.00153869,  0.00487396,  0.0056746 ,  0.00278497,
        0.0081045 ,  0.00771159,  0.00697458,  0.00721949,  0.00950905,
        0.00833033,  0.00814239,  0.00770719,  0.00761337,  0.00899764,
        0.00741422,  0.00703435,  0.00897903,  0.00817724,  0.00828749,
        0.0058814 ,  0.00875335,  0.00868681,  0.00803112,  0.00719819])

In [39]:
threshold = 0.05
importantFeatures = [index for (index, importance) in enumerate (clf.feature_importances_)
                     if importance > threshold]
importantFeatures

[0, 1, 2, 3, 4]

In [40]:
importancias = clf.feature_importances_
index = np.argmax (dfRuido.columns == 'left')
importancias = np.insert (importancias, index, 1.0)

### Listado de importancias

In [41]:
dfImpor = pd.DataFrame(
    {'name': dfRuido.columns,
     'importance': importancias
    })
dfImpor.sort_values(by=['importance'], ascending=False)

Unnamed: 0,importance,name
6,1.0,left
0,0.258437,satisfaction_level
2,0.179607,number_project
4,0.14416,time_spend_company
3,0.119997,average_montly_hours
1,0.108541,last_evaluation
25,0.009509,randomCol5
30,0.008998,randomCol10
33,0.008979,randomCol13
37,0.008753,randomCol17


### Eliminamos columnas importantes

In [42]:
cols = dfRuido.columns[:]
cols

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales_IT', 'sales_RandD', 'sales_accounting',
       'sales_hr', 'sales_management', 'sales_marketing', 'sales_product_mng',
       'sales_sales', 'sales_support', 'sales_technical', 'salary_high',
       'salary_low', 'salary_medium', 'randomCol1', 'randomCol2', 'randomCol3',
       'randomCol4', 'randomCol5', 'randomCol6', 'randomCol7', 'randomCol8',
       'randomCol9', 'randomCol10', 'randomCol11', 'randomCol12',
       'randomCol13', 'randomCol14', 'randomCol15', 'randomCol16',
       'randomCol17', 'randomCol18', 'randomCol19', 'randomCol20'],
      dtype='object')

In [43]:
selectionRuido = dfRuido.columns[4:]
dfSelectionRuido = dfRuido[selectionRuido]
dfSelectionRuido

Unnamed: 0,time_spend_company,Work_accident,left,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,...,randomCol11,randomCol12,randomCol13,randomCol14,randomCol15,randomCol16,randomCol17,randomCol18,randomCol19,randomCol20
0,3,0,1,0,0,0,0,0,0,0,...,0.045001,-2.110347,0.581363,-0.935612,0.233159,-0.231303,0.521065,-1.008143,1.521878,-0.488156
1,6,0,1,0,0,0,0,0,0,0,...,-1.167138,0.601079,-0.122554,0.326476,-2.987942,-0.468813,-1.481487,1.556378,-1.359711,-0.913871
2,4,0,1,0,0,0,0,0,0,0,...,-0.863938,0.737363,-0.067015,0.207785,0.819725,-0.288270,-1.098499,0.587468,0.260434,-0.419024
3,5,0,1,0,0,0,0,0,0,0,...,0.334084,0.265990,-1.406164,1.987137,0.267195,0.759235,-0.604976,-0.223552,-0.560891,0.439037
4,3,0,1,0,0,0,0,0,0,0,...,-1.930425,-0.414509,0.690830,-0.077957,1.118692,-1.137772,-0.611787,1.406306,0.102402,-0.927350
5,3,0,1,0,0,0,0,0,0,0,...,-0.990022,0.801214,-1.187905,-1.116071,-1.226753,1.188184,-0.632546,-0.339986,-1.386551,0.831998
6,4,0,1,0,0,0,0,0,0,0,...,1.155562,-0.756065,1.083009,-0.029016,0.412685,-1.179648,-0.382941,0.191157,-0.494639,1.079249
7,5,0,1,0,0,0,0,0,0,0,...,0.105089,1.110798,-0.353353,1.157559,-2.180407,-0.931017,0.330538,1.229931,-1.544440,0.979983
8,5,0,1,0,0,0,0,0,0,0,...,1.845731,-0.369857,-0.274291,-0.496053,0.984833,0.051901,0.587057,0.558586,1.099378,-1.144652
9,3,0,1,0,0,0,0,0,0,0,...,0.654676,1.109652,-0.676762,0.255804,0.737756,-0.777393,-0.174314,-0.441199,0.490918,1.507485


In [44]:
X_trainRuido2, Y_trainRuido2, X_testRuido2, Y_testRuido2 = get_train_test(dfSelectionRuido, y_col, train_test_ratio)

In [45]:
clf = RandomForestClassifier(criterion='gini', random_state=RANDOM_SEED)
clf.fit(X_trainRuido2, Y_trainRuido2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=111, verbose=0, warm_start=False)

In [46]:
clf.feature_importances_

array([ 0.10756911,  0.02389927,  0.00112253,  0.00220566,  0.00239458,
        0.00396147,  0.00379425,  0.00160921,  0.00275277,  0.00274748,
        0.00561291,  0.00317682,  0.00445738,  0.00741667,  0.01622723,
        0.00535977,  0.03614683,  0.04100863,  0.04036429,  0.0415931 ,
        0.04285339,  0.03949502,  0.04354788,  0.03684807,  0.03893225,
        0.03705337,  0.03984796,  0.04133082,  0.04164046,  0.03895165,
        0.03976548,  0.04034117,  0.04271376,  0.04315434,  0.03819477,
        0.04190965])

In [47]:
importancias = clf.feature_importances_
index = np.argmax (dfSelectionRuido.columns == 'left')
importancias = np.insert (importancias, index, 1.0)

In [48]:
# pd.DataFrame([df.columns, clf.feature_importances_], columns=['name', 'importance'])
dfImpor = pd.DataFrame(
    {'name': dfSelectionRuido.columns,
     'importance': importancias
    })
dfImpor.sort_values(by=['importance'], ascending=False)

Unnamed: 0,importance,name
2,1.0,left
0,0.107569,time_spend_company
23,0.043548,randomCol7
34,0.043154,randomCol18
21,0.042853,randomCol5
33,0.042714,randomCol17
36,0.04191,randomCol20
29,0.04164,randomCol13
20,0.041593,randomCol4
28,0.041331,randomCol12


In [49]:
clf.score(X_testRuido2, Y_testRuido2)

0.75176678445229683

In [50]:
pred = clf.predict(X_testRuido2)
fpr, tpr, thresholds = metrics.roc_curve(Y_testRuido2, pred)
metrics.auc(fpr, tpr)

0.5408244098939271

In [51]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_testRuido2, pred)# labels=["ant", "bird", "cat"])

array([[3249,  230],
       [ 894,  155]])