### Librerías que vamos a usar

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

### Creación del dataframe y exploración de datos
Fuente: https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists
Luego de crear el objeto DataFrame y asignarle el archivo que elegimos, observamos los nombres de las columnas, la cantidad y los tipos de datos que tenemos, y la cantidad de valores nulos por columna.

In [2]:
train = pd.read_csv('aug_train.csv')
train

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [3]:
train.shape

(19158, 14)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [5]:
# Cantidad de valores nulos por columna

for col in train.columns:
    print(col + ": " + str(train[col].isna().sum()))

enrollee_id: 0
city: 0
city_development_index: 0
gender: 4508
relevent_experience: 0
enrolled_university: 386
education_level: 460
major_discipline: 2813
experience: 65
company_size: 5938
company_type: 6140
last_new_job: 423
training_hours: 0
target: 0


### Imputación de valores nulos

Elegimos imputar en la mayoría de los casos con una nueva categoría ad hoc llamada "Unknown". Dado que creemos que la mayoría de las variables tienen valor predictivo para el desafío que nos planteamos, preferimos no perder observaciones y dejar que el modelo clasifique con la información desconocida.

La excepción son las columnas de "Company size", "Company type", "Experience" y "Last new job". Las primeras dos porque hay demasiados valores nulos y las otras dos porque son pocas observaciones y no creemos que haya una categoría ad hoc que valga la pena incorporar. En este caso, decidimos dropear las filas que contienen observaciones faltantes, dado que sospechamos que deben coincidir bastante (más adelante veremos que entre las cuatro columnas perdimos menos de 7.000 filas, es decir, mucho menos que la suma de ellas).

In [6]:
train.gender.value_counts()

Male      13221
Female     1238
Other       191
Name: gender, dtype: int64

In [7]:
train['gender'] = train['gender'].fillna("Unknown")
train.gender.value_counts()

Male       13221
Unknown     4508
Female      1238
Other        191
Name: gender, dtype: int64

In [8]:
train.major_discipline.value_counts()

STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64

In [9]:
train['major_discipline'] = train['major_discipline'].fillna("Unknown")
train.major_discipline.value_counts()

STEM               14492
Unknown             2813
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64

In [10]:
train['enrolled_university'] = train['enrolled_university'].fillna("Unknown")
train.enrolled_university.value_counts()

no_enrollment       13817
Full time course     3757
Part time course     1198
Unknown               386
Name: enrolled_university, dtype: int64

In [11]:
train['education_level'] = train['education_level'].fillna("Unknown")
train.education_level.value_counts()

Graduate          11598
Masters            4361
High School        2017
Unknown             460
Phd                 414
Primary School      308
Name: education_level, dtype: int64

In [12]:
# Volvemos a contar la cantidad de valores nulos por columna

for col in train.columns:
    print(col + ": " + str(train[col].isna().sum()))

enrollee_id: 0
city: 0
city_development_index: 0
gender: 0
relevent_experience: 0
enrolled_university: 0
education_level: 0
major_discipline: 0
experience: 65
company_size: 5938
company_type: 6140
last_new_job: 423
training_hours: 0
target: 0


In [13]:
# Dropeamos los valores nulos que quedaron

train = train.dropna()

In [14]:
# Vemos cuántas observaciones nos quedaron

train.shape

(12253, 14)

In [15]:
for col in train.columns:
    print(col + ": " + str(train[col].isna().sum()))

enrollee_id: 0
city: 0
city_development_index: 0
gender: 0
relevent_experience: 0
enrolled_university: 0
education_level: 0
major_discipline: 0
experience: 0
company_size: 0
company_type: 0
last_new_job: 0
training_hours: 0
target: 0


### Preprocesamiento de datos

Para empezar a trabajar con los datos para nuestros modelos, separamos las variables categóricas de las numéricas y usamos OneHotEncoder para trabajar con las primeras.

In [16]:
# Separamos las variables en categóricas y numéricas

categoricas = [c for c in train.columns if train[c].dtypes == 'object']
print('Categorías: ', categoricas)

numericas = [c for c in train.columns if c not in categoricas]
print('Numericas: ', numericas)

Categorías:  ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']
Numericas:  ['enrollee_id', 'city_development_index', 'training_hours', 'target']


In [17]:
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
6,28806,city_160,0.92,Male,Has relevent experience,no_enrollment,High School,Unknown,5,50-99,Funded Startup,1,24,0.0
7,402,city_46,0.762,Male,Has relevent experience,no_enrollment,Graduate,STEM,13,<10,Pvt Ltd,>4,18,1.0
8,27107,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,7,50-99,Pvt Ltd,1,46,1.0


### Creacion de Pipelines para variables numéricas y categóricas

Vamos a crear un pipeline para cada tipo de variable.

In [18]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

#Creamos la clase columselector para selecionar las columnas de las variables
# en el pipeline
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(X[self.columns])
        else:
            raise TypeError("Este Transformador solo funciona en DF de Pandas")
    
    def fit(self, X, *_):
        return self

In [19]:
#Creamos el pipeline para selecinonar las columnas numericas y estandarisarlas.
numericas_pipe = make_pipeline(ColumnSelector(['city_development_index', 'training_hours']),StandardScaler())
numericas_pipe.fit_transform(train)

array([[-0.50681136, -0.30869686],
       [-0.58171247, -0.95229837],
       [ 0.69160642, -0.68825672],
       ...,
       [ 0.69160642, -0.67175412],
       [ 0.69160642, -0.35820467],
       [-0.29043037,  0.51643329]])

In [None]:
#Creamos el pipeline para selecinonar las columnas categoricas y usar onehotencoder.



In [20]:
categoricas_pipe = make_pipeline(ColumnSelector(categoricas),
                                 OneHotEncoder(sparse=False,handle_unknown='ignore'))
categoricas_pipe.fit_transform(train)

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# Usamos make_union para unir los pipelines
union = make_union(numericas_pipe, categoricas_pipe)

union.fit_transform(train.head())

array([[-0.7120158 ,  1.188111  ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         1.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ],
       [-0.83292415, -1.33016775,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 1.2225177 , -0.29702775,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
         1.        ,  0.    

In [22]:
# Idetificamos las variables del modelo 
X= train.drop(columns=['target','enrollee_id'])
y= train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=30, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8577, 12), (3676, 12), (8577,), (3676,))

In [23]:
y.value_counts()

0.0    10091
1.0     2162
Name: target, dtype: int64

In [24]:
pasos = [('union', union), ('knn', KNeighborsClassifier())]
pipe = Pipeline(pasos)
pipe.set_params( knn__n_neighbors=4)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [25]:
 pipe.fit(X_train,y_train)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [26]:
pipe.score(X_test,y_test)

0.8354189336235038

In [27]:
accuracy_score(pipe.predict(X_test), y_test)

0.8354189336235038

In [28]:
folds = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)

In [29]:
param_grid = {'knn__n_neighbors':range(2,20,2),'knn__weights':['uniform','distance']}

In [30]:
grid = GridSearchCV(pipe, param_grid, cv=folds)
grid.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('union',
                                        FeatureUnion(transformer_list=[('pipeline-1',
                                                                        Pipeline(steps=[('columnselector',
                                                                                         ColumnSelector(columns=['city_development_index',
                                                                                                                 'training_hours'])),
                                                                                        ('standardscaler',
                                                                                         StandardScaler())])),
                                                                       ('pipeline-2',
                                                                        Pipeline(steps=[('columnselector'

In [31]:
grid.best_score_

0.8440006931843665

In [32]:
grid.best_estimator_

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [33]:
print(classification_report(grid.best_estimator_.predict(X_test), y_test))

              precision    recall  f1-score   support

         0.0       0.93      0.88      0.91      3204
         1.0       0.41      0.56      0.47       472

    accuracy                           0.84      3676
   macro avg       0.67      0.72      0.69      3676
weighted avg       0.86      0.84      0.85      3676



In [34]:
conf_mat = confusion_matrix(y_test, grid.best_estimator_.predict(X_test))

print('Confusion matrix\n\n', conf_mat)

Confusion matrix

 [[2821  206]
 [ 383  266]]


In [35]:
#PIPE REGRESION LOGISTICA

In [36]:
pasos_RL = [('union', union),('clasificador', LogisticRegression(solver='liblinear'))]

In [37]:
pipe_RL = Pipeline(pasos_RL)

In [38]:
pipe_RL.set_params( clasificador__C=0.01 )

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [39]:
pipe_RL.fit(X_train,y_train)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [40]:
pipe_RL.score(X_test,y_test)

0.8416757344940152

In [41]:
accuracy_score(pipe_RL.predict(X_test), y_test)

0.8416757344940152

In [42]:
param_grid_RL = {'clasificador__penalty':['l1', 'l2'],'clasificador__C': [0.01, 0.1, 1, 10, 100] }

In [43]:
grid_RL = GridSearchCV(pipe_RL, param_grid_RL, cv=folds)
grid_RL.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('union',
                                        FeatureUnion(transformer_list=[('pipeline-1',
                                                                        Pipeline(steps=[('columnselector',
                                                                                         ColumnSelector(columns=['city_development_index',
                                                                                                                 'training_hours'])),
                                                                                        ('standardscaler',
                                                                                         StandardScaler())])),
                                                                       ('pipeline-2',
                                                                        Pipeline(steps=[('columnselector'

In [44]:
grid_RL.best_score_

0.8515782177006667

In [45]:
grid_RL.best_estimator_

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [46]:
print(classification_report(grid_RL.best_estimator_.predict(X_test), y_test))

              precision    recall  f1-score   support

         0.0       0.92      0.89      0.91      3118
         1.0       0.49      0.58      0.53       558

    accuracy                           0.85      3676
   macro avg       0.71      0.74      0.72      3676
weighted avg       0.86      0.85      0.85      3676



In [47]:
conf_mat = confusion_matrix(y_test, grid_RL.best_estimator_.predict(X_test))

print('Confusion matrix\n\n', conf_mat)

Confusion matrix

 [[2790  237]
 [ 328  321]]


In [48]:
#PIPE NAIVE BAYES

In [49]:
pasos_GNB = [('union', union),('gnb', GaussianNB())]

In [50]:
pipe_GNB = Pipeline(pasos_GNB)

In [51]:
# GNB no tiene hiperparametros, por lo tanto no seteo parametros ni hago gridsearch, salvo que aca pongamos
# la estandarizacion de variables numericas

In [52]:
pipe_GNB.fit(X_train,y_train)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city_development_index',
                                                                                          'training_hours'])),
                                                                 ('standardscaler',
                                                                  StandardScaler())])),
                                                ('pipeline-2',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['city',
                                                                                          'gender',
                                                                                   

In [53]:
pipe_GNB.score(X_test,y_test)

0.2176278563656148

In [54]:
accuracy_score(pipe_GNB.predict(X_test), y_test)

0.2176278563656148

In [55]:
print(classification_report(pipe_GNB.predict(X_test), y_test))

              precision    recall  f1-score   support

         0.0       0.06      0.86      0.11       209
         1.0       0.96      0.18      0.30      3467

    accuracy                           0.22      3676
   macro avg       0.51      0.52      0.21      3676
weighted avg       0.90      0.22      0.29      3676



In [56]:
conf_mat = confusion_matrix(y_test, pipe_GNB.predict(X_test))

print('Confusion matrix\n\n', conf_mat)

Confusion matrix

 [[ 180 2847]
 [  29  620]]
