In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
avisos = pd.read_csv('Predecir/avisos_detalles.csv')
postulantes = pd.read_csv('Predecir/postulantes_datos.csv')
postulaciones = pd.read_csv('Predecir/df_postulaciones.csv')
to_predict = pd.read_csv('Predecir/test_final_100k.csv')

In [3]:
del postulantes['Unnamed: 0']
del avisos['Unnamed: 0']
del postulaciones['Unnamed: 0']

In [4]:
avisos_si = avisos.copy()
avisos_si['se_postulo'] = 1

In [5]:
postulaciones.shape

(6604534, 2)

Muy grande para procesar, procesamos por bloques random y calculamos un promedio general

In [6]:
postulaciones = postulaciones.sample(frac=0.05)

In [7]:
postulaciones.shape

(330227, 2)

In [8]:
postulaciones = pd.merge(postulaciones, avisos_si, how='inner', on=['idaviso'])

In [9]:
postulaciones.shape

(318745, 35)

In [10]:
postulaciones = pd.merge(postulaciones, postulantes, how='inner', on=['idpostulante'])

In [11]:
postulaciones.shape

(327393, 39)

In [12]:
postulaciones.head()

Unnamed: 0,idaviso,idpostulante,tipo_de_trabajo,nivel_laboral,nombre_area,denominacion_empresa,web,crecimiento,capacitacion,clima laboral,...,junior,jefe,marketing,supervisor,online,se_postulo,nombre,estado,sexo,edad
0,1112463548,kPXk9Kj,1,1,71,457,0,0,0,0,...,0,0,0,0,1.0,1,0,1,1,23.0
1,1112463548,ak5plv5,1,1,71,457,0,0,0,0,...,0,0,0,0,1.0,1,0,2,1,25.0
2,1112400573,ak5plv5,1,1,5,464,0,0,0,0,...,0,0,0,0,0.0,1,0,2,1,25.0
3,1112463548,4rPbBZ0,1,1,71,457,0,0,0,0,...,0,0,0,0,1.0,1,1,0,1,34.0
4,1112312896,4rPbBZ0,1,1,46,1404,0,1,0,0,...,0,0,1,0,0.0,1,1,0,1,34.0


Contamos cuantos postulados = 1 tenemos para ver cuantos 0 ponemos

In [13]:
postulaciones.shape

(327393, 39)

Probamos un 50% aprox de ceros en relacion a los unos

In [14]:
len(postulaciones[postulaciones["se_postulo"]==1])

327393

In [15]:
def random_fill(row):
    return np.random.choice(avisos['idaviso'])

In [16]:
while (len(postulaciones[postulaciones["se_postulo"]==1])*0.8 > len(postulaciones[postulaciones["se_postulo"]==0])):
    relleno = postulantes.sample(frac=0.5).copy()
    relleno['idaviso'] = np.nan
    relleno['idaviso'] = relleno.apply(random_fill, axis=1)
    relleno = pd.merge(relleno, avisos, how='inner', on=['idaviso'])
    postulaciones = pd.merge(postulaciones, relleno, how='outer')
    postulaciones = postulaciones.fillna({'se_postulo' : 0})
    print(len(postulaciones[postulaciones["se_postulo"]==0]))

195783
391518


In [17]:
postulaciones.shape

(718913, 39)

In [18]:
se_postulo = postulaciones['se_postulo']
del postulaciones['se_postulo']
del postulaciones['idaviso']
del postulaciones['idpostulante']

# DecisionTree

In [19]:
decisiontree = DecisionTreeClassifier(random_state=103040)
decisiontree.get_params()

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': False,
 'random_state': 103040,
 'splitter': 'best'}

In [20]:
X_train, X_test, y_train, y_test = train_test_split(postulaciones, se_postulo, test_size=0.30, random_state=42)

In [21]:
param_grid= {'max_features': range(1, len(postulaciones.columns))}
grid_drop = GridSearchCV(decisiontree, param_grid, cv=10)
grid_drop.fit(X_train, y_train)
print(grid_drop.best_params_, grid_drop.best_score_)

{'max_features': 12} 0.711365772526


In [26]:
grid_drop.best_estimator_.feature_importances_  

array([ 0.02009573,  0.02639657,  0.11380869,  0.11444896,  0.00716822,
        0.00987085,  0.01139062,  0.00891512,  0.00978931,  0.00674881,
        0.01222908,  0.00618162,  0.00600421,  0.01050915,  0.00389797,
        0.00916675,  0.01302519,  0.01656404,  0.01308059,  0.01368132,
        0.01101651,  0.01448445,  0.01107982,  0.00718127,  0.0095607 ,
        0.00931582,  0.0183424 ,  0.00475854,  0.00430563,  0.00834404,
        0.00709898,  0.01627611,  0.08399905,  0.04950641,  0.02938439,
        0.28237306])

In [29]:
result = grid_drop.best_estimator_.predict_proba(X_test)

In [42]:
result

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.],
       ..., 
       [ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [41]:
predicciones = pd.DataFrame({'se_postulo':result}, index=[0])

Exception: Data must be 1-dimensional

In [37]:
roc_auc_score(y_test, y_test)

1.0

In [39]:
type(result)

numpy.ndarray