In [3]:
import pandas as pd
import numpy as np
from feature_engine import imputation as mdi
from feature_engine.encoding import CountFrequencyEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

In [48]:
df = pd.read_csv('z_train.csv')

In [49]:
df.drop(columns= ['enrollee_id'], inplace=True)

In [50]:
df.isnull().sum()

city                         0
city_development_index       0
gender                    3601
relevent_experience          0
enrolled_university        302
education_level            369
major_discipline          2237
experience                  50
company_size              4734
company_type              4891
last_new_job               339
training_hours               0
target                       0
dtype: int64

In [87]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='target'), df.target, train_size=0.8)

In [88]:
undersampler = RandomUnderSampler()
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# Nulos

In [89]:
X_train_under['last_new_job']=X_train_under.last_new_job.str.replace('never', '0').str.replace('>4', '5')
X_train_under['last_new_job']=pd.to_numeric(X_train_under['last_new_job'])
X_train_under['experience']=pd.to_numeric(X_train_under.experience.str.replace('>20', '21').str.replace('<1','0'))

In [90]:
categorico_nulos=mdi.CategoricalImputer('frequent', variables = ['gender','enrolled_university', 'education_level', 'major_discipline', 'company_size', 'company_type'])
X_train_under = categorico_nulos.fit_transform(X_train_under)

In [91]:
num_nulos=mdi.MeanMedianImputer('median', variables = ['city_development_index', 'experience', 'last_new_job', 'training_hours'])
X_train_under = num_nulos.fit_transform(X_train_under)

In [92]:
X_train_under.isnull().sum()

city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64

# Categoricas a numericas

In [93]:
X_train_under.gender.unique()

array(['Male', 'Other', 'Female'], dtype=object)

In [94]:
X_train_under.nunique()

city                      115
city_development_index     88
gender                      3
relevent_experience         2
enrolled_university         3
education_level             5
major_discipline            6
experience                 22
company_size                8
company_type                6
last_new_job                6
training_hours            240
dtype: int64

In [95]:
relevent_experience = preprocessing.OrdinalEncoder(categories = [['No relevent experience', 'Has relevent experience']], handle_unknown='use_encoded_value', unknown_value = -1)
X_train_under['relevent_experience']=relevent_experience.fit_transform(X_train_under[['relevent_experience']])

In [96]:
education_level = preprocessing.OrdinalEncoder(categories = [['Primary School','High School', 'Graduate', 'Masters', 'Phd']])
X_train_under['education_level']=education_level.fit_transform(X_train_under[['education_level']])

In [97]:
X_train_under[['experience', 'last_new_job']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6118 entries, 0 to 6117
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   experience    6118 non-null   float64
 1   last_new_job  6118 non-null   float64
dtypes: float64(2)
memory usage: 95.7 KB


In [98]:
categoricas= CountFrequencyEncoder(encoding_method = 'frequency',variables = ['city', 'gender', 'enrolled_university', 'major_discipline','company_size', 'company_type'] )
X_train_under = categoricas.fit_transform(X_train_under)

In [99]:
X_train_under.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,0.066852,0.91,0.924649,1.0,0.704152,2.0,0.907976,16.0,0.519451,0.841615,0.0,13
1,0.005884,0.896,0.924649,1.0,0.704152,2.0,0.907976,7.0,0.061131,0.04642,2.0,30
2,0.066852,0.91,0.924649,1.0,0.704152,1.0,0.907976,15.0,0.078294,0.841615,1.0,6
3,0.211834,0.624,0.924649,1.0,0.231448,2.0,0.907976,16.0,0.119974,0.841615,1.0,105
4,0.066852,0.91,0.924649,1.0,0.704152,2.0,0.907976,11.0,0.519451,0.841615,5.0,17


TEST

In [100]:
X_test['last_new_job']=X_test.last_new_job.str.replace('never', '0').str.replace('>4', '5')
X_test['last_new_job']=pd.to_numeric(X_test['last_new_job'])
X_test['experience']=pd.to_numeric(X_test.experience.str.replace('>20', '21').str.replace('<1','0'))

In [101]:
X_test=categorico_nulos.fit_transform(X_test)

In [102]:
X_test = num_nulos.fit_transform(X_test)

In [103]:
X_test['relevent_experience']=relevent_experience.fit_transform(X_test[['relevent_experience']])
X_test['education_level']=education_level.fit_transform(X_test[['education_level']])

In [104]:
X_test = categoricas.fit_transform(X_test)

In [105]:
X_test.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
11914,0.015329,0.939,0.067515,1.0,0.741683,2.0,0.021853,9.0,0.473581,0.833986,1.0,51
1154,0.001305,0.698,0.923353,1.0,0.195695,1.0,0.897586,6.0,0.072407,0.833986,1.0,78
1597,0.015982,0.804,0.923353,1.0,0.741683,2.0,0.021853,17.0,0.473581,0.833986,1.0,73
1279,0.040444,0.92,0.923353,0.0,0.741683,0.0,0.897586,0.0,0.07045,0.048271,2.0,85
2202,0.138617,0.624,0.923353,1.0,0.195695,2.0,0.897586,1.0,0.473581,0.833986,1.0,7


# MODELO

In [106]:
from sklearn.model_selection import GridSearchCV

In [178]:
forest_grid = RandomForestClassifier()
params = {'n_estimators':[10,25,30, 50,75, 100], 'max_depth':[3,6,8,9,10, 15,20]}
gridForest = GridSearchCV(forest_grid, params)

In [179]:
gridForest.fit(X_train_under,y_train_under)

In [180]:
gridForest.best_estimator_

In [181]:
forest = RandomForestClassifier(max_depth=9, n_estimators=50)

In [182]:
forest.fit(X_train_under,y_train_under)

In [183]:
print(forest.score(X_train_under,y_train_under))
print(forest.score(X_test,y_test))

0.7873488067996077
0.7452707110241357


In [128]:
forest.score(X_train_under,y_train_under)

0.7701863354037267

In [129]:
forest.score(X_test,y_test)

0.7449445531637312

# Z_test

In [184]:
z_test = pd.read_csv('z_test.csv')

In [185]:
identificación = z_test[['enrollee_id']]

In [186]:
z_test.drop(columns= 'enrollee_id', inplace=True)

In [187]:
z_test['last_new_job']=z_test.last_new_job.str.replace('never', '0').str.replace('>4', '5')
z_test['last_new_job']=pd.to_numeric(z_test['last_new_job'])
z_test['experience']=pd.to_numeric(z_test.experience.str.replace('>20', '21').str.replace('<1','0'))

In [188]:
z_test=categorico_nulos.fit_transform(z_test)
z_test = num_nulos.fit_transform(z_test)

In [189]:
z_test['relevent_experience']=relevent_experience.fit_transform(z_test[['relevent_experience']])
z_test['education_level']=education_level.fit_transform(z_test[['education_level']])

In [190]:
z_test = categoricas.fit_transform(z_test)

In [191]:
z_pred = pd.DataFrame(forest.predict_proba(z_test))

In [192]:
submission11=pd.concat((identificación,z_pred[1]), axis=1)
submission11.head()

Unnamed: 0,enrollee_id,1
0,23603,0.75152
1,22499,0.424476
2,10465,0.669959
3,8293,0.194782
4,4246,0.4293


In [193]:
submission11.rename(columns = {1:'target'}, inplace=True)
submission11.to_csv('submission_11.csv',index=False)

In [194]:
submission11

Unnamed: 0,enrollee_id,target
0,23603,0.751520
1,22499,0.424476
2,10465,0.669959
3,8293,0.194782
4,4246,0.429300
...,...,...
3827,8880,0.726020
3828,7886,0.417755
3829,12279,0.529578
3830,5326,0.291069
