In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN
import os

In [2]:
# Load the training data, discarding exact duplicate rows and then any row
# that still contains a missing value. The test data is loaded untouched.
train = pd.read_csv('./train.csv').drop_duplicates().dropna()
test = pd.read_csv('./test.csv')

In [3]:
# Ordinal encodings for the graded string columns (higher grade -> larger int).
# Series.map turns any value that is not a key of the mapping into NaN.
job_level = {'JG06' : 3,'JG05' : 2,'JG04' : 1,'JG03' : 0}
person_level = {'PG08' : 7,'PG07' : 6,'PG06' : 5,'PG05' : 4,'PG04' : 3,'PG03' : 2,'PG02' : 1,'PG01' : 0}
Education_level = {'level_5' : 5,'level_4' : 4,'level_3' : 3,'level_2' : 2,'level_1' : 1,'level_0' : 0}

# The numeric gender codes are turned into strings so that get_dummies, which
# only encodes object/string/category columns by default, one-hot encodes them.
gender = {1 : '1',2 : '2'}

# Reference year used to rewrite the `age` column.
tahun = 2020


def _encode_features(frame):
    """Return an encoded copy of ``frame``; the input frame is not modified.

    Steps, in order:
      1. map ``job_level``, ``person_level``, ``gender`` and
         ``Education_level`` through the module-level dictionaries;
      2. one-hot encode the remaining non-numeric columns with the prefixes
         ``type``, ``gender`` and ``married`` (the prefix list must contain
         exactly one entry per column that get_dummies encodes);
      3. replace ``age`` with ``tahun - age`` and ``year_graduated`` with
         ``year_graduated + age - tahun`` (using the new ``age``), which is
         algebraically ``year_graduated`` minus the original ``age`` value.
         NOTE(review): this presumably means the raw ``age`` column holds a
         birth year -- confirm against the data dictionary.
    """
    encoded = frame.copy()

    encoded['job_level'] = encoded['job_level'].map(job_level)
    encoded['person_level'] = encoded['person_level'].map(person_level)
    encoded['gender'] = encoded['gender'].map(gender)
    encoded['Education_level'] = encoded['Education_level'].map(Education_level)

    encoded = pd.get_dummies(encoded, prefix = ['type','gender','married'])

    encoded['age'] = tahun - encoded['age']
    encoded['year_graduated'] = encoded['year_graduated'] + encoded['age'] - tahun
    return encoded


train = _encode_features(train)
test = _encode_features(test)

# get_dummies ran on each frame independently, so a category that occurs in
# only one of them yields a column the other frame lacks. Align the test frame
# to the training feature columns (same set, same order): indicator columns
# missing from test are added as all-zero, columns unknown to train are dropped.
feature_columns = [column for column in train.columns if column != 'Best Performance']
missing_in_test = [column for column in feature_columns if column not in test.columns]
not_dummies = [
    column for column in missing_in_test
    if not str(column).startswith(('type_', 'gender_', 'married_'))
]
if not_dummies:
    # Zero-filling is only meaningful for one-hot indicators; anything else
    # missing points at a schema problem that must not be papered over.
    raise KeyError(f'test is missing non-dummy feature columns: {not_dummies}')
test = test.reindex(columns = feature_columns, fill_value = 0)

In [4]:
def _impute_invalid_gpa(features, n_neighbors = 7, lowest = 2.5, highest = 4):
    """Return the ``GPA`` column of ``features`` with invalid values re-estimated.

    A GPA below ``lowest`` or above ``highest`` (this includes 0) is treated
    as missing and filled in by a ``KNNImputer`` fitted on all columns of
    ``features``. The input frame is not modified.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns only (no target); must contain a ``GPA`` column.
    n_neighbors : int
        Number of neighbours used by the imputer.
    lowest, highest : float
        Inclusive bounds of the accepted GPA range.

    Returns
    -------
    numpy.ndarray
        One value per row of ``features``, in row order.
    """
    work = features.copy()
    gpa = work['GPA']
    work.loc[(gpa < lowest) | (gpa > highest), 'GPA'] = np.nan

    # By default KNNImputer removes columns that contain no observed value, so
    # the GPA position is looked up by name among the columns that survive
    # instead of relying on a fixed column number.
    surviving = work.columns[work.notna().any(axis = 0)]
    imputed = KNNImputer(n_neighbors = n_neighbors).fit_transform(work.to_numpy())
    return imputed[:, surviving.get_loc('GPA')]


train = train.sort_values('Best Performance').reset_index(drop = True)

# The target is excluded so it cannot influence the imputed feature values.
train['GPA'] = _impute_invalid_gpa(train.drop(columns = 'Best Performance'))

# NOTE(review): as before, the test set is imputed from its own rows rather
# than with an imputer fitted on the training data.
test['GPA'] = _impute_invalid_gpa(test)

In [9]:
# Split the training frame into the target column and the feature matrix.
y = train['Best Performance']
X = train.drop('Best Performance', axis = 1)

# Class sizes before any resampling: label 0 is reported as "average"
# performance and label 1 as "best" performance.
class_counts = y.value_counts()
average, best = class_counts[0], class_counts[1]

print('Jumlah karyawan average performance mula-mula :',average)
print('Jumlah karyawan best performance mula-mula    :',best)

Jumlah karyawan average performance mula-mula : 9515
Jumlah karyawan best performance mula-mula    : 1637


In [10]:
# Disabled experiment, kept for reference: repeatedly resample (X, y) with
# SMOTEENN until the ratio of class 0 to class 1 lies within 0.95-1.05, then
# print the class sizes after resampling.
# NOTE(review): if this is re-enabled, the resampling happens before the
# cross-validation cell, so resampled rows would also end up in the validation
# folds -- consider resampling inside each training fold instead.
# ratio = average/best

# while (ratio <= 0.95)|(ratio >= 1.05):
#     X, y = SMOTEENN(random_state = 42).fit_resample(X,y)
#     average = y.value_counts()[0]
#     best = y.value_counts()[1]
#     ratio = average/best

# print('Jumlah karyawan average performance setelah resampling :',average)
# print('Jumlah karyawan best performance setelah resampling    :',best)

In [None]:
# Random forest with a fixed seed so repeated runs give the same model.
clf = RandomForestClassifier(random_state = 42)

# Cross-validated ROC AUC using cross_val_score's default splitting.
scores = cross_val_score(estimator = clf, X = X, y = y, scoring = 'roc_auc')
mean_auc = scores.mean()
std_auc = scores.std()
print('Rerata skor ROC AUC',mean_auc,'dengan simpangan baku',std_auc)

In [8]:
# Fit the classifier on the complete training set.
model = clf.fit(X,y)

# Second column of predict_proba, i.e. the probability of the class stored at
# clf.classes_[1], for every row of the test set.
best_probability = model.predict_proba(test)[:,1]

# Write the probabilities with a 0-based running row number labelled 'index'.
submission = pd.Series(best_probability,name = 'Best Performance')
submission.to_csv('submission.csv',index_label = 'index')