<h1>Loading packages and data</h1>

In [62]:
# Notebook configuration: enable the autoreload extension in mode 2 and
# render matplotlib figures inline in the cell output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# Install the unidecode package, which is imported below and used by processText.
!pip install unidecode



In [0]:
# Load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from unidecode import unidecode

from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn import metrics
from xgboost import XGBClassifier
import mlxtend as mlx

In [65]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
# Load data
# Both CSV files live in the same project folder on the mounted Drive.
root_path = '/content/drive/My Drive/Kalapa/'
train_path = root_path + 'train.csv'
test_path = root_path + 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
# Concat train and test data for preprocessing
# Each frame is tagged with its origin so the two can be separated again later.
for frame, origin in ((train, 'train'), (test, 'test')):
  frame['type'] = origin
df = pd.concat([train, test], sort=False).set_index('id')
# Put the 'type' marker first; every other column keeps its relative order.
other_columns = [col for col in df.columns if col != 'type']
df = df[['type'] + other_columns]

<h1>Data preprocessing</h1><br>

In [0]:
# Process missing values
# These textual placeholders are turned into real NaN values across the whole frame.
missing_markers = ['None', 'na', 'nan', 'undefined']
df = df.replace(missing_markers, np.nan)

Processing text features
- keep only text values; values consisting solely of digits are treated as missing values
- collapse repeated whitespace into single spaces
- convert text to lowercase
- remove Vietnamese diacritics and marks (transliterate to plain ASCII)


In [0]:
# province
def processText(f):
  """Normalise a text column and return the cleaned Series.

  Steps, in order:
    1. every value is stringified; digit-only strings become NaN, all other
       strings get their whitespace runs collapsed to single spaces;
    2. the text is lower-cased;
    3. every value is stringified again and passed through unidecode.

  Note: because of the str() call in step 3 (and in step 1), missing values
  do not stay NaN - they come out of this function as text.
  """
  def squeeze(el):
    text = str(el)
    if text.isdigit():
      return np.nan
    return " ".join(text.split())

  lowered = f.map(squeeze).str.lower()
  return lowered.map(lambda el: unidecode(str(el)))

df['province'] = processText(df['province'])

In [0]:
# district
# The district clean-up below is commented out; the column is simply dropped.
#df.district.replace({'vĩnh thuận': 'Huyện Vĩnh Thuận', 'Đông Hải': 'Huyện Đông Hải'}, inplace=True, regex=True)
#df.district = processText(df.district)
#df.replace({'da krong': 'dakrong', 'qui nhon': 'quy nhon'}, inplace=True, regex=True)
df = df.drop(columns=['district'])

In [0]:
# Function to check if string contains element from a list
def checkContains(str, list):
  """Return True when at least one entry of `list` occurs inside `str`.

  An empty `list` yields False.
  """
  return any(el in str for el in list)

# maCv
# Ordered (category, keywords) rules. A job title receives the category of the
# FIRST rule that has a keyword occurring in it, so the order of the rules matters.
# NOTE(review): 'bi tư' and 'co khí' still carry diacritics; since the titles go
# through unidecode first they presumably never match - confirm before removing.
maCv_rules = [
  ('dau bep', ['bep', 'nau an']),
  ('truyen thanh', ['bien kich', 'truyen thanh', 'phat thanh', 'truyen hinh', 'dien vien', 'phong vien', 'san xuat noi dung']),
  ('giao duc', ['giao vien', 'giang vien', 'gioo vion', 'gv', 'hieu pho', 'hieu truong', 'nghien cuu vien', 'chuyen mon']),
  ('y te', ['y te', 'ho sinh', 'y sy', 'y si', 'y ta', 'dieu duong', 'bac si', 'bac sy', 'bs', 'cap duong', 'duoc', 'ho ly',
            'bao mau', 'cham soc', 'co nuoi', 'truong tram', 'truong khoa']),
  ('bao ve', ['bao ve', 'b. ve', 'b.ve']),
  ('tai xe', ['xe', 'lai']),
  ('can bo', ['tram truong', 'thuyen truong', 'truong phong', 'doi truong', 'cong an', 'truong bo phan', 'bien che', 'ho tich',
              'quan su', 'chi huy', 'ca', 'doi truong', 'truong tt', 'pho', 'bi thu', 'ban kiem soat', 'bi tư', 'vien chuc',
              'btd', 'can bo', 'can su', 'cb', 'chi huy', 'cong chuc', 'chu tich', 'chuyen vien', 'dia chinh', 'dieu hanh', 'gd',
              'giam dinh', 'gia?m sa?t', 'giam doc', 'gdoc', 'quan trac', 'giam sat', 'kiem', 'ks', 'luu tru vien', 'quan li',
              'quan ly', 'qu?n ly', 'ql', 'quan tri', 'quan doc', 'supervisor', 'si quan', 'executive', 'thanh tra', 'thue']),
  ('cong nhan', ['nong', 'coong nhaon', 'cung nhon', 'c.n', 'c.nhan', 'cn', 'con gnhan', 'cong', 'xep', 'cat', 'keo', 'lap rap', 'lo',
                 'chuyen', 'det', 'may', 'dien', 'dong', 'ep', 'khai thac', 'lao dong', 'ld', 'cao su', 'mai', 'moc', 'san xuat',
                 'phu viec', 'theu', 'tho', 'thuy thu', 'cao mu', 'thuyen vien', 'vai', 'nghien', 'van hanh', 'trung cap', 'go',
                 'sat', 'khuon', 'co khí', 'bao tri', 'kho', 'xuong', 'dan']),
  ('nhan vien', ['nv', 'ban hang', 'dich vu', 'vien', 'giao hang', 'giao nhan', 'kinh doanh', 'trainee', 'thuc tap sinh', 'chat luong',
                 'nhaon vieon', 'nhon vion', 'operator', 'pha che', 'phuc vu', 'assistant', 'ky su', 'kien truc su', 'tai chinh',
                 'ke toan', 'ky thuat', 'kt', 'k? thu?t', 'tap vu', 'thiet ke', 'thu ky', 'thu ngan', 'thong ke', 'khach hang',
                 'tiep thi', 'tin dung', 'tong dai', 'tro ly', 'tu van', 'van thu', 'van phong', 've sinh', 'thu kho', 'thu quy',
                 'kiem tra', 'tiep tan', 'kcs', 'to', 'cua hang', 'quay']),
]

def categorizeMaCv(value):
  """Map one normalised job title to its category.

  Returns the category of the first matching rule in `maCv_rules`, or NaN when
  the value is not a string or no keyword occurs in it.
  """
  if not isinstance(value, str):
    return np.nan
  for category, keywords in maCv_rules:
    if checkContains(value, keywords):
      return category
  return np.nan

# Map element-wise instead of writing through df.loc[i, ...] with a positional
# counter: the frame is indexed by 'id', so positional counters are only valid
# labels when the ids happen to be 0..N-1 in row order. map() is index-independent
# and also avoids one .loc write per row.
df['maCv'] = processText(df['maCv']).map(categorizeMaCv)

In [0]:
# FIELD_7 - keep unique elements and create new features
def convertStringToList(arr):
  """Parse the textual form of a list into a Python list.

  Missing input is returned as NaN, the text '[]' gives an empty list, and any
  other text is evaluated with ast.literal_eval.
  """
  is_missing = np.all(pd.isna(arr))
  if is_missing:
    return np.nan
  return [] if arr == '[]' else literal_eval(arr)

def countOccurrences(arr, el):
  """Count how many times `el` occurs in `arr`.

  Parameters:
    arr: a list (or tuple) of values, or a scalar missing marker (NaN/None).
    el:  the element to count.

  Returns:
    NaN when `arr` is a missing scalar, otherwise `arr.count(el)` - which is 0
    for an empty list or when `el` is absent.
  """
  # Lists are never "missing", not even empty ones: np.all(pd.isna([])) is True,
  # so the missing check must only be applied to non-sequence values.
  is_sequence = isinstance(arr, (list, tuple))
  if not is_sequence and np.all(pd.isna(arr)):
    return np.nan
  # count() already yields 0 when el is absent. The earlier guard
  # `len(arr) & (el in arr)` was a bitwise AND and evaluated to 0 for every
  # even-length list, dropping real occurrences.
  return arr.count(el)

# Parse the textual lists, then create one count column per distinct element.
df['FIELD_7'] = df['FIELD_7'].map(convertStringToList)
#df['FIELD7_elem_amount'] = df.FIELD_7.map(lambda el: el if np.all(pd.isna(el)) else len(el))
field7_lists = df['FIELD_7'].dropna().values
unique_values_FIELD7 = np.unique([code for codes in field7_lists for code in codes])
for code in unique_values_FIELD7:
  df['FIELD_7_' + code] = df['FIELD_7'].map(lambda codes, code=code: countOccurrences(codes, code))
# The raw list column is no longer needed once the count columns exist.
df = df.drop(columns=['FIELD_7'])

In [0]:
# FIELD_35, FIELD_41, FIELD_42, FIELD_44 - change to numeric
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on df.FIELD_x is chained in-place mutation: it is deprecated and, under pandas
# Copy-on-Write, only changes a temporary Series and leaves df untouched.
numeric_codes = {
  'FIELD_35': {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4},
  'FIELD_41': {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5},
  # NOTE(review): 'Zezo' is kept exactly as written; presumably it mirrors the
  # spelling in the raw data - confirm before "correcting" it.
  'FIELD_42': {'Zezo': 0, 'One': 1},
  'FIELD_44': {'One': 1, 'Two': 2},
}
for field, codes in numeric_codes.items():
  df[field] = df[field].replace(codes)

In [0]:
# Adjust data type
df['FIELD_11'] = df['FIELD_11'].astype('float64')
# Textual booleans become real booleans. The result is assigned back instead of
# using replace(..., inplace=True) on the column, which is chained in-place
# mutation and does nothing to df under pandas Copy-on-Write.
boolean_codes = {'TRUE': True, 'FALSE': False}
for field in ('FIELD_36', 'FIELD_37'):
  df[field] = df[field].replace(boolean_codes)
df['FIELD_45'] = df['FIELD_45'].astype('float64')

Processing categorical features
- One-hot encoding for features with at most 10 unique values, frequency encoding otherwise
- NaN values are filled with the most frequent value

In [0]:
# List of all features having binary values
# NOTE(review): the first two columns are skipped - presumably 'type' and the
# label column; confirm against the CSV column order.
features = list(df.iloc[:, 2:].columns)
# Object-typed columns with at most two distinct values, excluding the FIELD_7
# count columns. The builtin `object` is used for the dtype comparison because
# the `np.object` alias was removed in NumPy 1.24.
bool_features = [
  f for f in features
  if df[f].dtype == object and df[f].nunique() <= 2 and 'FIELD_7' not in f
]

# Change their data type to float
def convertBinaryListToBoolean(data, true_value):
  """Encode a two-valued entry as 1/0.

  Returns NaN for missing input, 1 when `data` equals `true_value`, else 0.
  """
  if pd.isna(data):
    return np.nan
  if data == true_value:
    return 1
  return 0

# Columns whose "true" category is a text label; every other binary column
# uses the boolean True.
true_values = {'FIELD_8': 'MALE', 'FIELD_10': 'GH'}
for f in bool_features:
  true_value = true_values.get(f, True)
  encoded = df[f].map(lambda data: convertBinaryListToBoolean(data, true_value)).astype('float64')
  # Fill gaps with the most frequent value and assign the result back.
  # fillna(inplace=True) on df[f] is chained in-place mutation, which is
  # deprecated and leaves df unchanged under pandas Copy-on-Write.
  df[f] = encoded.fillna(value=encoded.mode().iloc[0])

In [0]:
# One hot encode categorical features which have max 10 unique values
# Object-typed columns with 3 to 10 distinct values, excluding the FIELD_7 count
# columns. `features` is the list built in the previous cell. The builtin
# `object` replaces `np.object`, which was removed in NumPy 1.24.
onehot_encoded_features = [
  f for f in features
  if df[f].dtype == object and 2 < df[f].nunique() <= 10 and 'FIELD_7' not in f
]
for f in onehot_encoded_features:
  # Append the indicator columns (named '<feature>_<value>') and drop the source column.
  dummies = pd.get_dummies(df[f], prefix=f)
  df = pd.concat([df, dummies], axis=1).drop(columns=[f])

In [0]:
# Frequency encode other categorical features
features = list(df.iloc[:, 2:].columns)
# Remaining object-typed columns with more than 10 distinct values. The builtin
# `object` replaces `np.object`, which was removed in NumPy 1.24.
cat_features = [f for f in features if df[f].dtype == object and df[f].nunique() > 10]
for f in cat_features:
  # Share of all rows of df that fall into each category.
  encoding = df.groupby(f).size() / len(df)
  encoded = df[f].map(encoding)
  # Missing entries get the most common frequency value. The result is assigned
  # back because fillna(inplace=True) on df[f] is chained in-place mutation and
  # does nothing to df under pandas Copy-on-Write.
  df[f] = encoded.fillna(value=encoded.mode().iloc[0])

Processing numeric features

In [0]:
# invalid age values are replaced with nan
# Assigned back to the column: replace(..., inplace=True) on df.age_source2 is
# chained in-place mutation and leaves df unchanged under pandas Copy-on-Write.
df['age_source2'] = df['age_source2'].replace(-1, np.nan)

# only keep 1 feature age
# The first matching rule wins:
#   1. both sources agree              -> that value
#   2. source 1 missing                -> source 2
#   3. source 2 missing                -> source 1
#   4. source 1 < 18 and source 2 is larger -> source 2
#   5. source 2 < 18 and source 1 is larger -> source 1
#   otherwise                          -> mean of the two sources
age1 = df['age_source1']
age2 = df['age_source2']
conditions = [
  age1 == age2,
  age1.isnull(),
  age2.isnull(),
  (age1 < 18) & (age2 > age1),
  (age2 < 18) & (age1 > age2),
]
choices = [age1, age2, age1, age2, age1]
df['age'] = np.select(conditions, choices, (age1 + age2) / 2)
df = df.drop(columns=['age_source1', 'age_source2'])

In [0]:
# NOTE(review): 29.77 is a hard-coded fill value for FIELD_50; presumably a
# mean or typical value of that column - confirm and name it as a constant.
# Results are assigned back to the columns: fillna(inplace=True) on df.COL /
# df[f] is chained in-place mutation and does nothing under pandas Copy-on-Write.
df['FIELD_50'] = df['FIELD_50'].fillna(29.77)
features = list(df.iloc[:, 2:].columns)
# Every non-object column counts as numeric. The builtin `object` replaces
# `np.object`, which was removed in NumPy 1.24.
num_features = [f for f in features if df[f].dtype != object]
for f in num_features:
  # Remaining gaps get the sentinel value -1.
  df[f] = df[f].fillna(-1)

# Modelling



In [0]:
# Split data back to train and test
train = df[df['type'] == 'train']
test = df[df['type'] == 'test']
train = train.drop('type', axis=1)
test = test.drop('type', axis=1)

In [148]:
target = train.label
# Share of rows labelled 1 among the rows labelled either 0 or 1.
n_positive = len(target[target == 1])
n_negative = len(target[target == 0])
ratio = n_positive / (n_positive + n_negative)
ratio

0.0162

In [0]:
# Random forest hyper-parameters
param_dist = {
    'n_estimators': 300,
    'max_depth': 4,
    'min_samples_split': 0.1,
    'min_samples_leaf': 0.1,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1
  }
clf_rf = RandomForestClassifier(**param_dist)
# NOTE(review): the first column of train/test is skipped as a non-feature -
# presumably it is the label column; confirm against the CSV column order.
clf_rf.fit(train.iloc[:, 1:], train.label)

# Evaluate model
#cv_rf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
#scores_rf = cross_val_score(clf_rf, train.iloc[:, 1:], train.label, scoring='roc_auc', cv=cv_rf, n_jobs=-1)
#print('Mean ROC AUC: %.3f' % np.mean(scores_rf))

# Predict
preds = clf_rf.predict_proba(test.iloc[:, 1:])
# predict_proba columns follow clf_rf.classes_, so column 0 is the probability
# of class 0. The 'label' column must carry the probability of class 1, so the
# column is looked up explicitly instead of hard-coding index 0.
positive_col = list(clf_rf.classes_).index(1)
pd.DataFrame({'id': list(test.index), 'label': preds[:, positive_col]}).to_csv(root_path + 'output_rf.csv', index=False)

In [0]:
# XGBoost hyper-parameters.
# NOTE(review): this dict is currently unused - the classifier below is built
# with library defaults because the parameterised constructor is commented out.
param_dist = {
    'learning_rate': 0.01,
    'n_estimators': 300,
    'max_depth': 4,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'gamma': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'n_jobs': -1
  }

#clf_xgb = XGBClassifier(**param_dist)
clf_xgb = XGBClassifier()
# NOTE(review): the first column of train/test is skipped as a non-feature -
# presumably it is the label column; confirm against the CSV column order.
clf_xgb.fit(train.iloc[:, 1:], train.label)

# Evaluate model
#cv_xgb = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
#scores_xgb = cross_val_score(clf_xgb, train.iloc[:, 1:], train.label, scoring='roc_auc', cv=cv_xgb, n_jobs=-1)
#print('Mean ROC AUC: %.3f' % np.mean(scores_xgb))

# Predict
preds = clf_xgb.predict_proba(test.iloc[:, 1:])
# predict_proba columns follow clf_xgb.classes_, so column 0 is the probability
# of class 0. The 'label' column must carry the probability of class 1, so the
# column is looked up explicitly instead of hard-coding index 0.
positive_col = list(clf_xgb.classes_).index(1)
pd.DataFrame({'id': list(test.index), 'label': preds[:, positive_col]}).to_csv(root_path + 'output_xgb.csv', index=False)