In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

import xgboost as xgb

from pathlib import Path

In [87]:
train = pd.read_csv('data/train_pre_processing_nlp_5000.csv')
test = pd.read_csv('data/test_pre_processing_nlp_5000.csv')
train.info()

submit = test['id'].to_frame()
train.drop(labels=['id'], axis=1, inplace=True)
test.drop(labels=['id'], axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 5057 entries, keyword to target
dtypes: bool(21), float64(5000), int64(31), object(5)
memory usage: 292.7+ MB


In [88]:
noise_cols = ['location','len_location_cero_default', 
             'total_words_location_cero_default',
             'total_words_location_mean_default', 
              'text']
train.drop(labels=noise_cols, axis=1, inplace=True)
test.drop(labels=noise_cols, axis=1, inplace=True)

In [89]:
def prepare_df_for_fit(df):
    columns_str = ['keyword', 'text_clean', 'keyword_grouped']
    
    # Encode with LabelEncoder
    encoded_cols = df[columns_str]
    encoded_cols = encoded_cols.astype('str')
    encoded_cols = encoded_cols.apply(LabelEncoder().fit_transform)
    encoded_drop = df.drop(columns_str, axis = 1)
    encoded_df = pd.concat([encoded_drop, encoded_cols], axis = 1)
    # Drop Target column
    if 'target' in encoded_df.columns:
        encoded_df.drop(axis=1, labels=['target'], inplace=True)

    return encoded_df

In [90]:
train_X = prepare_df_for_fit(train) #train.drop(axis=1, labels=['target']) 
test_X = prepare_df_for_fit(test) 
train_Y = train['target']

## KNN

In [91]:
scaler = StandardScaler()
scaler.fit(train_X)
sc_transform = scaler.transform(train_X)
X = pd.DataFrame(sc_transform)

In [92]:
seed = 7
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, train_Y, test_size=0.20, random_state=42)

In [93]:
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=15)

In [94]:
%%time
preds = knn_model.predict(X_test)
print(roc_auc_score(y_test, preds))

0.551617873651772
CPU times: user 1min 7s, sys: 248 ms, total: 1min 7s
Wall time: 1min 9s


In [95]:
# 0.6119315757740301

## XGBoost

In [103]:
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size = 0.25, random_state = 10)

In [104]:
%%time
xgb_model = xgb.XGBClassifier(objective="reg:linear", random_state=10, colsample_bytree = 0.5, 
                             gamma = 0.1, learning_rate = 0.06, max_depth = 5, min_child_weight = 2, 
                             n_estimators = 300, reg_alpha = 0.1, seed = 123, subsample = 0.9)

xgb_model.fit(X_train, y_train)

CPU times: user 5min 30s, sys: 1.78 s, total: 5min 32s
Wall time: 1min 46s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.06, max_delta_step=0, max_depth=5,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='reg:linear', random_state=10, reg_alpha=0.1,
              reg_lambda=1, scale_pos_weight=1, seed=123, subsample=0.9,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [105]:
preds = xgb_model.predict(X_test)

In [106]:
roc_auc_score(y_test, preds)

0.7892211938911963

In [107]:
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.800945


In [108]:
# 0.7924081406705616
# ACC: 0.804990

## Voting Classifier

In [109]:
%%time
eclf1 = VotingClassifier(estimators=[
        ('knn', knn_model), ('xgb', xgb_model)], voting='hard', weights=[1,2])
eclf1 = eclf1.fit(X_train, y_train)
preds = eclf1.predict(X_test)
print(roc_auc_score(y_test, preds))

0.7892211938911963
CPU times: user 5min 26s, sys: 3.75 s, total: 5min 30s
Wall time: 2min 27s


In [80]:
# 0.7924081406705616 (5000 TF IDF)

## Submit

In [81]:
%%time
eclf1 = eclf1.fit(train_X, train_Y)
preds = eclf1.predict(test_X)
submit['target'] = pd.DataFrame(preds)
print(submit)

         id  target
0         0       1
1         2       1
2         3       1
3         9       1
4        11       1
5        12       1
6        21       0
7        22       0
8        27       0
9        29       0
10       30       0
11       35       0
12       42       0
13       43       0
14       45       0
15       46       1
16       47       0
17       51       1
18       58       0
19       60       0
20       69       0
21       70       0
22       72       0
23       75       1
24       84       0
25       87       0
26       88       0
27       90       0
28       94       0
29       99       1
...     ...     ...
3233  10756       1
3234  10757       1
3235  10758       1
3236  10761       1
3237  10762       1
3238  10773       1
3239  10778       1
3240  10781       1
3241  10791       0
3242  10792       0
3243  10796       0
3244  10797       0
3245  10801       0
3246  10804       0
3247  10806       0
3248  10807       0
3249  10816       0
3250  10820       0


In [47]:
Path("result").mkdir(parents=True, exist_ok=True)
submit.to_csv('result/submit.csv', index=False)