In [1]:
import sys
import numpy as np
import pandas as pd
sys.path.append('..')

In [2]:
from utils.unbalanced_dataset import NearMiss

In [3]:
from utils.data_loading import load_users_data

# Fetch the user tables through the project loader; the result is unpacked
# into the training frame and the test frame.
users_split = load_users_data()
train_users, test_users = users_split

In [7]:
from sklearn.datasets import make_classification

# Synthetic two-class toy problem with 10% / 90% class weights.
# NOTE: `y` is reassigned by the label-encoding step further down, so this
# toy target does not survive past that point.
toy_params = dict(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    class_sep=2,
    flip_y=0,
    random_state=10,
)
x, y = make_classification(**toy_params)

In [2]:
# Pull the target out of the training frame before stacking train and test.
labels = train_users['country_destination'].values
train_users = train_users.drop('country_destination', axis=1)

# Number of training rows: used later to cut the stacked frame back apart.
piv_train = len(train_users)
id_test = test_users['id']

# Stack train on top of test (fresh 0..n-1 index) so both get identical
# feature engineering, then discard the two columns not used as features.
users = pd.concat([train_users, test_users], axis=0, ignore_index=True)
users = users.drop(['id', 'date_first_booking'], axis=1)

In [3]:
# Replace every missing entry with the sentinel value -1.
users = users.fillna(value=-1)

In [4]:
import datetime

from sklearn.preprocessing import LabelEncoder
from utils.preprocessing import one_hot_encoding


def _expand_date_parts(frame, column, suffix, **to_datetime_kwargs):
    """Replace a date-like column with separate year/month/day columns.

    The column is parsed with ``pd.to_datetime`` (extra keyword arguments,
    e.g. ``format``, are forwarded), three columns named
    ``year_<suffix>``, ``month_<suffix>`` and ``day_<suffix>`` are appended
    to ``frame`` in that order, and a copy of ``frame`` without the original
    column is returned.

    Note: the new columns are added to ``frame`` in place before the drop.
    """
    parsed = pd.DatetimeIndex(pd.to_datetime(frame[column], **to_datetime_kwargs))
    for part in ('year', 'month', 'day'):
        frame['{}_{}'.format(part, suffix)] = getattr(parsed, part)
    return frame.drop([column], axis=1)


# Both date columns get the same treatment; only the parsing format differs.
users = _expand_date_parts(users, 'date_account_created', 'account_created')
users = _expand_date_parts(users, 'timestamp_first_active', 'first_active',
                           format='%Y%m%d%H%M%S')

# Ages outside [14, 100] are replaced by the same -1 sentinel used for
# missing values.
age_values = users.age.values
implausible_age = (age_values < 14) | (age_values > 100)
users['age'] = np.where(implausible_age, -1, age_values)

categorical_features = [
    'gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel',
    'affiliate_provider', 'first_affiliate_tracked', 'signup_app',
    'first_device_type', 'first_browser'
]

# Project helper; presumably swaps each listed column for indicator
# columns -- confirm against utils.preprocessing.
users = one_hot_encoding(users, categorical_features)

# Splitting train and test: the first `piv_train` rows are the training
# users, the remainder are the test users.
values = users.values
X = values[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = values[piv_train:]

In [None]:
# PCA is not imported in any earlier cell, so bring it into scope here;
# without this the cell raises NameError on a fresh kernel.
from sklearn.decomposition import PCA

# Keep only the first principal component.
pca = PCA(n_components=1)

In [None]:
# Fit the PCA on the training matrix X and keep the projected data.
x_vis = pca.fit_transform(X)

In [None]:
# UnderSampler is used here but never imported in the notebook.
# NOTE(review): assumed to be exported by utils.unbalanced_dataset, the
# module NearMiss is imported from -- confirm.
from utils.unbalanced_dataset import UnderSampler

# Resample the training data; fit_transform yields the new (X, y) pair.
US = UnderSampler()
usx, usy = US.fit_transform(X, y)

In [None]:
# Settings for the resampling experiments: silence logging and compute the
# size of class 1 relative to class 0 in the encoded target.
verbose = False
n_class_one = np.count_nonzero(y == 1)
n_class_zero = np.count_nonzero(y == 0)
ratio = float(n_class_one) / float(n_class_zero)

In [None]:
# Resampling with SMOTEENN (the previous comment said 'SMOTE Tomek links',
# which does not match the class actually instantiated).
# NOTE(review): SMOTEENN is never imported in the notebook; assumed to be
# exported by utils.unbalanced_dataset like NearMiss -- confirm.
from utils.unbalanced_dataset import SMOTEENN

STK = SMOTEENN(k=1)
# Keep the result: the next cell reads `stky`, which was never assigned.
# NOTE(review): assumes fit_transform returns an (X, y) pair, as the
# UnderSampler call earlier in the notebook does -- confirm.
stkx, stky = STK.fit_transform(X, y)

In [None]:
# Class counts of the labels held in `stky` (expected to be the resampled
# labels from the SMOTEENN step; verify it is assigned before running).
pd.Series(stky).value_counts()

# Grid Search

In [30]:
# Hold out the last `a` training rows for local evaluation and fit on the
# rest. This overwrites the X / y / le built during preprocessing.
a = 30000
values = users.values
X = values[:piv_train - a]

# Fit the encoder on the retained training labels only, then encode them.
train_labels = labels[:-a]
le = LabelEncoder().fit(train_labels)
y = le.transform(train_labels)

In [31]:
# The held-out block is the last `a` rows of the training portion.
X_test = values[piv_train - a:piv_train]

# Encode the held-out labels with the encoder already fitted on the training
# labels. Re-fitting here (fit_transform) would rebuild the label -> integer
# mapping from the held-out labels alone, so if any class were missing from
# this slice the integer codes would no longer match the ones the model was
# trained on. transform() raises ValueError for labels unseen in training.
y_test = le.transform(labels[-a:])

In [32]:
import xgboost

# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer the model_selection location and fall back for old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Baseline multi-class booster: 45 trees of depth 6, with 60% row and
# column subsampling per tree and a fixed seed.
xgb_model = xgboost.XGBClassifier(
    n_estimators=45,
    max_depth=6,
    objective="multi:softprob",
    nthread=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.6,
    colsample_bytree=0.6,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    seed=42
)

In [33]:
# Train the baseline booster on the reduced training split.
xgb_model.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=45, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.6)

In [34]:
# ndcg_scorer is only imported in a later cell; import it here so the cell
# also works when the notebook is run top to bottom.
from utils.metrics import ndcg_scorer

# Score the fitted model on the held-out split.
ndcg_scorer(xgb_model, X_test, y_test)

0.83687704167946408

In [35]:
# ndcg_score is only imported in a later cell; import it here so the cell
# also works when the notebook is run top to bottom.
from utils.metrics import ndcg_score

# NOTE(review): `A` is not defined in any visible cell -- presumably a
# prediction/score array for X_test; it must be defined before this runs.
ndcg_score(y_test, A)

0.78131684481478447

In [19]:
from utils.metrics import ndcg_scorer, ndcg_score

In [None]:
# Deliberately tiny search space (2 x 2 combinations); learning_rate is
# currently left out of the grid.
param_grid = {
    'max_depth': [1, 2],
    'n_estimators': [5, 10],
}

# 2-fold search over the baseline booster, single process, scored with the
# project's NDCG scorer.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=ndcg_scorer,
    cv=2,
    n_jobs=1,
    verbose=2,
)

In [None]:
# Run the search. Verbosity is already configured on the GridSearchCV
# constructor; the legacy sklearn.grid_search fit() accepts only (X, y), so
# passing verbose=2 here raised TypeError there, and with model_selection it
# would merely be forwarded to the estimator's fit().
clf.fit(X, y)

In [None]:
# Best mean cross-validated score found by the grid search.
clf.best_score_