In [1]:
import sys
import numpy as np
import pandas as pd

# Extend the import search path with the parent directory; the `utils.*`
# imports in later cells are presumably resolved from there — confirm layout.
sys.path.append('..')

In [2]:
from utils.data_loading import load_users_data

train_users, test_users = load_users_data()

# Split the target off the training frame so train and test share columns.
labels = train_users['country_destination'].values
train_users = train_users.drop(['country_destination'], axis=1)

# Remember the test ids (needed for the submission) and the number of
# training rows (needed to split the combined frame again later).
id_test = test_users['id']
piv_train = train_users.shape[0]

# Stack train on top of test so every transformation is applied to both.
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index.
users = pd.concat((train_users, test_users), axis=0, ignore_index=True)
users = users.drop(['id', 'date_first_booking'], axis=1)

# Turn the '-unknown-' placeholder into a real missing value.
# The result is assigned back to the column on purpose: calling
# `users[col].replace(..., inplace=True)` mutates a temporary Series, which
# pandas warns about and which leaves `users` untouched under Copy-on-Write.
users['gender'] = users['gender'].replace('-unknown-', np.nan)
users['language'] = users['language'].replace('-unknown-', np.nan)

In [3]:
import datetime

# Account creation date: parse once, then expose year / month / day as
# separate integer columns and discard the raw column.
users['date_account_created'] = pd.to_datetime(users['date_account_created'])
users['year_account_created'] = users['date_account_created'].dt.year
users['month_account_created'] = users['date_account_created'].dt.month
users['day_account_created'] = users['date_account_created'].dt.day
users = users.drop(columns=['date_account_created'])

# First-activity timestamp: stored as a YYYYMMDDHHMMSS value, so it needs an
# explicit format; only the calendar date parts are kept.
users['timestamp_first_active'] = pd.to_datetime(
    users['timestamp_first_active'],
    format='%Y%m%d%H%M%S',
)
users['year_first_active'] = users['timestamp_first_active'].dt.year
users['month_first_active'] = users['timestamp_first_active'].dt.month
users['day_first_active'] = users['timestamp_first_active'].dt.day
users = users.drop(columns=['timestamp_first_active'])

In [4]:
from utils.preprocessing import one_hot_encoding

# Columns handed to the project's one-hot encoding helper, in this order.
# NOTE(review): 'gender' is listed here, yet later cells still read
# `users.gender`, impute it and one-hot encode it again — confirm whether the
# helper keeps the source column and whether encoding it here is intended.
categorical_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

users = one_hot_encoding(users, categorical_features)

In [5]:
# Age: keep only values inside the closed range [14, 100]; anything outside
# it (and anything already missing) ends up as NaN.
age_values = users['age'].values
users['age'] = np.where((age_values >= 14) & (age_values <= 100), age_values, np.nan)

In [None]:
from utils.preprocessing import input_missing_values

# Pass the 'age' column through the project's missing-value helper and keep
# the frame it returns.
# NOTE(review): the helper's strategy is not visible from this notebook. The
# following cells still look for null ages, so presumably it does not fill
# every one of them — confirm.
users = input_missing_values(users, 'age')

In [None]:
# Boolean mask of the rows whose age is missing; the next cells fit on the
# complement (~age_mask) and predict for the rows selected by this mask.
age_mask = users['age'].isnull()

In [None]:
from sklearn.svm import SVR

# Support-vector regressor (default hyper-parameters) trained on the rows
# with a known age: every other column is a feature, 'age' is the target.
clf = SVR()

clf.fit(
    X=users.loc[~age_mask].drop(columns='age'),
    y=users.loc[~age_mask, 'age'],
)

In [None]:
# Fill the missing ages with the regressor's predictions, rounded to whole
# years. scikit-learn estimators reject a 0-row input, so the prediction is
# skipped when no age is missing (for example if an earlier step already
# filled them all) instead of failing the cell.
if age_mask.any():
    predicted_age = clf.predict(users[age_mask].drop('age', axis=1))
    users.loc[age_mask, 'age'] = np.around(predicted_age)
else:
    # Keep the name defined so the cell is safe to re-run and inspect.
    predicted_age = np.empty(0)

In [None]:
from sklearn.svm import SVC

# Support-vector classifier (default hyper-parameters) for the gender column:
# features are all remaining columns, target is 'gender'.
# NOTE(review): only the first 5 labelled rows are used for fitting, which
# looks like a debugging subsample — confirm before relying on the result.
clf = SVC()

clf.fit(
    X=users.loc[users['gender'].notnull()].drop(columns='gender').head(5),
    y=users.loc[users['gender'].notnull(), 'gender'].head(5),
)

In [None]:
# Predicted gender for every row whose gender is missing (features = all
# columns except 'gender').
b = clf.predict(users.loc[users['gender'].isnull()].drop(columns='gender'))

In [None]:
# Write the predictions into the rows whose gender is missing.
users.loc[users['gender'].isnull(), 'gender'] = b

In [None]:
# Swap the 'gender' column for its indicator columns ('gender_<value>'),
# appended at the right-hand side of the frame.
users_dummy = pd.get_dummies(users['gender'], prefix='gender')
users = pd.concat([users.drop(columns=['gender']), users_dummy], axis=1)

In [None]:
# Data loading
# NOTE(review): this re-reads the raw CSVs and rebinds train_users,
# test_users, labels, id_test and piv_train, which an earlier cell already
# set from load_users_data() — confirm both sources yield the same rows.
train_users = pd.read_csv('../datasets/raw/train_users.csv')
test_users = pd.read_csv('../datasets/raw/test_users.csv')

# Target values, then the training frame without the target column.
labels = train_users['country_destination'].to_numpy()
train_users = train_users.drop(columns=['country_destination'])

# Test ids for the submission and the number of training rows.
id_test = test_users['id']
piv_train = len(train_users)

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

# Splitting train and test: the first `piv_train` rows of the combined frame
# are the training users, the remainder are the test users.
values = users.values
X = values[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = values[piv_train:]

# Classifier
xgb = XGBClassifier(
    max_depth=10,
    learning_rate=0.22,
    n_estimators=100,
    objective="multi:softprob",
    nthread=-1,
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.6,
    colsample_bytree=0.6,
    colsample_bylevel=1,
    reg_alpha=0.1,
    reg_lambda=0.9,
    scale_pos_weight=1,
    base_score=0.5,
    seed=42
)

xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# Every prediction row must belong to exactly one test id; fail loudly if the
# ids and the feature matrix were loaded from sources that disagree.
assert len(id_test) == len(y_pred), (
    "id_test has %d rows but predictions have %d" % (len(id_test), len(y_pred))
)

# Taking the 5 classes with highest probabilities.
# Sort class indices by descending probability for all rows at once and keep
# the leading columns (fewer than 5 if there are fewer classes).
top5 = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

# Repeat each id once per kept class. The ids are taken positionally, so this
# does not depend on `id_test` carrying a default 0..n-1 index.
ids = np.repeat(id_test.to_numpy(), top5.shape[1]).tolist()
cts = le.inverse_transform(top5.ravel()).tolist()

# Generate Submission: one (id, country) row per kept class, best first.
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('xgboost.csv',index=False)

In [None]:
# Score of the fitted classifier on the very rows it was trained on (X, y),
# so this is not a held-out estimate of generalisation.
xgb.score(X, y)