In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from datetime import datetime

# Data prep

In [14]:
# Load the training users table; the three date-like columns are parsed on read.
date_columns = ['date_account_created', 'timestamp_first_active', 'date_first_booking']
train = pd.read_csv('data/train_users_2.csv', parse_dates=date_columns)
# Preview the first rows.
train.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,2009-03-19 04:32:55,NaT,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,2009-05-23 17:48:09,NaT,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,2009-06-09 23:12:47,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,2009-10-31 06:01:29,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,2009-12-08 06:11:05,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


## Format and parse

In [15]:
# Derive the calendar date of first activity as a 'YYYY-MM-DD' string.
# `pd.to_datetime` with the explicit format handles the raw YYYYMMDDHHMMSS
# encoding; the vectorised `.dt.strftime` accessor then drops the time of day
# without a per-row Python lambda.
train['timestamp_first_active_to_date'] = (
    pd.to_datetime(train['timestamp_first_active'], format='%Y%m%d%H%M%S')
    .dt.strftime('%Y-%m-%d')
)

## Impute missing values

In [16]:
# Impute missing ages with the column median.
# The result is assigned back rather than calling `fillna(..., inplace=True)`
# on `train['age']`: that form is chained in-place assignment, which emits a
# FutureWarning in pandas 2.x and leaves `train` unmodified under Copy-on-Write.
train['age'] = train['age'].fillna(train['age'].median())
# Impute missing first_affiliate_tracked values with the most frequent value.
train['first_affiliate_tracked'] = train['first_affiliate_tracked'].fillna(
    train['first_affiliate_tracked'].mode()[0]
)
# Convert the 'YYYY-MM-DD' strings back to datetimes so date accessors work later.
train['timestamp_first_active_to_date'] = pd.to_datetime(train['timestamp_first_active_to_date'])

## Encoding categorical features

In [17]:
# Convert every categorical column to integer codes.
# One LabelEncoder is fitted per column and kept in `label_encoders`, so any
# mapping -- in particular the target `country_destination` -- can be decoded
# later via `label_encoders[col].inverse_transform(...)`. Re-fitting a single
# shared encoder would discard every mapping except the last one.
categorical_columns = [
    'gender',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'country_destination',
    'signup_method',
]
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    train[column] = label_encoders[column].fit_transform(train[column])
# Keep the original name bound to the last-fitted encoder (signup_method), as before.
labelencoder_X = label_encoders['signup_method']

# Flag whether a first booking was made: 1 if date_first_booking is present, else 0.
train['first_booking_made'] = train['date_first_booking'].notna().astype(int)
# Store the first-active timestamp as an integer; 'int64' is spelled out so the
# cast does not depend on the platform's default integer width.
train['timestamp_first_active'] = train['timestamp_first_active'].astype('int64')

## Feature Engineering

In [18]:
# Month features as zero-padded 'MM' strings, using the vectorised `.dt`
# accessor instead of a per-row Python lambda.
train['month_of_first_active'] = train['timestamp_first_active_to_date'].dt.strftime('%m')
train['month_account_created'] = train['date_account_created'].dt.strftime('%m')
# Rows without a booking date are NaT; `.dt.strftime` yields a missing value
# for those, so they are mapped to the empty string, as before.
train['month_of_first_booking'] = train['date_first_booking'].dt.strftime('%m').fillna('')

## Drop incomplete features

In [22]:
# Keep only the non-date columns (encoded categoricals, age, booking flag, target).
train_without_dates = train[['gender', 'age', 'signup_method','signup_flow','language','affiliate_channel','affiliate_provider','first_affiliate_tracked','signup_app','first_device_type','first_browser','first_booking_made', 'country_destination']]
# Target: the encoded destination country.
y = train_without_dates['country_destination']
# Features: everything except the target AND `first_booking_made`.
# `first_booking_made` is derived from `date_first_booking`; in the rows shown
# by `train.head()` a missing booking date coincides with the 'NDF' destination
# class, so the flag encodes the target itself. Keeping it in X would leak the
# label into the features and inflate the cross-validation scores.
X = train_without_dates.drop(['country_destination', 'first_booking_made'], axis=1)

# Model training

In [23]:
from sklearn.model_selection import cross_val_score

## Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn defaults, scored by 5-fold cross-validation.
lr = LogisticRegression()
lr_scores = cross_val_score(estimator=lr, X=X, y=y, cv=5)

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, scored by 5-fold cross-validation.
rfc = RandomForestClassifier(random_state=0)
rfc_scores = cross_val_score(estimator=rfc, X=X, y=y, cv=5)

## Ridge Classification

In [26]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier with scikit-learn defaults, scored by 5-fold cross-validation.
ridge = RidgeClassifier()
ridge_scores = cross_val_score(estimator=ridge, X=X, y=y, cv=5)

## SGD Classification

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent, scored by 5-fold CV.
# SGD shuffles the training data between epochs, so `random_state` is fixed
# (same seed as the random forest cell) to make the scores reproducible.
sgd = SGDClassifier(max_iter=1000, random_state=0)
sgd_scores = cross_val_score(sgd, X, y, cv=5)

## PassiveAggressiveClassifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
# Passive-aggressive classifier, scored by 5-fold CV.
# The estimator shuffles the training data between epochs, so `random_state`
# is fixed (same seed as the random forest cell) to make the scores reproducible.
passive_aggressive = PassiveAggressiveClassifier(max_iter=1000, random_state=0)
passive_aggressive_scores = cross_val_score(passive_aggressive, X, y, cv=5)

# Results

## Logistic Regression

In [None]:
# Per-fold scores from the 5-fold cross-validation of the logistic regression model.
lr_scores

## Random Forest

In [None]:
# Per-fold scores from the 5-fold cross-validation of the random forest model.
rfc_scores

## Ridge Classification

In [None]:
# Per-fold scores from the 5-fold cross-validation of the ridge classifier.
ridge_scores

## SGD Classification

In [None]:
# Per-fold scores from the 5-fold cross-validation of the SGD classifier.
sgd_scores

## PassiveAggressiveClassifier

In [None]:
# Per-fold scores from the 5-fold cross-validation of the passive-aggressive classifier.
passive_aggressive_scores