# MAIN
This is the master notebook for my AirBnB Recruiting Challenge work. Changes and additions are made by branching and merging.

Other possible branches:
* Explore
* Preprocessing
* Features
* Models
* Multiclass
* Imbalance
* Validation
* Ensembles
* Imputation
* Test
* PCA


## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Draw inline
%matplotlib inline

# Set figure aesthetics
sns.set_style("whitegrid") #, {'ytick.major.size': 10.0})
#sns.set_context("talk")



## Get Data

In [2]:
# Load data into DataFrames
train_users = pd.read_csv('input/train_users_2.csv')
test_users = pd.read_csv('input/test_users.csv')

# Number of training rows; in all_users the rows from this position onwards come from test_users.
piv_train = train_users.shape[0]
# Target column of the training set as a NumPy array.
labels = train_users['country_destination'].values


# Stack train on top of test (fresh 0..n-1 index) so both can be preprocessed in one pass.
all_users = pd.concat([train_users,test_users],axis=0,ignore_index=True)

## Measure: NDCG

In [3]:
from rank_metrics import ndcg_at_k
from sklearn.metrics import make_scorer

def ndcg_wrapper(y_true,y_pred_proba):
    """Mean NDCG@5 of ranked class predictions.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class codes. They are compared against column indices of
        ``y_pred_proba``, so they are assumed to be the integer codes
        0..n_classes-1 (e.g. from LabelEncoder) -- TODO confirm for new callers.
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Predicted probability per class.

    Returns
    -------
    float
        Average over samples of ``ndcg_at_k(relevance, 5, method=1)``.
    """
    # Positional array: a pandas Series with a non-default index would otherwise
    # be indexed by label in y_true[i].
    y_true = np.asarray(y_true).ravel()

    # Column indices of each row ordered from most to least probable class.
    ranked = np.fliplr(np.asarray(y_pred_proba).argsort())

    scores = []
    for i in range(y_true.size):
        # 1 at the rank where the true class appears, 0 elsewhere.
        relevance = (ranked[i,:]==y_true[i]).astype(int)
        scores.append(ndcg_at_k(relevance,5,method=1))

    return np.mean(scores)

ndcg_scorer = make_scorer(ndcg_wrapper, greater_is_better=True, needs_proba=True)

#  Features 

## Preprocessing of Main Features

In [6]:
def preprocess_missing(df):
    """Handle missing and invalid values in the main user features.

    Returns a new DataFrame; the frame passed in is not modified.

    Steps
    -----
    * age: add one indicator column per 5-year bin (14, 19], ..., (94, 99];
      ages outside the bins and missing ages get zeros in every bin.
      Ages above 100 or below 15 become the sentinel -111, missing ages
      become the sentinel -1.
    * gender: '-unknown-' is turned into NaN.
    * first_affiliate_tracked: missing values become 'untracked'.
    * date_first_booking: dropped when present.
    """
    ## DISCRETIZE AGE
    age_bins = pd.get_dummies(pd.cut(df['age'],list(range(14,100,5)),right=True))
    # concat builds a new frame, so nothing below touches the caller's df.
    df = pd.concat([df,age_bins],axis=1)

    ## AGE
    # Out-of-range ages -> -111, missing ages -> -1 (two separate sentinels).
    df.loc[df.age > 100, 'age'] = -111
    df.loc[df.age < 15, 'age'] = -111
    # Plain assignment instead of df['age'].fillna(..., inplace=True): the chained
    # inplace form does not update the frame under pandas Copy-on-Write.
    df['age'] = df['age'].fillna(-1)

    ## GENDER
    # Treat the '-unknown-' placeholder as missing
    df['gender'] = df['gender'].replace('-unknown-',np.nan)
    #df['gender'].fillna('MISSING',inplace=True)

    ## FIRST AFFILIATE TRACKED
    # Set missing to untracked, hopefully the same
    df['first_affiliate_tracked'] = df['first_affiliate_tracked'].fillna('untracked')

    ## Get rid of date_first_booking
    if 'date_first_booking' in df.columns:
        df = df.drop(['date_first_booking'],axis=1)

    return df

def preprocess_features(df):
    """Split the date columns into numeric parts and one-hot encode categoricals.

    The date-part columns are written onto ``df`` itself and the two raw date
    columns are dropped from it in place; the returned frame is the result of
    ``pd.get_dummies`` on that modified frame.
    """
    # date_account_created: 'YYYY-MM-DD' -> year, month, day
    created = np.vstack([[int(piece) for piece in stamp.split('-')]
                         for stamp in df['date_account_created'].astype(str)])
    for position, column in enumerate(['dac_year', 'dac_month', 'dac_day']):
        df[column] = created[:, position]
    df.drop('date_account_created', axis=1, inplace=True)

    # timestamp_first_active: 'YYYYMMDDhhmmss'; all six fields are parsed,
    # only year, month, day and hour are kept as features.
    field_bounds = [(0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14)]
    active = np.vstack([[int(stamp[start:stop]) for start, stop in field_bounds]
                        for stamp in df['timestamp_first_active'].astype(str)])
    for position, column in enumerate(['tfa_year', 'tfa_month', 'tfa_day', 'tfa_hour']):
        df[column] = active[:, position]
    df.drop('timestamp_first_active', axis=1, inplace=True)

    # One-hot encode every categorical column
    to_encode = [
        'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
        'first_browser', 'first_device_type', 'gender', 'language',
        'signup_app', 'signup_flow', 'signup_method',
    ]
    return pd.get_dummies(df, columns=to_encode)

def preprocess_all(df):
    """Apply preprocess_missing followed by preprocess_features and return the result."""
    return preprocess_features(preprocess_missing(df))

In [7]:
# Preprocess the combined train+test frame in a single pass.
All = preprocess_all(all_users) 

## Session Features

In [8]:
# Raw session log; later cells read the columns user_id, action, action_type,
# action_detail, device_type and secs_elapsed from it.
session_df = pd.read_csv('input/sessions.csv')

#### Counts of Actions Taken

In [9]:
# NOTE: this is an alias, not a copy -- the column edits below also appear in session_df.
session_users = session_df

# Clean missing values.
# Plain assignment instead of column.fillna(..., inplace=True): the chained
# inplace form does not update the frame under pandas Copy-on-Write.
session_users['secs_elapsed'] = session_users['secs_elapsed'].fillna(0)
session_users['action_type'] = session_users['action_type'].fillna('')

# Combine for grouping
# NOTE(review): action / action_detail are not filled, so a NaN in either gives a NaN key
# and groupby leaves those rows out of the counts -- confirm this is intended.
#session_users['action_action_type'] = session_users['action']+'__'+session_users['action_type']
session_users['action_action_detail'] = session_users['action']+'__'+session_users['action_detail']
#session_users['action_action_type_detail'] = session_users['action']+'__'+session_users['action_type']+'__'+session_users['action_detail']


# Group actions for users: one row per user, one column per action__detail, values = event counts.
# Selecting the column before count() avoids counting every other column first.
#session_users1 = session_users.groupby(['user_id','action_action_type']).count()['secs_elapsed'].unstack().fillna(0)
session_users2 = session_users.groupby(['user_id','action_action_detail'])['secs_elapsed'].count().unstack().fillna(0)
#session_users3 = session_users.groupby(['user_id','action_action_type_detail']).count()['secs_elapsed'].unstack().fillna(0)


In [10]:
#session_users2.head(2)

#### Elapsed Time by Device Type

In [11]:
# Total seconds elapsed per user and device type (one column per device type);
# user/device combinations with no sessions are set to 0.
_device_seconds = session_df[['user_id','device_type','secs_elapsed']]
_device_totals = _device_seconds.groupby(['user_id','device_type']).sum()
deviceusage_df = _device_totals.unstack()['secs_elapsed'].fillna(0)
#deviceusage_df.head(2)

#### Combine Extracted Features

In [12]:
# Join action counts and per-device time on their shared user_id index;
# the inner join keeps only users present in both tables.
SessionFeatures = pd.merge(session_users2,deviceusage_df,right_index=True,left_index=True,how='inner')

## Age_Gender_Bkts - Create Features
Could also add in Country Locations here . . . if that makes a difference.

In [13]:
# Population table; the next cell uses its columns age_bucket, gender,
# country_destination and population_in_thousands.
age_gender_df = pd.read_csv('input/age_gender_bkts.csv')

In [14]:
# Rows: (age_bucket, gender); columns: destination country; values: population_in_thousands
# (aggregated with pivot_table's default aggregation).
A = pd.pivot_table(age_gender_df,values='population_in_thousands',index=['age_bucket','gender'],columns='country_destination')
# Each row divided by its own total: per-country share within one age/gender bucket.
normalised_brackets = A.divide(A.sum(axis=1),axis=0)
# Per-country share of the population summed over all buckets; used as the fallback for unknown age/gender.
normalised_totals = A.sum()/A.sum().sum()
#normalised_brackets

In [15]:
#AA = A.reset_index().groupby('gender').sum()  ## - Later separate missing by male and female . . . very sligth difference?
#AA.loc['female']/AA.loc['male']

In [16]:
def age_bucket_converter(age, gender):
    """Translate a raw (age, gender) pair into the (bucket, gender) labels used
    to index the age/gender population table.

    Ages strictly between 0 and 100 map to a 5-year bucket such as '35-39',
    except that the '5-9' bucket is reported as 'Missing'. Ages above 99 map to
    '100+'. Anything else (including NaN, 0 and negatives) is 'Missing'.
    'MALE' / 'FEMALE' become 'male' / 'female'; any other gender is 'Missing'.
    """
    if 0 < age < 100:
        lower = int(age // 5 * 5)
        bucket = '{}-{}'.format(lower, lower + 4)
        Age = 'Missing' if bucket == '5-9' else bucket
    elif age > 99:
        Age = '100+'
    else:
        Age = 'Missing'

    Gender = {'MALE': 'male', 'FEMALE': 'female'}.get(gender, 'Missing')

    return (Age, Gender)

def bkts_feature_create(age_gender,normalised_brackets,normalised_totals):
    """Return the destination-country shares for one user's (age, gender) pair.

    ``age_gender`` is unpacked into age_bucket_converter. When either converted
    label is 'Missing' the overall ``normalised_totals`` are returned; otherwise
    the row of ``normalised_brackets`` for that (bucket, gender).
    """
    bucket, sex = age_bucket_converter(*age_gender)

    if bucket == 'Missing' or sex == 'Missing':
        return normalised_totals
    return normalised_brackets.loc[(bucket, sex)]

##### Create Features

In [20]:
# For every user (train and test) look up the destination-country shares for their
# age bucket and gender; bkts_feature_create falls back to the overall shares when either is missing.
BracketFeatures = all_users[['age','gender']].apply(lambda x: bkts_feature_create(tuple(x.values),normalised_brackets,normalised_totals),axis=1)
# Index by user id so the rows can be matched against the 'id' column of All.
BracketFeatures.set_index(all_users['id'],inplace=True)

In [21]:
# Quick look at the preprocessed frame.
All.head(2)

Unnamed: 0,age,country_destination,id,dac_year,dac_month,dac_day,tfa_year,tfa_month,tfa_day,tfa_hour,...,signup_flow_16,signup_flow_20,signup_flow_21,signup_flow_23,signup_flow_24,signup_flow_25,signup_method_basic,signup_method_facebook,signup_method_google,signup_method_weibo
0,-1,NDF,gxn3p5htnn,2010,6,28,2009,3,19,4,...,0,0,0,0,0,0,0,1,0,0
1,38,NDF,820tgsjxq7,2011,5,25,2009,5,23,17,...,0,0,0,0,0,0,0,1,0,0
2,56,US,4ft3gnwmtx,2010,9,28,2009,6,9,23,...,0,0,0,0,0,0,1,0,0,0
3,42,other,bjjt8pjhuk,2011,12,5,2009,10,31,6,...,0,0,0,0,0,0,0,1,0,0
4,41,US,87mebub9p4,2010,9,14,2009,12,8,6,...,0,0,0,0,0,0,1,0,0,0


## SELECT AND COMBINE FEATURES

In [25]:
## Merge with Bracket Features
# Matches All's 'id' column against BracketFeatures' id index (inner join).
# NOTE: this reassigns All from itself, so re-running the cell merges the bracket columns a second time.
All = All.merge(BracketFeatures,how='inner',left_on='id',right_index=True) 

## Merge with Session Features
# Currently disabled: SessionFeatures is built above but not merged in.
# Reattach Ids
#All['id'] = all_users['id']
#X = X.merge(SessionFeatures,how='inner',left_on='id',right_index=True) 

## SELECT TRAINING EXAMPLES - Which classes?

In [41]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#included_destinations = ['NDF','US']
included_destinations = ['NDF','US','FR','IT','other']

# Training rows: users whose destination is one of the selected classes.
X  = All[All['country_destination'].isin(included_destinations)]
# Integer class codes; le.classes_ holds the matching labels in code order.
y = le.fit_transform(X['country_destination'])

X = X.drop(['country_destination','id'],axis=1)

# Test rows are the ones with no destination at all.
X_Test = All[All['country_destination'].isnull()].drop(['country_destination','id'],axis=1)

# Show the label -> code mapping. (range(0,y) raised TypeError because y is an array.)
print(list(zip(le.classes_,range(len(le.classes_)))))

[('FR', 0), ('IT', 1), ('NDF', 2), ('US', 3), ('other', 4)]


# MODELS

In [None]:
#Imports
# sklearn.cross_validation and sklearn.grid_search were removed in scikit-learn 0.20;
# the same names are provided by sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing


### 0. Dummy Estimator

In [43]:
## Compare with basic. If you can't beat this, there's a problem.

from sklearn.dummy import DummyClassifier
# 'prior' always predicts the class distribution seen in training, ignoring the features.
Dummy = DummyClassifier(strategy='prior').fit(X,y)
# Scored on the same data it was fitted on -- a baseline NDCG, not a validation score.
ndcg_scorer(Dummy,X,y)

0.84055278867426719

### 1. Logistic Regression
Logistic Regression without Grid Search gets a cross-validation score of 0.657, which is a mild improvement over 0.583 for predicting NDF for everyone. Balancing classes brings it down negligibly to 0.646.
Grid search doesn't seem to have helped - 0.658 with C of 11.94.
After scaling age, it now appears third largest among the coefficients, though with an apparent decrease in accuracy to 0.63.

Discretizing age has increased the score to 0.667, about 1% up. It's something. But none of the age features appear among the important coefficients.
Reintroducing age as continuous (with NaNs imputed) together with the discrete bins now brings things up to 0.675. Huh.

Interesting. The score on the training data is almost the same. So we might not be overfitting and just have too high bias.

And now the age brackets are appearing in the coeffs. How very strange. Seems that having entered your age validly correlates with actually booking. Now gender isn't showing up though. Could there just be the single latent variable of "filling out the form properly?"

Next run of GridSearch gives ~5 for C, stronger regularisation, and a score of 0.688, which is several points up from when I started.

----------------- 

Changing now to multiclass, helps with developing NDCG.


In [28]:

## Customise Train and Test for Logistic Regression

def lr_preprocess(X, missing_values=(-1, -111)):
    """Prepare an already preprocessed feature frame for logistic regression.

    The 'age' column is imputed and rescaled to [0, 1]; all other columns are
    returned unchanged. (Age is already discretised in preprocess_missing.)

    Parameters
    ----------
    X : pandas.DataFrame
        Preprocessed features containing an 'age' column.
    missing_values : iterable of numbers, default (-1, -111)
        Sentinel ages to impute: -1 marks a missing age and -111 an
        out-of-range age in preprocess_missing. Pass ``(-1,)`` to impute
        only missing ages.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with 'age' imputed and min-max scaled. The input frame
        is not modified.
    """
    # Work on a copy so the caller's frame keeps its raw ages.
    X = X.copy()

    ## IMPUTE VALUES AND SCALE
    age = X['age'].astype(float)
    is_sentinel = age.isin(list(missing_values))
    if (~is_sentinel).any():
        # Median of the real ages only; including the sentinels would drag it down.
        age = age.mask(is_sentinel, age[~is_sentinel].median())

    scaler = preprocessing.MinMaxScaler()
    # MinMaxScaler expects a 2-D (n_samples, n_features) array.
    X['age'] = scaler.fit_transform(age.values.reshape(-1, 1)).ravel()

    return X

In [29]:
# NOTE(review): All still contains the 'id' and 'country_destination' columns at this point,
# and the later cells use X_log / y_log rather than All_log -- confirm how those are derived.
All_log = lr_preprocess(All)
All_log.shape



(275547, 189)

KeyError: 'country_destination'

In [None]:
#%%timeit -r1
# NOTE(review): X_log / y_log are not assigned in any cell visible here -- confirm where they come from.
from sklearn.linear_model import LogisticRegressionCV
# Cs=5 asks for 5 candidate regularisation strengths, chosen by built-in cross-validation.
clf = LogisticRegressionCV(Cs=5,class_weight='balanced') #class_weight='balanced'
clf.fit(X_log,y_log)


In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C = 0.0045,class_weight='balanced')
# Mean NDCG@5 over 5 cross-validation folds.
print(np.mean(cross_val_score(clf,X_log,y_log,cv=5,scoring=ndcg_scorer)))

In [None]:
#%%timeit -r1
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C = 0.0005,class_weight='balanced')

#Need to stratify.
# TODO: the split is not stratified yet. A later cell repeats this exact call
# (same random_state) to recover the same split, so change both together.
X_train, X_test, y_train, y_test = train_test_split(X_log, y_log, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

# NDCG is computed by the scorer on the held-out split; accuracy uses the hard predictions.
print('NDCG = ',ndcg_scorer(clf,X_test,y_test))
print('Accuracy = ',metrics.accuracy_score(y_test,y_pred))


In [None]:
%%timeit
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C ,class_weight='balanced') #class_weight='balanced'
clf.fit(X_log,y_log)


In [None]:
# NDCG of whichever clf is currently in the kernel, on the current held-out split.
print('NDCG = ',ndcg_scorer(clf,X_test,y_test))


In [None]:
# Same arguments and random_state as the earlier split, so this reproduces that partition.
# NOTE(review): if clf was fitted on all of X_log, X_test overlaps its training data -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X_log, y_log, random_state=0)
ndcg_scorer(clf,X_test,y_test)


In [None]:
## Test for overfitting:
# Score on the training split; values close to the held-out scores suggest bias rather than overfitting.
y_pred = clf.predict(X_train)
print('NDCG = ',ndcg_scorer(clf,X_train,y_train))
print('Accuracy = ',metrics.accuracy_score(y_train,y_pred))
# The default average='binary' raises ValueError once the target has more than two
# classes (five destinations are selected above); macro-averaging weights every class equally.
print('Recall = ',metrics.recall_score(y_train,y_pred,average='macro'))
print('Precision = ',metrics.precision_score(y_train,y_pred,average='macro'))

### Grid Search

In [None]:
# 30 candidate values of C, log-spaced between 0.1 and 100.
Crange = np.logspace(-1,2,30)

# NOTE(review): this search optimises accuracy, while the other cells report NDCG via ndcg_scorer -- confirm intended.
grid = GridSearchCV(LogisticRegression(), param_grid={'C': Crange},scoring='accuracy',cv=3,n_jobs=4)
grid.fit(X_train,y_train)
print ("best parameter choice:", grid.best_params_)
print ("best score:", grid.best_score_)


### Feature Importances - Logistic Regression

In [None]:
# One row per feature with its signed coefficient and absolute value.
# NOTE(review): this presumes clf.coef_.squeeze() is 1-D (a two-class model); with the
# five-class target coef_ would have one row per class -- confirm which clf is in scope.
FeatureImportances = pd.DataFrame(data = { 'coefs': clf.coef_.squeeze(), 'abs coefs' : np.abs(clf.coef_.squeeze())},index=X_log.columns)
FeatureImportances.sort_values(by='abs coefs',ascending=False,inplace=True)

# Plot the k coefficients with the largest magnitude (signed values on the x axis).
k = 50
plt.figure(figsize=(10,10))
sns.barplot(y=FeatureImportances.index[0:k],x=FeatureImportances['coefs'].head(k))
#plt.xticks(rotation = 90)

The picture I get from this is that people are less likely to book from mobile devices, and more likely to book if they correctly filled out their form. I doubt logistic regression has handled the age feature well, which is why discretizing might be a good idea.

## 2. Random Forest

### Custom Preprocessing

### Model

#### Scores and Confusion Matrix

### Feature Importance

### Validation and Learning Curve

## 3. ExtraTrees

## 4. XGBoost

# TEST-SET PREDICTIONS

### Logistic Regression - US and NDF - Main + Age_Bracket Features

In [None]:
# Only the last expression of a cell is displayed, so this shows X_log_test.head() only.
# NOTE(review): X_log_test is not assigned in any cell visible here -- confirm its origin.
X_log.head()
X_log_test.head()

In [None]:
# Train and test must have the same number of columns for predict_proba below.
print(X_log.shape)
print(X_log_test.shape)

In [None]:
# Class probabilities for every test user (one column per class in clf.classes_).
y_pred_test = clf.predict_proba(X_log_test)
Y = np.fliplr(y_pred_test.argsort()) #Numbers of predicted classes in order of likelihood.
# NOTE(review): passing column positions to le assumes clf was trained on codes from this same encoder -- confirm.
y_preds = le.inverse_transform(Y) #Converted to string labels.

In [None]:
## Should turn all of this into a function

# Maximum number of ranked countries written per user.
n = 5

id_test = test_users['id']

# Countries appended after the model's own ranking when it covers fewer than n classes.
fallback = ['other','FR','IT']

ids = [] # list of id's
cts = [] # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ranked = list(y_preds[i])
    # Pad without repeating a country, then keep at most n per user. Previously every
    # user got n ids but len(ranked)+3 countries, so the two lists had different
    # lengths whenever the model had more than two classes.
    top = (ranked + [c for c in fallback if c not in ranked])[:n]
    ids += [idx]*len(top)
    cts += top


In [None]:
# Two-column frame: one row per (user id, predicted country), in ranked order per user.
submission = pd.DataFrame(np.column_stack((ids,cts)), columns=['id','country'])
# NOTE(review): the file name describes a 2-class US/NDF model -- rename if another model produced y_preds.
submission.to_csv('output/sub_lr_2class_usndf_plus3_main_bkts.csv',index=False)

# BELOW HERE IS MESSY - OLD CODE

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

forest = ExtraTreesClassifier(n_estimators=50,
                              random_state=0)



# NOTE(review): y_int is not assigned in any cell visible here -- confirm its origin.
forest.fit(X_log, y_int)
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees (used as error bars).
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# NOTE(review): iterates over X's column count while the model was fitted on X_log -- confirm both have the same width.
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_log.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_log.shape[1]), indices)
plt.xlim([-1, X_log.shape[1]])
plt.show()

In [None]:
# Repeats the ranking and plot of the previous cell for the already fitted forest.
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees (used as error bars).
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# NOTE(review): iterates over X's column count while the plot below uses X_log -- confirm both have the same width.
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_log.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_log.shape[1]), indices)
plt.xlim([-1, X_log.shape[1]])
plt.show()

In [None]:
## Let's get a classifier going
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection; sklearn.cross_validation was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn import metrics

Xtrain, Xtest, ytrain, ytest = train_test_split(X_log, y_log, random_state=0)

# NOTE: this rebinds clf, replacing any logistic regression model held under that name.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(Xtrain,ytrain)
ypred = clf.predict(Xtest)


In [None]:
# Importance per feature from the fitted clf, largest first.
# NOTE: reuses the name FeatureImportances, which an earlier cell binds to a DataFrame.
FeatureImportances = pd.Series(index=X_log.columns,data=clf.feature_importances_).sort_values(ascending=False)

In [None]:
# Bar chart of the k largest feature importances.
k = 50
plt.figure(figsize=(10,15))
sns.barplot(y=FeatureImportances.index[0:k],x=FeatureImportances.head(k))
#plt.xticks(rotation = 90)