# Imports

In [46]:
import pandas as pd
import numpy as np
import math
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from dateutil.parser import parse
from datetime import timedelta
import pprint as pp

In [47]:
# Read in the pandas.DataFrame from csv
data = pd.read_csv('../DataAnalysis/RandomForest/ebay_data_rf.csv', index_col=False)

# Random forest model

Our eBay data contains primarily categorical variables. Thus, random forest is a natural option for modeling the dataset, because the random forest algorithm is based on decision trees which naturally handle categorical data.

## Preprocessing

In [59]:
# Separate target variable from inputs
y = data.sellingState
T = data.endTime
X = data.drop(['sellingState','endTime'], axis=1)
T_dt = [parse(x) for x in T]

def get_hrs_diff(dt1 , dt2):
    return (dt1 - dt2).total_seconds()/3600

def get_weight(dt, dt_ref, sigma):

    hrs = get_hrs_diff(dt,dt_ref)
    
    return math.exp( -hrs**2 / (2 * sigma) )

def get_sample_weights(dt_ls, sigma):
    latest = max(dt_ls)
    return [get_weight(x, latest, sigma) for x in dt_ls]

In [61]:
get_sample_weights(T_dt, 24)

[1.0,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.99990663855341,
 0.9998333407924811,
 0.9998030993771846,
 0.9997492196579527,
 0.9996419365480078,
 0.9996373712569573,
 0.9996281539641231,
 0.9995399962020027,
 0.9994424763974273,
 0.9994098385656573,
 0.9992807998364693,
 0.9990424083722486,
 0.9988878616830714,
 0.9988145519005225,
 0.9987502607420179,
 0.9986106304206872,
 0.9984730275429963,
 0.9982157847087677,
 0.9978652431967864,
 0.9974954240403184,
 0.9971519799924377,
 0.9966676002398285,
 0.9966583590617379,
 0.9962733962723803,
 0.9962587376700931,
 0.995846994834742,
 0.995643672819729,
 0.9954947215334848,
 0.9954839869935731,
 0.9949538179241221,
 0.9945723961255865,
 0.9941405876187723,
 0.9940177339572109,
 0.9919665980136159,
 0.9912812285526349,
 0.990521741340551,
 0.9901460869066424,
 0.9896259305272155,
 0.9882464706267743,
 0.988142

## Train the random forest classifier

In [3]:
n_estimators = 80
max_features = 7
weights = {0: 2, 1: 1}
clf = RandomForestClassifier(n_estimators = 80,
                             max_features=7,
                             class_weight=weights,
                             warm_start=False)

clf.fit(X_train,y_train)

ValueError: could not convert string to float: '2016-04-18T14:55:55.000Z'

## Test the model on the test set

In [98]:
# Test on the training set:
y_test_pred = clf.predict(X_test)

# Print the confusion matrix
mat = confusion_matrix(y_test, y_test_pred)
num = np.sum(mat)
print("Confusion matrix:\n", mat)
print("Normalized confusion matrix:\n", mat/num)
print("Accuracy:\n", (mat[0,0]+mat[1,1])/num)

# Calculate the roc_auc score
print('ROC AUC:\n', roc_auc_score(y_test, clf.predict_proba(X_test)[:,1], average = 'weighted'))

Confusion matrix:
 [[1330  224]
 [ 323  623]]
Normalized confusion matrix:
 [[ 0.532   0.0896]
 [ 0.1292  0.2492]]
Accuracy:
 0.7812
ROC AUC:
 0.832348015488


The accuracy on the test set agrees well with the OOB error rate of about 0.2 for our model parameters.

## Feature importances
The random forest model also allows us to see which features were important for modeling the data.

In [100]:
cols = X.columns
feature_scores = clf.feature_importances_

score_card = pd.DataFrame.from_items([('Features', cols),('Scores', feature_scores)])
score_card.sort_values(by = 'Scores', inplace=True, ascending=False)

print(score_card)

                Features    Scores
17               endHour  0.131385
13             startHour  0.128735
19           endMonthday  0.110342
15         startMonthday  0.097369
6      buyItNowAvailable  0.091429
1        productId_value  0.073820
18            endWeekday  0.063566
14          startWeekday  0.059124
10          shippingType  0.043532
5            listingType  0.040778
9      expeditedShipping  0.028797
12       returnsAccepted  0.023537
3            conditionId  0.019814
7       bestOfferEnabled  0.018616
0         productId_type  0.018359
2   conditionDisplayName  0.018032
11        isShippingFree  0.010840
16            startMonth  0.009505
20              endMonth  0.005948
4                country  0.003612
8          paymentMethod  0.002860


We see that the most important features have to do with the time of day that the listing started and ended and the day of month that the listing started and ended. These variables are may be highly correlated, so we should be careful not to read too much into the model at this point. We can say, however, that listing time features are predictive for the listing outcome.