In [1]:
import pandas as pd
import evaluation_helper as eval

# Read in data

features = pd.read_csv('../data/answer.csv', sep=';', nrows=100000, skiprows=[i for i in range(1, 5000000)])

features.head(5)


Unnamed: 0,id,user,place_asked,place_answered,type,inserted,response_time,place_map,language,options,ip_country,ip_id
0,5014157,61806,184,184.0,1,2014-12-23 20:04:50,2150,230,0,[],CZ,23178
1,5014158,61806,203,203.0,1,2014-12-23 20:04:52,2081,230,0,[],CZ,23178
2,5014159,58920,1173,1173.0,2,2014-12-23 20:04:52,3170,231,0,"[1172, 1165, 1173]",CZ,23031
3,5014160,61806,176,176.0,1,2014-12-23 20:04:55,2207,230,0,[],CZ,23178
4,5014161,58920,1055,1055.0,2,2014-12-23 20:04:56,3017,231,0,"[1083, 1108, 1055]",CZ,23031


In [2]:
# Parsing labels
import numpy as np
questions = np.array(features['place_asked'])
answers = np.array(features['place_answered'])
labels = np.equal(questions, answers)
labels = list(map(lambda x: 1 if x else 0, labels))
labels = np.array(labels)

In [3]:
# Removing answers from features
features = features.drop('place_answered', axis=1)
# TODO parse date
features = features.drop('inserted', axis=1)
# Removing non-important or missing features
features = features.drop('place_map', axis=1)
features = features.drop('ip_country', axis=1)
features = features.drop('ip_id', axis=1)
features.describe()

Unnamed: 0,id,user,place_asked,type,response_time,language
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5019157.0,60322.1501,418.6303,1.1355,36958570.0,0.1719
std,2887.106,6284.409444,430.464876,0.342274,279219600.0,0.392385
min,5014157.0,17083.0,1.0,1.0,169.0,0.0
25%,5016658.0,61580.0,112.0,1.0,2879.75,0.0
50%,5019158.0,61819.0,196.0,1.0,4316.0,0.0
75%,5021657.0,61858.0,676.25,1.0,7246.25,0.0
max,5024157.0,61920.0,1318.0,2.0,2147484000.0,2.0


In [4]:
# Parsing options to number of options
features['options'] = features['options'].apply(lambda x: len(x[1:-1].split(",")))

# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)


In [5]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (7500, 7)
Training Labels Shape: (7500,)
Testing Features Shape: (2500, 7)
Testing Labels Shape: (2500,)


In [6]:
# The baseline predictions are the historical averages
averages = np.array([1 for x in range(0, len(test_labels))])
# Baseline errors, and display average baseline error
baseline_errors = abs(averages - test_labels)
print("##############################BASELINE###################################")
print('Accuracy: ', eval.accuracy(test_labels, averages))
print('RMSE: ', eval.rmse(averages, test_labels))
print('AUC: ', eval.auc(list(map(int,test_labels)), averages))
print('Pearson: ', eval.pearson(averages, test_labels))

##############################BASELINE###################################
Average baseline error:  0.25
Correct 0   613
Correct 1   1887
Predictions > 0.5   2500
Accuracy:  0.7548
RMSE:  0.49517673612559787
Pearson:  0.0


In [7]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print("##############################TEST SET###################################")
print('Accuracy: ', eval.accuracy(test_labels, averages))
print('RMSE: ', eval.rmse(predictions, test_labels))
print('AUC: ', eval.auc(list(map(int,test_labels)), predictions))
print('Pearson: ', eval.pearson(predictions, test_labels))


##############################TEST SET###################################
Average baseline error: 0.28
Correct 0   613
Correct 1   1887
Predictions > 0.5   2500
Accuracy:  0.7548
RMSE:  0.3888097807411743
Pearson:  -1.9949536951421423
AUC:  0.7709696550018976


In [8]:
print("##############################FEATURES IMPORTANCE###################################")
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

##############################FEATURES IMPORTANCE###################################
Variable: response_time        Importance: 0.35
Variable: id                   Importance: 0.27
Variable: place_asked          Importance: 0.22
Variable: user                 Importance: 0.09
Variable: options              Importance: 0.04
Variable: language             Importance: 0.02
Variable: type                 Importance: 0.01


[None, None, None, None, None, None, None]