# Setup
## Imports

In [1]:
from itertools import chain
import ujson as json
import multiprocessing as mp
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import ShuffleSplit
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier

from helpers import disaggregated_df
from helpers import aggregated_df
from helpers import dummify_df

from helpers import score_metrics
from helpers import all_scoring_metrics

%matplotlib inline
sns.set(font='monospace')

In [2]:
df = pd.read_pickle('../../data/merged/data_to_use_by_phone_v4.pkl')

In [4]:
df['class'] = df['class'].isin(['positive'])
df['class'].value_counts()

False    85334
True     74809
Name: class, dtype: int64

In [6]:
df.columns

Index(['phone', 'age_mean', 'price_mean', 'duration_in_mins_mean',
       'price_per_min_mean', 'price_per_min_mean', 'n_ads_mean',
       'n_distinct_locations_mean', 'location_tree_length_mean',
       'n_outcall_mean',
       ...
       'flag_Derogatory Descriptions_sum_sum',
       'flag_Foreign Providers_sum_sum', 'flag_Hotel_sum_sum',
       'flag_Juvenile_sum_sum', 'flag_Massage Parlor_sum_sum',
       'flag_Multiple Girls_sum_sum', 'flag_Risky Services_sum_sum',
       'flag_Traveling_sum_sum', 'flag_URL Embedding_sum_sum', 'class'],
      dtype='object', length=132)

In [5]:
eval_columns = ['f1','accuracy','true_negative_rate','true_positive_rate','roc_auc']
price_cols = ['duration_in_mins','price','price_per_min']

# work at phone level
print("_____")
print("Work at phone level...")
phone_clf = ExtraTreesClassifier(oob_score=False,
                                 bootstrap=False,
                                 random_state=2,
                                 n_estimators=100,
                                 n_jobs=-1,
                                 class_weight="balanced")

metrics = all_scoring_metrics(phone_clf,
                              df.ix[:, [x for x in df.columns if x not in {'class', 'phone'}]],
                              phone_level_label,
                              StratifiedKFold(phone_level_label, 10))
print("Results (averaged from 10 fold cross validation and computed out of sample)")
print(metrics[[i for i in metrics.columns if i in eval_columns]])

_____
Work at phone level...


NameError: name 'phone_level_label' is not defined

In [6]:
# work at ad level
print("_____")
print("Work at ad level...")
clf = ExtraTreesClassifier(oob_score=True, random_state=2, n_estimators=100, n_jobs=-1, class_weight="balanced")
metrics = all_scoring_metrics(clf, df_X, df['class'], StratifiedKFold(df['class'], 2))
print("Results (averaged from 10 fold cross validation and computed out of sample)")
print(metrics[[i for i in metrics.columns if i in eval_columns]])
importances = metrics[[i for i in metrics.columns if i not in eval_columns]]
print('Price importances: %s' % importances[[i for i in importances.columns if i in price_cols]].sum(axis=1).iloc[0])
print('Age importances: %s' % importances['age'].iloc[0])

_____
Work at ad level...


NameError: name 'df_X' is not defined

In [None]:
flag_cols = [x for x in df.columns if x.split('_')[0] == 'flag']
eth_cols = [x for x in df.columns if x.split('_')[0] == 'ethnicity']

cols_to_use = ['age',
               'price',
               'duration_in_mins',
               'price_per_min'] + flag_cols + eth_cols

df_slice = df.ix[:, ['class',
                     'phone',
                     'dd_id'] + cols_to_use].copy().drop_duplicates()

print(df_slice.shape)

# Fill NAs
for col in ['age', 'price', 'duration_in_mins', 'price_per_min']:
    df_slice['no_{}'.format(col)] = df_slice[col].apply(np.isnan)
    df_slice[col] = df_slice[col].fillna(0)
    
cols_to_use += [x for x in df.columns if x[:3] == 'no_']

# Dropped duplicates should kill identical ads
# WE NEED TO GROUP ON PHONES
# But I'm not going to think about that right now, so ads it is
df_X = df_slice.ix[:, cols_to_use]
y_series = df_slice['class'].astype(int)
print(df_X.shape)
print(y_series.shape)

### Get Train / Test splits, folds, or whatever you want to call them
Splitting *after* dropping a bunch of duplicates. We want the phone number to be the primary key.

In [None]:
splitter = ShuffleSplit(df_X.shape[0], 10)
splits = [x for x in splitter]

### Run model

In [None]:
p = mp.Pool(10)
lrs = p.starmap(lr_train_tester,
                [(df_X.iloc[train_ix, :],
                  y_series.iloc[train_ix],
                  df_X.iloc[test_ix, :],
                  y_series.iloc[test_ix])
                 for train_ix, test_ix in splits])
p.close()
p.join()

In [None]:
pd.Series([lr['lr_score'] for lr in lrs]).describe()

In [None]:
pd.Series([lr['roc']['auc'] for lr in lrs]).describe()

With these splits and features, our linear model _juuuuust_ a little better than the mean. Thanks, skewed data :(

In [None]:
from scipy import interp

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for lr in lrs:
    mean_tpr += interp(mean_fpr, lr['roc']['fpr'], lr['roc']['tpr'])
    mean_tpr[0] = 0.0
    
mean_tpr /= len(lrs)
mean_tpr[-1] = 1.0

plt.plot(mean_fpr, mean_tpr,
         label='Mean ROC (area = %0.2f)' % pd.Series([lr['roc']['auc'] for lr in lrs]).describe()['mean'], lw=2)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Mean ROC on 10-fold Shuffle Split Linear Regression')
plt.legend(loc="lower right")
plt.show()