In [1]:
# %load src/model_selection.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_curve, precision_score, recall_score,roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score,  classification_report, make_scorer

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from src.plot_confusion_matrix import plot_confusion_matrix
%matplotlib inline
pd.options.display.max_colwidth = 50

import pickle

from datetime import date
import calendar

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

* Retrieve raw data from pipeline.py.
* Merge standardized geo-spatial (city, admin1, admin2, country) to the master data.
* Enrich features based on the "tickets_type" field - unpacking lists of dictionaries.

In [2]:
# %load src/merging geospatial info.py
import pandas as pd
import pickle


rg = pd.read_pickle('data/rg.pkl')

get_ipython().run_line_magic('run', 'src/pipeline')
D = FraudData('data/data.json')
df = D.get_enriched_df()





In [3]:
loc = df[['venue_latitude','venue_longitude','object_id']]
loc = loc[~loc['venue_latitude'].isnull()]

loc2 = loc.join(rg)
loc3 = loc2.drop(['venue_latitude', 'venue_longitude'],axis = 1)
df = df.merge(loc3, how='left', right_on = 'object_id', left_on = 'object_id')




In [4]:
idx= df.set_index(['object_id']).ticket_types.apply(pd.Series).stack().index

tmp = pd.DataFrame(df.set_index(['object_id']).ticket_types.apply(pd.Series).stack().values.tolist(),index=idx).reset_index().drop('level_1',1)
tmp_sum = tmp.groupby('event_id').agg({'cost': ['min', 'max'],'quantity_total' : 'sum' }).reset_index()
tmp_sum.columns = [' '.join(col).strip() for col in tmp_sum.columns.values]

df = df.merge(tmp_sum, how='left', left_on = 'object_id', right_on = 'event_id')
df['ticket_tiers_num'] = df['ticket_types'].apply(lambda x: len(x))

for col in ['cost min', 'cost max','quantity_total sum']:
    df[col]=df[col].apply(lambda x: 0 if pd.isnull(x) else x)

df['day_of_week'] = df['ec_weekday'].apply(lambda x: calendar.day_name[x])

df.listed.replace(('y', 'n'), (1, 0), inplace=True)

In [5]:
n_feat = ['sale_duration', 'user_event_delta', 'cost min', 'cost max', 'quantity_total sum', 
         'body_length', 'fb_published', 'has_analytics', 'has_logo', 'listed',
       'name_length', 'num_order', 'num_payouts', 'ticket_tiers_num','venue_longitude','venue_latitude' ]

In [6]:
c_feat = ['currency', 'email_domain','payout_type','day_of_week']

In [7]:
df_cat = pd.get_dummies(df[c_feat])

In [8]:
df_cat_n = df[n_feat].join(df_cat)
# df_cat_n.dtypes

In [9]:
hours = pd.get_dummies(df['ec_hour'].astype(str),prefix= 'hour_')
df2 = df_cat_n.join(hours)

In [10]:
y = df.pop('label').values
X = df2.copy()



In [11]:
# feat_all = n_feat + c_feat

In [12]:
# # Create dataframe mapper in preparation for sklearn modeling, which takes numeric numpy arrays
# mapper = DataFrameMapper([
#         ('sale_duration', None),
#         ('user_event_delta', None),
#         ('cost min', None),
#         ('cost max', None),
#         ('quantity_total sum', None),
#         ('body_length', None),
#         ('fb_published', None),
#         ('has_analytics', None),
#         ('has_logo', None),
#         ('listed', None),
#         ('name_length', None),
#         ('num_order',None),
#         ('num_payouts',None),
#         ('ec_weekday',LabelBinarizer()),
#         ('ec_hour',LabelBinarizer()),
#         ('ticket_tiers_num',None),
#          ('currency',LabelBinarizer()),
#          ('email_domain',LabelBinarizer()),
#          ('payout_type',LabelBinarizer()),
# #          ('city_std',LabelBinarizer()),
#          ('admin1',LabelBinarizer()),
# #          ('admin2',LabelBinarizer()),
#          ('country_std',LabelBinarizer())])
         
         
         
         
         

In [13]:
# imputation = SimpleImputer( strategy='mean',fill_value=0)

In [14]:
# lr_pipeline = Pipeline([
#         ('featurize', mapper),
#         ('scale', StandardScaler()),
#         ('impute', imputation),
#         ('classifier', LogisticRegression())
#         ])

In [15]:
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)

In [16]:
# scaler.mean_, scaler.scale_

In [17]:
# X_train = pd.DataFrame(X_train)
# X_train.columns = cols

In [18]:
# X_train = X_train.apply(lambda x: x.fillna(x.mean(),axis=0))


#### Model Selection

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=42)

In [20]:
# List of classifiers
lr = LogisticRegression()
rf = RandomForestClassifier(n_jobs=-1)
gb = GradientBoostingClassifier()



In [21]:
imputation = SimpleImputer(strategy='mean',fill_value=0)
pipeline = (Pipeline([
                
                ('scale', StandardScaler()),
                ('impute',imputation),
                ('clf', LogisticRegression())
                ]))

In [22]:
def eval_scores(clf):
    recall = cross_val_score(clf, X_train, y_train,  scoring='recall')
    precision = cross_val_score(clf, X_train, y_train, scoring='precision')
    f1 = cross_val_score(clf, X_train, y_train, scoring='f1')

    print('Precision', np.mean(precision))
    print('Recall', np.mean(recall))
    print('F1', np.mean(f1))

In [23]:
eval_scores(pipeline)

Precision 0.8131197697769044
Recall 0.6927977168775242
F1 0.747881763227244


In [24]:
pipelines = []

plt.figure(figsize=(10, 10))
for classifier in [lr, rf,gb]:
    pipeline.set_params(clf = classifier)
    pipeline.fit(X_train, y_train)
    predicted = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    accuracy = accuracy_score(y_test, predicted)
    precision = precision_score(y_test, predicted)
    recall = recall_score(y_test, predicted)
    print(f"Accuracy Score: {accuracy}")
    print(f"Precision Score: {precision}")
    print(f"Recall Score: {recall}")

---------------------------------
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
-----------------------------------
Accuracy Score: 0.9642956764295676
Precision Score: 0.851985559566787
Recall Score: 0.7306501547987616
---------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
-----------------------------------
Accuracy Score: 0.9637377963737797
Precision Score: 0.9617224880382775


<Figure size 720x720 with 0 Axes>

In [26]:
pipeline.set_params(clf = gb)
pipeline.fit(X_train,y_train)



In [41]:
# feature_importances = pipeline.steps[2][1].feature_importances_
# feature_names = X_train.columns
# feat_impt = pd.DataFrame(feature_importances, index = feature_names)
# feat_impt.sort_values(0, inplace = True)
# feat_impt.plot.barh(figsize = (12, 6))

In [29]:
# apply oversampling to pre-processed data

def smote_split():
#     print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
#     print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

    sm = SMOTE(random_state=2)
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
    return X_train_res, y_train_res 

    print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
    print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

    print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
    print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))