In [1]:
import numpy as np
import pandas as pd
import src.scrubbington as scrub
import src.evaluationton as evalu


from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.grid_search import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline



### Read In and Clean Data

In [2]:
df_orig = pd.read_json('./data/data.json')

In [3]:
feature_list = ['channels', 'fb_published', 'has_analytics', 'has_logo', 'num_order', 'num_payouts', 
'sale_duration2', 'show_map', 'user_age', 'user_type', 'body_length']

In [4]:
df, y, X = scrub.scrub_everything(df_orig, feature_list)

In [5]:
df.head()

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,...,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state,fraud_no_fraud,org_name_bool
0,fraudster_event,2010-02-13 12:00:00,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,2010-01-06 01:01:46,...,2009-11-30 20:45:50,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL,True,False
1,premium,2011-02-03 08:00:00,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,2010-12-31 21:57:50,...,2010-08-04 17:26:16,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC,False,False
2,premium,2011-01-28 00:00:00,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,2010-11-30 04:22:36,...,2010-04-29 16:43:08,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA,False,False
3,premium,2014-01-06 00:00:00,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,2013-02-12 15:06:10,...,2010-09-07 14:35:02,3,,,,,,,False,True
4,premium,2011-02-17 00:00:00,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,2010-12-10 15:24:26,...,2010-11-05 19:07:45,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA,False,False


### Test Train Split

In [6]:
#Note: default split perecentage is 25%
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle=True)

In [7]:
X_train_len = X_train.shape[0]
X_test_len = X_test.shape[0]

y_train_len = y_train.shape[0]
y_test_len = y_test.shape[0]

print("X_train Data Count: {}".format(X_train_len))
print("y_train Data Count: {}".format(y_train_len))
print("X_test Data Count: {}".format(X_test_len))
print("y_test Data Count: {}".format(y_test_len))

print("\n\nSplit Percentage for Train Data: {}".format(X_train_len/(X_train_len + X_test_len)))

X_train Data Count: 10752
y_train Data Count: 10752
X_test Data Count: 3585
y_test Data Count: 3585


Split Percentage for Train Data: 0.7499476878007951


### Create IF Model and Fit to Train Data

In [8]:
iso_forest = IsolationForest()

In [9]:
iso_forest.fit(X_train)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

### Make Predictions and Check Accuracy

In [10]:
y_predictions = iso_forest.predict(X_test)

In [11]:
# Train and Test Accuracy
print ("Train Accuracy : {}".format(accuracy_score(y_train, iso_forest.predict(X_train))))
print ("Test Accuracy  : {}".format(accuracy_score(y_test, y_predictions)))

Train Accuracy : 0.08696056547619048
Test Accuracy  : 0.08842398884239888


### Plotting Functions

def plot_roc(y_test, X_test, model, model_name):
    AUC = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:,1])
    
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % AUC)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic {}'.format(model_name))
    plt.legend(loc="lower right");

In [12]:
def plot_profit_curve(scoring, profit_matrix, num_points=100):
    x_threshold, y_profit = scoring.profit_curve(num_points=num_points, profit_matrix=profit_matrix)
    
    fig, ax = plt.subplots()
    ax.plot(x_threshold, y_profit)
    
    ymax = max(y_profit)
    xpos = y_profit.index(ymax)
    xmax = x_threshold[xpos]
    
    text= "x={:.3f}, y={:.3f}".format(xmax, ymax)
    bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
    arrowprops=dict(arrowstyle="->",connectionstyle="angle,angleA=180,angleB=60")
    kw = dict(xycoords='data',textcoords="axes fraction", arrowprops=arrowprops, bbox=bbox_props, ha="right", va="top")
    ax.annotate(text, xy=(xmax, ymax), xytext=(0.94,0.96), **kw)
    ax.set_ylim(min(y_profit), ymax*2)
    ax.set_xlabel('Threshold For Predicting Fraud')
    ax.set_ylabel('Profit')
    ax.set_title('Profit Curve')
    plt.show()

### Evaluating Standard RF

In [13]:
scoring = evalu.Scores(model=iso_forest, X_test=X_test, y_test=y_test)
scoring.prediction

array([ 1,  1,  1, ...,  1,  1, -1])

In [16]:
y_test

array([False, False,  True, ...,  True, False, False], dtype=bool)

In [18]:
y_test[0:10]

array([False, False,  True, False, False, False,  True, False, False, False], dtype=bool)

In [19]:
for i in range(10):
    print(y_test[i], scoring.prediction[i])

False 1
False 1
True 1
False -1
False 1
False 1
True 1
False 1
False 1
False 1
