In [1]:
# Imports

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from pandas.api.types import is_string_dtype
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score,
                             roc_auc_score,
                             roc_curve,
                             accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# Data Loading

In [2]:
fraud_data_raw = pd.read_csv('Candidate_tech_evaluation_candidate_copy_data science_fraud.csv')
fraud_data = fraud_data_raw
country_ip_range = pd.read_excel('Candidate_tech_evaluation_candidate_copy_datascience_IpAddress_to_Country.xlsx')

# Data Exploration and Preprocessing

In [6]:
fraud_data.describe()

Unnamed: 0,user_id,purchase_value,age,ip_address,class
count,120000.0,120000.0,120000.0,120000.0,120000.0
mean,200320.2801,36.881475,33.12525,2151550000.0,0.093875
std,115361.945935,18.325855,8.617081,1248314000.0,0.291656
min,2.0,9.0,18.0,93447.14,0.0
25%,100925.5,22.0,27.0,1085345000.0,0.0
50%,200037.5,34.0,33.0,2155238000.0,0.0
75%,300395.5,49.0,39.0,3241268000.0,0.0
max,400000.0,154.0,76.0,4294850000.0,1.0


In [7]:
fraud_data_distribution = fraud_data['class'].value_counts()
ratio_normal_fraud = fraud_data_distribution[1]/fraud_data_distribution[0]
print("Percentage of data with fraud = {} ".format(ratio_normal_fraud*100))

Percentage of data with fraud = 10.360049662022348 


In [8]:
print("Counts of each class in data")
fraud_data_distribution

Counts of each class in data


0    108735
1     11265
Name: class, dtype: int64

In [9]:
fraud_data['class'].value_counts(normalize=True)

0    0.906125
1    0.093875
Name: class, dtype: float64

In [10]:
fraud_data['sex'].value_counts(normalize=True)

M    0.584383
F    0.415617
Name: sex, dtype: float64

In [11]:
print("Average purchase Value per class")
fraud_data[['purchase_value','class']].groupby('class').mean()


Average purchase Value per class


Unnamed: 0_level_0,purchase_value
class,Unnamed: 1_level_1
0,36.862344
1,37.066134


In [12]:
def plot_count_of_time(class_value,fraud_data):
    plt.figure(figsize=(16,8))
    fraud_data_class=fraud_data[fraud_data['class']==class_value]
    fraud_data_class['signup_to_purchase_h']=fraud_data_class['signup_to_purchase_seconds']*3600
    ax = sns.barplot(fraud_data_class['signup_to_purchase_h'].unique(),fraud_data_class['signup_to_purchase_h'].value_counts())
    for patch in ax.patches:
        ax.annotate('{:.0f}'.format(patch.get_height()), (patch.get_x()+0.3, patch.get_height()+100),fontsize=12)
    ax.set_title("Unique Counts of signup_to_purchase_hours with class {0}".format(class_value),fontsize=20)

In [13]:
def set_ip_address_country(ip):
    mask = (ip >= country_ip_range.lower_bound_ip_address) & (ip <= country_ip_range.upper_bound_ip_address)
    
    if mask.any():
        return country_ip_range.country[mask].to_string(index=False)

In [14]:
# fraud_data['country'] = fraud_data['ip_address'].apply(lambda ip : set_ip_address_country(ip))

In [15]:
import datetime
print(datetime.datetime.now())
df1 = fraud_data
df2 = country_ip_range

country = []
for i in df1['ip_address']:
   a = int(i) >= df2['lower_bound_ip_address']
   b = int(i) <= df2['upper_bound_ip_address']
   res = df2[a & b]
   try:
       country.append((res['country'].values[0]))
   except:
       country.append('')

#MERGE COUNTRY
df1['country'] = pd.DataFrame(country)
df = df1
print(datetime.datetime.now())

df


2019-08-11 15:51:25.923572
2019-08-11 15:55:14.651715


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,285108,7/15/2015 4:36,9/10/2015 14:17,31,HZAKVUFTDOSFD,Direct,Chrome,M,49,2.818400e+09,0,United States
1,131009,1/24/2015 12:29,4/13/2015 4:53,31,XGQAJSOUJIZCC,SEO,IE,F,21,3.251268e+09,0,United Kingdom
2,328855,3/11/2015 0:54,4/5/2015 12:23,16,VCCTAYDCWKZIY,Direct,IE,M,26,2.727760e+09,0,United States
3,229053,1/7/2015 13:19,1/9/2015 10:12,29,MFFIHYNXCJLEY,SEO,Chrome,M,34,2.083420e+09,0,Korea Republic of
4,108439,2/8/2015 21:11,4/9/2015 14:26,26,WMSXWGVPNIFBM,Ads,FireFox,M,33,3.207913e+09,0,Brazil
5,178528,4/20/2015 20:19,8/3/2015 20:17,35,HUFZWLDYJPRCK,SEO,Chrome,F,30,2.550204e+09,0,United States
6,323775,6/30/2015 7:34,9/5/2015 16:59,28,DLOOEWQCUQRKZ,SEO,Safari,M,47,4.075994e+09,0,
7,69189,3/10/2015 1:53,6/30/2015 17:37,21,NRJPZDBHXNQJG,SEO,FireFox,F,47,2.753449e+09,0,Germany
8,70424,7/5/2015 16:54,8/25/2015 16:08,43,YDLNPVEWMTMMZ,Direct,IE,M,40,3.310808e+09,0,South Africa
9,328496,7/23/2015 12:48,10/20/2015 13:01,40,HTOCRWCBYIQJI,Ads,Safari,M,33,2.185371e+09,0,France


In [16]:
print("Count of missing values")
fraud_data.isna().sum()

Count of missing values


user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
country           0
dtype: int64

In [17]:
print("Count of missing values")
df['country']=df['country'].replace(r'^\s*$', np.nan, regex=True)
df.isna().sum()


Count of missing values


user_id               0
signup_time           0
purchase_time         0
purchase_value        0
device_id             0
source                0
browser               0
sex                   0
age                   0
ip_address            0
class                 0
country           17418
dtype: int64

In [18]:
fraud_data=df
fraud_data.dtypes

user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
country            object
dtype: object

In [19]:
print("Country to Source Count")
pd.crosstab(fraud_data.country, fraud_data.source)

Country to Source Count


source,Ads,Direct,SEO
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1,1,3
Albania,4,2,4
Algeria,30,21,47
Angola,9,1,9
Antigua and Barbuda,1,0,1
Argentina,194,107,212
Armenia,3,5,4
Australia,556,310,625
Austria,138,79,132
Azerbaijan,11,1,9


In [20]:
print("Class to Source Count")
pd.crosstab(fraud_data['class'], fraud_data.source)

Class to Source Count


source,Ads,Direct,SEO
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,43109,21681,43945
1,4352,2561,4352


In [21]:
print("Class to Country Count")
pd.crosstab(fraud_data.country,fraud_data["class"])

Class to Country Count


class,0,1
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,4,1
Albania,10,0
Algeria,88,10
Angola,18,1
Antigua and Barbuda,2,0
Argentina,457,56
Armenia,10,2
Australia,1356,135
Austria,323,26
Azerbaijan,19,2


In [22]:
fraud_data_class_1=fraud_data[fraud_data['class']==1]
fraud_data_class_1.groupby('country')['country'].count().sort_values(ascending=False).head(10)
fraud_data_class_1.groupby('country')['country'].count().sort_values(ascending=False).head(10).plot(kind='bar')
plt.ylabel('Number of users')
plt.title('Top 10 Fraud User countries')

Text(0.5, 1.0, 'Top 10 Fraud User countries')

In [23]:
fraud_data_class_1['sex'].value_counts(normalize=True)

M    0.594141
F    0.405859
Name: sex, dtype: float64

In [24]:
fraud_data.signup_time = fraud_data.signup_time.apply(pd.to_datetime)
fraud_data.purchase_time = fraud_data.purchase_time.apply(pd.to_datetime)

In [25]:
fraud_data[fraud_data['class']==1].signup_time.apply(lambda x: x.hour).hist()
plt.xlabel("Hour of Day", fontsize=15)
plt.ylabel("Count", fontsize=15)

Text(0, 0.5, 'Count')

In [26]:
fraud_data["signup_to_purchase_seconds"] = (fraud_data.purchase_time - fraud_data.signup_time).apply(lambda x: x.seconds)

In [28]:
plot_count_of_time(0, fraud_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [29]:
plot_count_of_time(1, fraud_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [30]:
sns.barplot(x="class", y="signup_to_purchase_seconds", data=fraud_data)
plt.xlabel("Fraud to no fraud", fontsize=15)
plt.ylabel("Sign-up to Purchase Time in Sec", fontsize=15)

Text(0, 0.5, 'Sign-up to Purchase Time in Sec')

In [31]:
sns.factorplot(x="class", y="purchase_value", data=fraud_data)
plt.xlabel("Fraud to no fraud", fontsize=15)
plt.ylabel("Purchase Value", fontsize=15)



Text(0.5694444444444446, 0.5, 'Purchase Value')

In [32]:
fraud_data.dtypes

user_id                                int64
signup_time                   datetime64[ns]
purchase_time                 datetime64[ns]
purchase_value                         int64
device_id                             object
source                                object
browser                               object
sex                                   object
age                                    int64
ip_address                           float64
class                                  int64
country                               object
signup_to_purchase_seconds             int64
dtype: object

In [33]:
source_dict = {'SEO':0, 'Ads':1, 'Direct':2}
if is_string_dtype(fraud_data.source):
    fraud_data.source = fraud_data.source.replace(source_dict)
browser_dict = {'Chrome':0, 'Opera':1, 'Safari':2, 'IE':3, 'FireFox':4}
if is_string_dtype(fraud_data.browser):
    fraud_data.browser =fraud_data.browser.replace(browser_dict)
if is_string_dtype(fraud_data.sex):
    fraud_data['sex']=[1 if i == 'M' else 0 for i in fraud_data['sex']]

In [34]:
fraud_data.dtypes

user_id                                int64
signup_time                   datetime64[ns]
purchase_time                 datetime64[ns]
purchase_value                         int64
device_id                             object
source                                 int64
browser                                int64
sex                                    int64
age                                    int64
ip_address                           float64
class                                  int64
country                               object
signup_to_purchase_seconds             int64
dtype: object

In [35]:
if is_string_dtype(fraud_data.country):
    labels = fraud_data['country'].astype('category').cat.categories.tolist()
    replace_map = {'country' : {country: label for country,label in zip(labels,list(range(1,len(labels)+1)))}}
    fraud_data.replace(replace_map,inplace=True)
    fraud_data['country'].fillna(0,inplace=True)


In [36]:
fraud_data.dtypes

user_id                                int64
signup_time                   datetime64[ns]
purchase_time                 datetime64[ns]
purchase_value                         int64
device_id                             object
source                                 int64
browser                                int64
sex                                    int64
age                                    int64
ip_address                           float64
class                                  int64
country                              float64
signup_to_purchase_seconds             int64
dtype: object

In [37]:
fraud_data['occurence'] = 0
device_id_occurence = fraud_data['device_id'].value_counts()
fraud_data['occurence'] = fraud_data.apply(lambda row: device_id_occurence[row['device_id']], axis=1)
fraud_data['occurence'].value_counts()

1     106386
2       6832
9        990
8        840
10       810
11       748
7        630
12       612
13       520
6        444
14       364
5        305
3        144
15       105
4        104
16        96
17        51
19        19
Name: occurence, dtype: int64

In [38]:
fraud_data=fraud_data.drop('purchase_time',axis=1)
fraud_data=fraud_data.drop('signup_time',axis=1)
fraud_data=fraud_data.drop('device_id',axis=1)
fraud_data=fraud_data.drop('ip_address',axis=1)

### Train Test Split

In [39]:
fraud_data_no_class=fraud_data
fraud_data_no_class=fraud_data_no_class.drop('class',axis=1)

In [40]:
fraud_data_no_class

Unnamed: 0,user_id,purchase_value,source,browser,sex,age,country,signup_to_purchase_seconds,occurence
0,285108,31,2,0,1,49,167.0,34860,2
1,131009,31,0,3,0,21,166.0,59040,1
2,328855,16,2,3,1,26,167.0,41340,1
3,229053,29,0,0,1,34,87.0,75180,1
4,108439,26,1,4,1,33,24.0,62100,1
5,178528,35,0,0,0,30,167.0,86280,1
6,323775,28,0,2,1,47,0.0,33900,1
7,69189,21,0,4,0,47,63.0,56640,1
8,70424,43,2,3,1,40,148.0,83640,1
9,328496,40,1,2,1,33,59.0,780,1


In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    fraud_data_no_class, fraud_data['class'], test_size=0.3, random_state=42)

In [74]:
# Count of each class type in training data
y_train.value_counts()

0    76146
1     7854
Name: class, dtype: int64

In [73]:
# percentage of data with fraud in training data
fraud_data_distribution_train = y_train.value_counts()
ratio_fraud_train = fraud_data_distribution_train[1]/len(fraud_data_no_class.index)
print("Percentage of data with fraud in training data= {} ".format(ratio_normal_fraud*100))

Percentage of data with fraud = 10.360049662022348 


In [44]:
fraud_data_distribution_test= y_test.value_counts()
ratio_fraud_test= fraud_data_distribution_test[1]/fraud_data_distribution_test[0]
print("Percentage of data with fraud = {} ".format(ratio_normal_fraud*100))

Percentage of data with fraud = 10.360049662022348 


In [45]:
fraud_data_distribution_test

0    32589
1     3411
Name: class, dtype: int64

# Models

## Random Forest

In [46]:
def get_random_forest_classifier():
    classifier_RF = RandomForestClassifier(n_estimators=500,random_state=0,n_jobs=-1)



def random_forest_fraud_prediction():
    unique, counts = np.unique(y_train, return_counts=True)

    print (np.asarray((unique, counts)).T)


    classifier_RF = RandomForestClassifier(random_state=0)

    classifier_RF.fit(X_train, y_train)
    rf_score=classifier_RF.score(X_test, y_test)
    print(rf_score)
    
    print(classifier_RF)
    
    # predict class labels 0/1 for the test set
    predicted = classifier_RF.predict(X_test)

    # generate class probabilities
    probs = classifier_RF.predict_proba(X_test)

    
    
    # generate evaluation metrics
    print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicted)))
    print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
    print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicted )))#string to int
    
    print ("confusion_matrix is: ")
    cm = confusion_matrix(y_test, predicted)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
    print(cmDF)
    print ('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
    print ('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0
    return classifier_RF


In [47]:
def random_forest_with_smote():
    smote = SMOTE(random_state=12)
    x_train_sm, y_train_sm = smote.fit_sample(X_train, y_train)

    unique, counts = np.unique(y_train_sm, return_counts=True)

    print (np.asarray((unique, counts)).T)

    classifier_RF = RandomForestClassifier(n_estimators=500,random_state=0)

    classifier_RF.fit(x_train_sm, y_train_sm)

    # predict class labels 0/1 for the test set
    predicted = classifier_RF.predict(X_test)

    # generate class probabilities
    probs = classifier_RF.predict_proba(X_test)

    # generate evaluation metrics
    print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicted)))
    print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
    print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicted )))#string to int

    print ("confusion_matrix is: ")
    cm = confusion_matrix(y_test, predicted)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
    print(cmDF)
    print ('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
    print ('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0
    return classifier_RF


In [48]:
def plot_confusion_matrix(model, X_test, y_true):
    cm = confusion_matrix(y_true, model.predict(X_test))

    print(cm)

    # Show confusion matrix in a separate window
    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar(cmap='jet')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    

In [75]:
model=random_forest_fraud_prediction()
plot_confusion_matrix(model, X_test, y_test)

[[    0 76146]
 [    1  7854]]




0.0018333333333333333
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
accuracy_score is: : 0.0018333333333333333
roc_auc_score is: : 0.18433123971385706


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [76]:
model=random_forest_with_smote()
plot_confusion_matrix(model, X_test, y_test)


[[    0 76146]
 [    1 76146]]
accuracy_score is: : 0.0025277777777777777
roc_auc_score is: : 0.1682582938943944


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

### Feature Importance - RF

In [51]:

def plot_importance(clf, feature, max_features=10):
    '''Plot feature importance'''
    feature_importance = clf.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    
    # Show only top features
    pos = pos[-max_features:]
    feature_importance = (feature_importance[sorted_idx])[-max_features:]
    feature = np.array(feature)
    feature_names = (feature[sorted_idx])[-max_features:]
    
    plt.barh(pos, feature_importance, align='center')
    plt.yticks(pos, feature_names)
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')

In [52]:
feature = X_train.columns.tolist()[:]
plot_importance(random_forest_fraud_prediction(), feature, max_features=16)




[[    0 76146]
 [    1  7854]]
0.9544722222222223
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
accuracy_score is: : 0.9544722222222223
roc_auc_score is: : 0.815668760286143
f1_score is: : 0.6916274694261525
confusion_matrix is: 
        pred_0  pred_1
true_0   32523      66
true_1    1573    1838
recall = 0.5388449135150982
precision = 0.9653361344537815


## K Nearest Neighbours

In [53]:
scores={}
scores_list=[]
for k in range(1,26):
    knn=KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,y_train)
    y_predicted=knn.predict(X_test)
    scores[k]=accuracy_score(y_test,y_predicted)
    scores_list.append(scores[k])


In [None]:
print("{}:{}".format("accuracy_score is: ", accuracy_score(y_test, y_predicted)))
print("{}:{}".format("f1_score is: ", f1_score(y_test, y_predicted )))#string to int

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, y_predicted)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print ('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print ('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0



In [54]:
plt.plot(range(1,26),scores_list)
plt.xlabel("K")
plt.ylabel("Accuracy")

Text(0, 0.5, 'Accuracy')

In [None]:
print(scores_list)

## Gradient Boost

In [55]:
def gradient_boost_fraud_prediction():
    gbc = GradientBoostingClassifier(n_estimators=500, max_depth=8, subsample=0.5, 
                                 max_features='auto', learning_rate=0.05)
    gbc.fit(X_train, y_train)
    gbc.score(X_test,y_test)
    pd.crosstab(y_test, gbc.predict(X_test))
    return gbc

In [56]:
plot_confusion_matrix(gradient_boost_fraud_prediction(), X_test, y_test)


[[32493    96]
 [ 1575  1836]]


  self.figure.tight_layout()


In [None]:
print("{}:{}".format("accuracy_score is: ", accuracy_score(y_test, y_predicted)))
print("{}:{}".format("f1_score is: ", f1_score(y_test, y_predicted )))#string to int

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, y_predicted)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print ('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print ('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0



## Logistic Regression

In [57]:
# instantiate the model (using the default parameters)
logreg = LogisticRegression(solver='lbfgs')

# fit the model with data
logreg.fit(X_train,y_train)

# predict on test
y_pred=logreg.predict(X_test)



In [58]:
cm = confusion_matrix(y_test, y_pred)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)

        pred_0  pred_1
true_0   32567      22
true_1    2667     744


In [59]:
logreg.score(X_test,y_test)
import sklearn.metrics as skm
skm.roc_auc_score(y_test, logreg.predict(X_test))

0.6087213897950738

## Isolation Forest

In [60]:
Y_dev = y_train.apply(lambda x: 1 if x == 0 else -1)
Y_eval = y_test.apply(lambda x: 1 if x == 0 else -1)
X_dev=X_train
X_eval=X_test
# Initiate the isolation forest class and specify the percentage of anomalous samples in the development dataset
IF = IsolationForest(max_samples="auto", contamination=0.095)

# Define the parameter grid to search over
param_grid = {"n_estimators": [100, 150, 200, 250, 300, 350, 400, 450, 500]}

# Define the grid search object
clf = GridSearchCV(IF, 
                   param_grid=param_grid, 
                   cv=5, 
                   n_jobs=10,
                   scoring="f1",
                   verbose=10)

# Perform grid search
_ = clf.fit(X_dev, Y_dev)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:   13.2s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:   29.8s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:   53.5s
[Parallel(n_jobs=10)]: Done  31 out of  45 | elapsed:  1.4min remaining:   38.8s
[Parallel(n_jobs=10)]: Done  36 out of  45 | elapsed:  1.7min remaining:   25.5s
[Parallel(n_jobs=10)]: Done  41 out of  45 | elapsed:  1.9min remaining:   11.3s
[Parallel(n_jobs=10)]: Done  45 out of  45 | elapsed:  1.9min finished


In [61]:
IF_best = clf.best_estimator_
Y_predicted = IF_best.predict(X_eval)
roc_auc = roc_auc_score(Y_eval, Y_predicted)

print(classification_report(Y_eval, 
                            Y_predicted,
                            target_names=["anomalous", "normal"]))
print("Area under ROC curve: {:0.3f}".format(roc_auc))

# Compute ROC curve and area under the curve
IF_probs = IF_best.decision_function(X_eval)
fpr, tpr, thresholds = roc_curve(Y_eval, IF_probs)

plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.3f)'%(roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.title('Receiver Operating Characteristic (ROC) curve', fontsize=12)
plt.legend(loc="lower right", frameon = True).get_frame().set_edgecolor('black')
plt.grid(True, linestyle = 'dotted')
plt.show()



              precision    recall  f1-score   support

   anomalous       0.53      0.54      0.54      3411
      normal       0.95      0.95      0.95     32589

    accuracy                           0.91     36000
   macro avg       0.74      0.74      0.74     36000
weighted avg       0.91      0.91      0.91     36000

Area under ROC curve: 0.744


In [62]:
smote = SMOTE(random_state=12)
X_dev1, y_train_sm1 = smote.fit_sample(X_train, y_train)

unique, counts = np.unique(y_train_sm1, return_counts=True)

print (np.asarray((unique, counts)).T)

y_train_sm1[y_train_sm1!=0]=-1
y_train_sm1[y_train_sm1!=-1]=1
y_test[y_test!=0]=-1
y_test[y_test!=-1]=1
Y_dev1 = y_train_sm1
Y_eval1 = y_test
# Y_dev1 = y_train_sm1.apply(lambda x: 1 if x == 0 else -1)
# Y_eval1 = y_test.apply(lambda x: 1 if x == 0 else -1)
X_eva1l=X_test
# Initiate the isolation forest class and specify the percentage of anomalous samples in the development dataset

# Define the parameter grid to search over
param_grid = {"n_estimators": [100, 150, 200, 250, 300, 350, 400, 450, 500]}

# Define the grid search object
clf = GridSearchCV(IF, 
                   param_grid=param_grid, 
                   cv=5, 
                   n_jobs=10,
                   scoring="f1",
                   verbose=10)

# Perform grid search
_ = clf.fit(X_dev1, Y_dev1)

[[    0 76146]
 [    1 76146]]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   5 tasks      | elapsed:   15.8s
[Parallel(n_jobs=10)]: Done  12 tasks      | elapsed:   47.0s
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done  31 out of  45 | elapsed:  2.7min remaining:  1.2min
[Parallel(n_jobs=10)]: Done  36 out of  45 | elapsed:  3.2min remaining:   48.5s
[Parallel(n_jobs=10)]: Done  41 out of  45 | elapsed:  3.7min remaining:   21.5s
[Parallel(n_jobs=10)]: Done  45 out of  45 | elapsed:  3.7min finished


In [63]:
IF_best = clf.best_estimator_
Y_predicted = IF_best.predict(X_eval)
roc_auc = roc_auc_score(Y_eval, Y_predicted)

print(classification_report(Y_eval, 
                            Y_predicted,
                            target_names=["anomalous", "normal"]))
print("Area under ROC curve: {:0.3f}".format(roc_auc))

# Compute ROC curve and area under the curve
IF_probs = IF_best.decision_function(X_eval)
fpr, tpr, thresholds = roc_curve(Y_eval, IF_probs)

plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.3f)'%(roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize=15)
plt.ylabel('True Positive Rate', fontsize=15)
plt.title('Receiver Operating Characteristic (ROC) curve', fontsize=12)
plt.legend(loc="lower right", frameon = True).get_frame().set_edgecolor('black')
plt.grid(True, linestyle = 'dotted')
plt.show()



              precision    recall  f1-score   support

   anomalous       0.22      0.20      0.21      3411
      normal       0.92      0.92      0.92     32589

    accuracy                           0.86     36000
   macro avg       0.57      0.56      0.56     36000
weighted avg       0.85      0.86      0.85     36000

Area under ROC curve: 0.562


## Grid Search

In [65]:
rf_params = {
    'max_depth': [4, 8, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [2, 4],
    'bootstrap': [True], # Mandatory with oob_score=True
    'n_estimators': [50, 100, 200, 400, 500],
    'random_state': [67],
    'oob_score': [True],
    'n_jobs': [-1]
    }
def optimize_hyperparameters(model, params, X_train, y_train):
    gs = GridSearchCV(model(), params, cv=5, n_jobs=-1, verbose=True, scoring='roc_auc')
    gs.fit(X_train, y_train)
    print (gs.best_params_)

    #print best parameters
# optimize_hyperparameters(RandomForestClassifier, rf_params, X_train, y_train)