In [132]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import timedelta
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
ip_df = pd.read_csv('IpAddress_to_Country.csv')

In [6]:
fraud_df = pd.read_csv('Fraud_Data.csv')

In [10]:
ip_df['upper_lower'] = list(zip(ip_df.loc[:,'lower_bound_ip_address'], ip_df.loc[:, 'upper_bound_ip_address']))

In [12]:
dict_df = ip_df.loc[:, ['country', 'upper_lower']]

Map IP addresses to the corresponding country in a dictionary

In [227]:
d = dict_df.groupby('upper_lower')['country'].apply(list).to_dict()


#### Create a function to extract countries from ip addresses using ip address dictionary

In [166]:
upper_bounds=[(v[0], k[1]) for k,v in d.items()]
def get_country(x):
    for c,ip in upper_bounds:
        if x<=ip: return c  # didn't end up using

### Feature Engineering

#### Find the countries for all the IP addresses listed

In [101]:
v = ip_df.loc[:, 'lower_bound_ip_address':'upper_bound_ip_address'].apply(tuple, 1).tolist()
idx = pd.IntervalIndex.from_tuples(v, closed='both')
fraud_df['country'] = ip_df.loc[idx.get_indexer(fraud_df['ip_address'].values), 'country'].values

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


In [108]:
fraud_df = fraud_df.merge(pd.get_dummies(fraud_df.source), left_index=True, right_index=True)

In [152]:
fraud_df = fraud_df.merge(pd.get_dummies(fraud_df.browser), left_index=True, right_index=True)

In [111]:
fraud_df['sex'] = pd.Series(np.where(fraud_df.sex.values == 'F', 1, 0),
          fraud_df.index)

In [114]:
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])

In [115]:
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

#### Find difference in signup time and purchase time

In [118]:
fraud_df['time_to_purchase'] = fraud_df['purchase_time'] -fraud_df['signup_time']

In [133]:
fraud_df.time_to_purchase = fraud_df.time_to_purchase.apply(lambda x: x / timedelta (days=1))

In [175]:
percent_fraud_df = pd.DataFrame(fraud_df.groupby(['country'])['class'].sum()/fraud_df.groupby(['country'])['class'].count())

#### Find countries with fraud percent above 20

In [185]:
fraud_countries = list(percent_fraud_df.loc[percent_fraud_df['class'] > 0.2].index)

In [186]:
def possible_fraud(country):
    if country in fraud_countries:
        return 1
    else:
        return 0

In [187]:
fraud_df['possible_fraud'] = fraud_df['country'].apply(lambda x: possible_fraud(x))

In [230]:
fraud_df.isna().sum()  #see if there are any missing values

user_id                 0
signup_time             0
purchase_time           0
purchase_value          0
device_id               0
source                  0
browser                 0
sex                     0
age                     0
ip_address              0
class                   0
country             21966
Ads                     0
Direct                  0
SEO                     0
time_to_purchase        0
Chrome                  0
FireFox                 0
IE                      0
Opera                   0
Safari                  0
possible_fraud          0
dtype: int64

#### Create X and y matrices for model

In [249]:
date_to_split = dt.datetime(2015, 7, 31) #split on this date for the train/test split

In [241]:
len(fraud_df)*0.1

15111.2

In [248]:
fraud_df[fraud_df.signup_time >= date_to_split]

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,Ads,Direct,SEO,time_to_purchase,Chrome,FireFox,IE,Opera,Safari,possible_fraud
6,50116,2015-08-01 22:40:52,2015-08-27 03:37:57,11,IWKVZHJOCLPUR,Ads,Chrome,1,19,3.987484e+09,...,1,0,0,25.206308,1,0,0,0,0,0
21,316355,2015-08-08 20:05:50,2015-10-03 02:21:46,47,IHGECNGZGQOAZ,SEO,Chrome,0,28,3.836794e+09,...,0,0,1,55.261065,1,0,0,0,0,0
31,396746,2015-08-05 23:50:35,2015-08-14 12:08:55,56,JUWCOHMHRBDCL,SEO,IE,0,56,2.937614e+09,...,0,0,1,8.512731,0,0,1,0,0,0
37,174879,2015-08-12 20:24:19,2015-09-09 03:39:58,56,ONLMVTYBMQQWN,Ads,FireFox,0,30,2.871722e+09,...,1,0,0,27.302535,0,1,0,0,0,0
40,81113,2015-08-16 12:56:45,2015-08-24 05:00:54,52,BKQVBSSFGETUQ,Direct,Chrome,1,44,3.682312e+09,...,0,1,0,7.669549,1,0,0,0,0,0
45,313833,2015-08-16 20:48:52,2015-11-17 09:43:11,12,GDATAKTYQUGWR,Ads,IE,0,55,5.690384e+08,...,1,0,0,92.537720,0,0,1,0,0,0
131,44224,2015-08-13 17:21:02,2015-11-15 18:25:45,86,VJZWVXGOUWMZP,Ads,Chrome,1,35,1.744239e+08,...,1,0,0,94.044942,1,0,0,0,0,0
136,18857,2015-08-13 22:48:43,2015-09-16 18:50:28,36,NRAWJXXBNCJSM,Ads,Chrome,0,41,3.638273e+09,...,1,0,0,33.834549,1,0,0,0,0,0
157,20668,2015-08-06 04:20:12,2015-11-10 20:39:06,62,NMISUIDPWOSEU,Direct,Chrome,0,33,1.921877e+09,...,0,1,0,96.679792,1,0,0,0,0,0
158,272165,2015-08-05 23:38:51,2015-12-01 22:00:05,20,YAYLYLSKUKZDT,SEO,FireFox,1,36,2.477876e+09,...,0,0,1,117.931412,0,1,0,0,0,0


In [250]:
X = fraud_df.loc[:, ['purchase_value', 'sex', 'age', 'Ads', 'Direct', 'SEO', 'time_to_purchase', 'Chrome',
       'FireFox', 'IE', 'Opera', 'Safari', 'possible_fraud']]

In [251]:
y = fraud_df.loc[:, ['class']]

In [255]:
df_train = fraud_df[fraud_df.signup_time <= date_to_split]

In [256]:
df_test = fraud_df[fraud_df.signup_time >= date_to_split]

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [277]:
X_train2 = df_train.loc[:, ['purchase_value', 'sex', 'age', 'Ads', 'Direct', 'SEO', 'time_to_purchase', 'Chrome',
       'FireFox', 'IE', 'Opera', 'Safari']] #dropped potential fraud due to data leakage concerns

In [268]:
y_train2 = df_train.loc[:, ['class']]

In [283]:
X_test2 = df_test.loc[:, ['purchase_value', 'sex', 'age', 'Ads', 'Direct', 'SEO', 'time_to_purchase', 'Chrome',
       'FireFox', 'IE', 'Opera', 'Safari']] #dropped potential fraud due to data leakage concerns

In [284]:
y_test2 = df_test.loc[:, ['class']]

In [271]:
rf_classifier = RandomForestClassifier(max_depth=10, n_estimators=1000)

In [272]:
rf_classifier.fit(X_train, y_train)

ValueError: could not convert string to float: 'United States'

In [219]:
preds = rf_classifier.predict_proba(X_test)

In [220]:
pos_preds = preds[:,1]

In [221]:
y_test.sum()

class    3607
dtype: int64

In [222]:
y_test.sum()/len(y_test)

class    0.095479
dtype: float64

In [223]:
rf_classifier.score(X_test, y_test)  #accuracy

0.9566679019535179

In [224]:
log_loss(y_test, pos_preds)

0.17882469971613849

In [279]:
rf_classifier.fit(X_train2, y_train2)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [285]:
preds2 = rf_classifier.predict_proba(X_test2)

In [286]:
rf_classifier.score(X_test2, y_test2)

0.9544016868740116

In [287]:
log_loss(y_test2, preds2)

0.1854550485034003