In [132]:
import numpy as np
import pandas as pd
import datetime as dt
from datetime import timedelta
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [2]:
ip_df = pd.read_csv('IpAddress_to_Country.csv')

In [6]:
fraud_df = pd.read_csv('Fraud_Data.csv')

In [7]:
fraud_df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [55]:
fraud_df.isna().sum()  #see if there are any missing values

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [10]:
ip_df['upper_lower'] = list(zip(ip_df.loc[:,'lower_bound_ip_address'], ip_df.loc[:, 'upper_bound_ip_address']))

In [12]:
dict_df = ip_df.loc[:, ['country', 'upper_lower']]

In [None]:
Map IP addresses to the corresponding country in a dictionary

In [79]:
d = dict_df.groupby('upper_lower')['country'].apply(list).to_dict()
d

{(16777216.0, 16777471): ['Australia'],
 (16777472.0, 16777727): ['China'],
 (16777728.0, 16778239): ['China'],
 (16778240.0, 16779263): ['Australia'],
 (16779264.0, 16781311): ['China'],
 (16781312.0, 16785407): ['Japan'],
 (16785408.0, 16793599): ['China'],
 (16793600.0, 16809983): ['Japan'],
 (16809984.0, 16842751): ['Thailand'],
 (16842752.0, 16843007): ['China'],
 (16843008.0, 16843263): ['Australia'],
 (16843264.0, 16843775): ['China'],
 (16843776.0, 16844799): ['China'],
 (16844800.0, 16846847): ['China'],
 (16846848.0, 16850943): ['China'],
 (16850944.0, 16859135): ['China'],
 (16859136.0, 16875519): ['Japan'],
 (16875520.0, 16908287): ['Thailand'],
 (16908288.0, 16908799): ['China'],
 (16908800.0, 16909055): ['China'],
 (16909056.0, 16909311): ['Australia'],
 (16909312.0, 16909567): ['China'],
 (16909568.0, 16909823): ['China'],
 (16909824.0, 16910335): ['China'],
 (16910336.0, 16910591): ['China'],
 (16910592.0, 16910847): ['China'],
 (16910848.0, 16911359): ['China'],
 (1691

#### Create a function to extract countries from ip addresses using ip address dictionary

In [166]:
upper_bounds=[(v[0], k[1]) for k,v in d.items()]
def get_country(x):
    for c,ip in upper_bounds:
        if x<=ip: return c  # didn't end up using

### Feature Engineering

#### Find the countries for all the IP addresses listed

In [101]:
v = ip_df.loc[:, 'lower_bound_ip_address':'upper_bound_ip_address'].apply(tuple, 1).tolist()
idx = pd.IntervalIndex.from_tuples(v, closed='both')
fraud_df['country'] = ip_df.loc[idx.get_indexer(fraud_df['ip_address'].values), 'country'].values

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


In [108]:
fraud_df = fraud_df.merge(pd.get_dummies(fraud_df.source), left_index=True, right_index=True)

In [152]:
fraud_df = fraud_df.merge(pd.get_dummies(fraud_df.browser), left_index=True, right_index=True)

In [111]:
fraud_df['sex'] = pd.Series(np.where(fraud_df.sex.values == 'F', 1, 0),
          fraud_df.index)

In [114]:
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])

In [115]:
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

#### Find difference in signup time and purchase time

In [118]:
fraud_df['time_to_purchase'] = fraud_df['purchase_time'] -fraud_df['signup_time']

In [133]:
fraud_df.time_to_purchase = fraud_df.time_to_purchase.apply(lambda x: x / timedelta (days=1))

In [175]:
percent_fraud_df = pd.DataFrame(fraud_df.groupby(['country'])['class'].sum()/fraud_df.groupby(['country'])['class'].count())

#### Find countries with fraud percent above 20

In [185]:
fraud_countries = list(percent_fraud_df.loc[percent_fraud_df['class'] > 0.2].index)

In [186]:
def possible_fraud(country):
    if country in fraud_countries:
        return 1
    else:
        return 0

In [187]:
fraud_df['possible_fraud'] = fraud_df['country'].apply(lambda x: possible_fraud(x))

#### Create X and y matrices for model

In [226]:
fraud_df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,Ads,Direct,SEO,time_to_purchase,Chrome,FireFox,IE,Opera,Safari,possible_fraud
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,0,39,732758400.0,...,0,0,1,52.160671,1,0,0,0,0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,1,53,350311400.0,...,1,0,0,0.207685,1,0,0,0,0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,0,53,2621474000.0,...,0,0,1,1.2e-05,0,0,0,1,0,0
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,0,41,3840542000.0,...,0,0,1,5.695428,0,0,0,0,1,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,0,45,415583100.0,...,1,0,0,50.479873,0,0,0,0,1,0


In [189]:
X = fraud_df.loc[:, ['purchase_value', 'sex', 'age', 'Ads', 'Direct', 'SEO', 'time_to_purchase', 'Chrome',
       'FireFox', 'IE', 'Opera', 'Safari', 'possible_fraud']]

In [190]:
y = fraud_df.loc[:, ['class']]

In [200]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [217]:
rf_classifier = RandomForestClassifier(max_depth=10, n_estimators=1000)

In [218]:
rf_classifier.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [219]:
preds = rf_classifier.predict_proba(X_test)

In [220]:
pos_preds = preds[:,1]

In [221]:
y_test.sum()

class    3607
dtype: int64

In [222]:
y_test.sum()/len(y_test)

class    0.095479
dtype: float64

In [223]:
rf_classifier.score(X_test, y_test)  #accuracy

0.9566679019535179

In [224]:
log_loss(y_test, pos_preds)

0.17882469971613849