In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append("/home/jovyan/sz/practice/take_home_challenges/")
import helper_f
%matplotlib inline
from datetime import datetime

## Goals
1. Find out the country of each user
2. Build an ML model to predict fraud
3. Explain which variables differentiate the two groups

In [None]:
# Read both input tables from the working directory:
# the IP-range lookup table first, then the user/transaction table.
country, user = (
    pd.read_csv("IpAddress_to_Country.csv"),
    pd.read_csv("Fraud_Data.csv"),
)

In [None]:
# Basic exploration of the user table.
# 1. Report the share of missing values in every column, as a percentage.
for col in user.columns:
    # isnull().mean() is the fraction of null rows; scale it by 100 so the
    # printed number actually matches the "%" sign in the message.
    null_pct = 100 * user[col].isnull().mean()
    print(col, " has: {}% null values".format(null_pct))

#Good to know that no column has null values; next we map each ip_address to its country

### Map ip_address to country
Some IP addresses do not fall inside any range of the lookup table; in that case the country is left as an empty string.

In [None]:
# Map every ip_address to the country whose [lower_bound, upper_bound] range contains it.
# Both bounds are treated as inclusive, so an address equal to a range boundary is
# matched too. Addresses that fall inside no range get '' as their country.
#
# The ranges are sorted once and each address is located with a binary search,
# instead of scanning the whole lookup table for every single user row.
ip_ranges = country.sort_values('lower_bound_ip_address')
range_lower = np.asarray(ip_ranges['lower_bound_ip_address'])
range_upper = np.asarray(ip_ranges['upper_bound_ip_address'])
range_country = np.asarray(ip_ranges['country'], dtype=object)

# The binary search is only valid when the sorted ranges do not overlap;
# fail loudly rather than silently assigning a wrong country.
assert (range_lower[1:] > range_upper[:-1]).all(), \
    "IP ranges overlap; the binary-search lookup would be ambiguous"

ip_values = np.asarray(user['ip_address'])
# Position of the last range whose lower bound is <= the address
# (-1 means the address is below every range).
candidate = np.searchsorted(range_lower, ip_values, side='right') - 1
# Clip so the -1 entries can still be used as an index; `found` masks them out.
safe_idx = candidate.clip(min=0)
found = (candidate >= 0) & (ip_values <= range_upper[safe_idx])
user['country'] = np.where(found, range_country[safe_idx], '')

In [None]:
# Create a new feature: purchase_duration = purchase_time - signup_time, in whole days.
# pd.to_datetime parses each column in one vectorized call and leaves an
# already-parsed datetime column unchanged, so this cell can be re-run safely
# (strptime would raise on values that are no longer strings).
for time_col in ('signup_time', 'purchase_time'):
    user[time_col] = pd.to_datetime(user[time_col], format="%Y-%m-%d %H:%M:%S")

# .dt.days is the whole-day component of each timedelta, i.e. the same value the
# per-row `timedelta.days` lookup produced, without a Python-level loop over rows.
user['purchase_duration'] = (user['purchase_time'] - user['signup_time']).dt.days

In [None]:
# Show the first three rows of the table as it stands after the steps above.
user.head(n=3)

In [None]:
# Report how many distinct values each column holds.
# dropna=False counts a missing value as one distinct value, like len(.unique()).
for col, n_unique in user.nunique(dropna=False).items():
    print("{} has {} unique values".format(col, n_unique))

In [None]:
# Summary statistics of the table, with describe()'s default quartiles spelled out.
user.describe(percentiles=[0.25, 0.5, 0.75])

### Exploratory data analysis with graphs using helper_f

We can see that these three variables have a very balanced distribution across the classes of the target variable.

In [None]:
# Plot the three categorical features against the 'class' label with the project
# helper (presumably one panel per column — confirm against helper_f.plot_freq).
helper_f.plot_freq(
    df=user,
    label='class',
    columns=['sex', 'source', 'browser'],
    ylabel='fraud rate',
    rotation=True,
)

1. **purchase value**: for the non-fraud transactions, most purchases fall between 10 and 50<br>
2. **age**: most fraudulent activity involves users aged between 30 and 40

In [None]:
# Plot the numeric features against the 'class' label with the project helper
# (exact chart type is defined in helper_f.plot_continuous — confirm there).
helper_f.plot_continuous(
    df=user,
    label='class',
    columns=['purchase_value', 'age', 'purchase_duration'],
)

In [None]:
# Plot the target column 'class' of `user` with the project helper
# (presumably its class balance — confirm against helper_f.plot_target).
helper_f.plot_target('class',user)

### Build a machine learning model for prediction

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Drop the columns that are not used as model features.
# Rebinding `user` (instead of inplace=True) keeps the step explicit.
to_drop = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
user = user.drop(to_drop, axis=1)

# Bucket purchase duration into three equal-width bins.
# labels=False makes pd.cut return the bin index itself (0 = short, 1 = medium,
# 2 = long), so the codes keep the natural order. Running LabelEncoder over the
# text labels would sort them alphabetically (long < medium < short) and
# scramble that order.
user['purchase_duration'] = pd.cut(x=user['purchase_duration'], bins=3, labels=False)

# Encode the remaining (unordered) string columns as integer codes.
# `le` is refit for every column, so afterwards it only remembers the last one.
to_encode = ['source', 'browser', 'sex']
for col in to_encode:
    user[col] = le.fit_transform(user[col])

In [None]:
# One-hot encode the country column.
# pop() removes 'country' from `user` in place and hands the column straight to
# get_dummies; the indicator columns are then appended on the right.
dummy_country = pd.get_dummies(user.pop('country'))
user = pd.concat([user, dummy_country], axis=1)

In [None]:
# Hand the feature matrix (every column except 'class') and the label vector
# to the project helper as plain NumPy arrays.
helper_f.plot_classifier(
    user.drop('class', axis=1).values,
    user['class'].values,
)