# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import MinMaxScaler,StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,cross_val_predict, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix,roc_curve,auc,f1_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



# Model Prep

In [4]:
ROOT_FOLDER_PATH = '/home/jupyter/data/ieee-fraud-detection'
TRAINING_TRANSACTION_PATH = f"{ROOT_FOLDER_PATH}/train_transaction.csv"
TRAINING_IDENTITY_PATH = f"{ROOT_FOLDER_PATH}/train_identity.csv"
CORR_PATH = f"{ROOT_FOLDER_PATH}/corr_imputer.csv"


In [5]:
trans_source_df = pd.read_csv(TRAINING_TRANSACTION_PATH)
identity_source_df = pd.read_csv(TRAINING_IDENTITY_PATH)


In [6]:
source_df=trans_source_df.merge(identity_source_df, on='TransactionID', how='left')

In [5]:
for col in source_df.columns:
    print(col,'NaN values present is:',source_df[col].isnull().sum(),end=" ")
    print('type is :',source_df[col].dtypes,end=" ")
    print('% of NaN values:',np.round((source_df[col].isnull().sum()/source_df.shape[0])*100,2))

TransactionID NaN values present is: 0 type is : int64 % of NaN values: 0.0
isFraud NaN values present is: 0 type is : int64 % of NaN values: 0.0
TransactionDT NaN values present is: 0 type is : int64 % of NaN values: 0.0
TransactionAmt NaN values present is: 0 type is : float64 % of NaN values: 0.0
ProductCD NaN values present is: 0 type is : object % of NaN values: 0.0
card1 NaN values present is: 0 type is : int64 % of NaN values: 0.0
card2 NaN values present is: 8933 type is : float64 % of NaN values: 1.51
card3 NaN values present is: 1565 type is : float64 % of NaN values: 0.27
card4 NaN values present is: 1577 type is : object % of NaN values: 0.27
card5 NaN values present is: 4259 type is : float64 % of NaN values: 0.72
card6 NaN values present is: 1571 type is : object % of NaN values: 0.27
addr1 NaN values present is: 65706 type is : float64 % of NaN values: 11.13
addr2 NaN values present is: 65706 type is : float64 % of NaN values: 11.13
dist1 NaN values present is: 352271 ty

In [7]:
raw_features_df = source_df[['TransactionDT', 'TransactionAmt', 'card6', 'isFraud', 'P_emaildomain', 'R_emaildomain']].sort_values('TransactionDT')


In [8]:
raw_features_df = raw_features_df[raw_features_df['TransactionAmt'] < 5000]


In [9]:
raw_features_df['LogTransactionAmt'] = np.log2(raw_features_df['TransactionAmt'])

In [10]:
raw_features_df[['P_organization', 'P_domain']] = raw_features_df['P_emaildomain'].str.split(".", n = 1, expand=True)
raw_features_df[['R_organization', 'R_domain']] = raw_features_df['R_emaildomain'].str.split(".", n = 1, expand=True)

In [11]:
raw_features_df['P_organization'].replace(['None', None], value=np.nan, inplace=True)
raw_features_df['P_domain'].replace(['None', None], value=np.nan, inplace=True)
raw_features_df['R_organization'].replace(['None', None], value=np.nan, inplace=True)
raw_features_df['R_domain'].replace(['None', None], value=np.nan, inplace=True)

In [13]:
raw_features_df['R_domain'].unique()

array([nan, 'com', 'net', 'net.mx', 'com.mx', 'es', 'de', 'edu', 'fr',
       'co.uk', 'rr.com', 'co.jp'], dtype=object)

In [14]:
X_train = raw_features_df[['LogTransactionAmt', 'card6', 'P_organization', 'P_domain', 'R_organization', 'R_domain']][0:400000]
Y_train = raw_features_df[['isFraud']][0:400000].values.flatten()

X_test = raw_features_df[['LogTransactionAmt', 'card6', 'P_organization', 'P_domain', 'R_organization', 'R_domain']][400001:]
Y_test = raw_features_df[['isFraud']][400001:].values.flatten()

In [15]:
categorical_features=['card6','P_organization', 'P_domain']
categorical_transformer = Pipeline(steps=[
    ('categorical_imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))])


In [16]:
numerical_features=['LogTransactionAmt']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [26]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=0, class_weight='balanced', max_depth=5))])

In [27]:
cross_validator = StratifiedShuffleSplit(2, test_size=0.3, random_state=0)
gridSearch = GridSearchCV(clf, param_grid=dict(), cv=cross_validator)

In [34]:
model = gridSearch.fit(X_train,Y_train)

In [35]:
model.best_score_

0.7328541666666666

In [36]:
y_pred = model.predict(X_test)

In [37]:
f1_score(Y_test, y_pred, average='weighted')

0.8356334078210603

In [38]:
roc_auc_score(Y_test, y_pred, average='weighted')

0.6221620484297186

In [39]:
pd.DataFrame(y_pred, columns=['is_fraud']).groupby('is_fraud').size()

is_fraud
0    145415
1     45106
dtype: int64

In [40]:
pd.DataFrame(Y_test, columns=['is_fraud']).groupby('is_fraud').size()

is_fraud
0    183762
1      6759
dtype: int64

# Test Data Prep

In [85]:
TEST_TRANSACTION_PATH = f"{ROOT_FOLDER_PATH}/test_transaction.csv"
test_source_df=pd.read_csv(TEST_TRANSACTION_PATH)

raw_features_test_df = test_source_df[['TransactionDT', 'card6', 'TransactionAmt', 'P_emaildomain', 'R_emaildomain']]
raw_features_test_df['LogTransactionAmt'] = np.log2(raw_features_test_df['TransactionAmt'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [89]:
raw_features_test_df[['P_organization', 'P_domain']] = raw_features_test_df['P_emaildomain'].str.split(".", n = 1, expand=True)
raw_features_test_df[['R_organization', 'R_domain']] = raw_features_test_df['R_emaildomain'].str.split(".", n = 1, expand=True)


In [90]:
raw_features_test_df['P_organization'].replace('None', value=np.nan, inplace=True)
raw_features_test_df['P_domain'].replace('None', value=np.nan, inplace=True)
raw_features_test_df['R_organization'].replace('None', value=np.nan, inplace=True)
raw_features_test_df['R_domain'].replace('None', value=np.nan, inplace=True)


In [91]:
X_test_final = raw_features_test_df[['LogTransactionAmt', 'card6', 'P_organization', 'P_domain', 'R_organization', 'R_domain']]
y_preds_test = model.predict(X_test_final)

In [92]:
test_source_df['TransactionID'].shape

(506691,)

In [93]:
raw_features_test_df.shape

(506691, 10)

In [94]:
y_preds_test.shape

(506691,)

In [95]:
raw_features_test_df.shape

(506691, 10)

In [96]:
output = pd.DataFrame({'TransactionID': test_source_df['TransactionID'], 'isFraud': y_preds_test})

In [97]:
output.to_csv(f"{ROOT_FOLDER_PATH}/output.csv", index=False)