<a href="https://colab.research.google.com/github/divinelof/21120404_dissertation/blob/main/NB4_EccomerceRobustness(NoiseRate%3D1_).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SETUP**

In [None]:
# Mount Google Drive at /content/drive; the dataset and prediction CSVs used
# below live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Libraries**

In [None]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import math
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning
# and library deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

# **UTILS**

In [None]:
class FEATURES:
  """Shared experiment configuration: the global seed and the LightGBM parameters."""

  # Single seed reused by the seeded components of the notebook.
  SEED = 2022

  # Parameter dict passed as the first argument of lgb.train.
  LGBMModelParameter = dict(
      objective='binary',
      boosting_type='gbdt',
      metric='auc',
      n_jobs=-1,
      learning_rate=0.01,
      num_leaves=2**8,
      max_depth=-1,
      tree_learner='serial',
      colsample_bytree=0.5,
      subsample_freq=1,
      scale_pos_weight=5,
      subsample=0.7,
      n_estimators=800,
      max_bin=255,
      verbose=-1,
      seed=SEED,
      early_stopping_rounds=100,
  )

In [None]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    :seed value applied to all three      # type: int
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    
def _narrowest_dtype(c_min, c_max, candidates, limits_of, fallback=None):
    """Return the first dtype in `candidates` whose [min, max] range contains [c_min, c_max].

    `limits_of` is np.iinfo or np.finfo. `fallback` is returned when nothing fits
    (e.g. the bounds are NaN or infinite, so every comparison is False).
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value exactly at a dtype's limit still fits in it.
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return fallback


## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` to the narrowest dtype that holds their range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    re-assigned on `df` itself; all other columns are left untouched.

    :df       frame to shrink (modified in place and also returned)
    :verbose  when True, print the final memory usage and the percentage saved
    :return   the same `df` object

    NOTE: float columns are narrowed by value range only, so casting to
    float16/float32 can lose precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_ladder = (np.int8, np.int16, np.int32, np.int64)
    float_ladder = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # No fallback: an integer column that fits nothing keeps its dtype.
            target = _narrowest_dtype(c_min, c_max, int_ladder, np.iinfo)
        else:
            target = _narrowest_dtype(c_min, c_max, float_ladder, np.finfo,
                                      fallback=np.float64)
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with the shared
# experiment seed before any data is shuffled or any model is trained.
seed_everything(seed=FEATURES.SEED)

# **Dataset**

In [None]:
# Folder on the mounted Drive holding the input CSVs; the prediction CSVs are
# written back to the same folder further down.
data_path = '/content/drive/MyDrive/EcommerceFraudDetection/'

train = pd.read_csv(data_path + "FinalTrain.csv")
validation = pd.read_csv(data_path + "FinalValidation.csv")

In [None]:
# Replace every missing value in the validation frame with 0.
# NOTE(review): this fills all columns (including isFraud, if it has gaps), and
# `train` is not filled anywhere in the visible cells — confirm this is intended.
validation = validation.fillna(0) 

In [None]:
# ------------------------------------------------------------------------
# DISTRIBUTION OF THE ACTUAL DATASET
# ------------------------------------------------------------------------
# Class counts of the isFraud label before any label noise is injected.

train["isFraud"].value_counts()

0.0    151966
1.0     45590
Name: isFraud, dtype: int64

In [None]:
# ------------------------------------------------------------------------
# ADDING 1% NOISE RATE (1976 Transactions)
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------


# ------------------------------------------------------------------------
# LABEL SWAPPING FUNCTION
# ------------------------------------------------------------------------

def interchangeLabel(x):
  """Flip a binary label: 0 becomes 1, any other value becomes 0."""
  return 1 if x == 0 else 0

# ------------------------------------------------------------------------
# SHUFFLING DATASET
# ------------------------------------------------------------------------

# Shuffle the rows. No random_state is passed, so the order is drawn from
# NumPy's global RNG (seeded by seed_everything when the cells run in order).
train = train.sample(frac=1).reset_index(drop=True)

# Flip the labels of the first NOISE_RATE share of the shuffled rows
# (1% of the 197556 training rows = 1976 transactions).
# A single .loc assignment replaces the chained `train.isFraud[:1976] = ...`,
# which triggers SettingWithCopyWarning and leaves `train` unchanged under
# pandas Copy-on-Write.
NOISE_RATE = 0.01
noisy_rows = train.index[:int(round(NOISE_RATE * len(train)))]
train.loc[noisy_rows, "isFraud"] = train.loc[noisy_rows, "isFraud"].map(interchangeLabel)

In [None]:
# ------------------------------------------------------------------------
# DISTRIBUTION AFTER INTRODUCING NOISE
# ------------------------------------------------------------------------
# Class counts of the isFraud label once the first 1976 shuffled rows were flipped.

train["isFraud"].value_counts()

0.0    150878
1.0     46678
Name: isFraud, dtype: int64

# **Splitting Train data**

In [None]:
# Stratified 80/20 hold-out split over row positions of the (noisy) training frame.
train_idx, test_idx = train_test_split(
    range(len(train)),
    test_size=0.2,
    random_state=FEATURES.SEED,
    stratify=train.isFraud,
)

train_df, test_df = train.iloc[train_idx, :], train.iloc[test_idx, :]

# Separate the features from the isFraud target for each part.
X_train, y_train = train_df.drop(columns=["isFraud"]), train_df["isFraud"]
X_test, y_test = test_df.drop(columns=["isFraud"]), test_df["isFraud"]

# **SUPPORT VECTOR MACHINE (BASELINE MODEL)**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features for the SVM: the scaler is fitted on the training
# features only and the same fitted scaler is applied to the validation features.
scaler = StandardScaler()
train_svm = scaler.fit_transform(train.drop(columns=["isFraud"]))
val_svm = scaler.transform(validation.drop(columns=["isFraud"]))

In [None]:
# "Support vector classifier"  
# RBF-kernel SVC fitted on the standardised training features and noisy labels.
SVMMODEL = SVC(random_state=FEATURES.SEED, kernel='rbf')
SVMMODEL.fit(train_svm, train.isFraud)

In [None]:
# Predict on the standardised validation features and save the result to Drive.
# NOTE(review): `predict` yields class labels, whereas the Random Forest and
# XGBoost cells save positive-class probabilities — confirm downstream scoring
# expects labels here.
valPredSVM = SVMMODEL.predict(val_svm)
svm_pred_path = f"{data_path}valPredSVM_(1% Noise Rate).csv"
pd.Series(valPredSVM).to_csv(svm_pred_path, index=False)

# **LGBM**

In [None]:
# Wrap the 80/20 split in LightGBM datasets: the 80% part trains the booster,
# the 20% part is passed as a validation set.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# `verbose_eval` was removed from lgb.train in LightGBM 4.0 and now raises a
# TypeError; the log_evaluation callback logs the metrics every 200 rounds instead.
LGBMModel = lgb.train(
    FEATURES.LGBMModelParameter,
    train_data,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.log_evaluation(period=200)],
)

# Score the validation frame (features only) and save the predictions to Drive.
valPredLGB = LGBMModel.predict(validation.drop(["isFraud"], axis = 1))
pd.Series(valPredLGB).to_csv(f"{data_path}valPredLGB_(1% Noise Rate).csv", index = False)

# **RANDOM FOREST**

In [None]:
# Random forest trained on the full noisy training frame.
# random_state is pinned to the shared seed, as the SVC cell already does, so
# the forest does not depend on how much of NumPy's global RNG earlier cells consumed.
RandomForestModel = RandomForestClassifier(criterion='entropy', max_features='sqrt',
                                             max_samples=0.5, min_samples_split=80,
                                             random_state=FEATURES.SEED)

RandomForestModel.fit(train.drop(["isFraud"], axis = 1), train["isFraud"])

# Keep the probability of the positive class (column 1) and save it to Drive.
valPredRF = RandomForestModel.predict_proba(validation.drop(["isFraud"], axis = 1))[:,1]
pd.Series(valPredRF).to_csv(f"{data_path}valPredRF_(1% Noise Rate).csv", index = False)

# **XGBOOST**

In [None]:
# Hyper-parameters of the XGBoost classifier, gathered in one place.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=2000,
    max_depth=12,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.4,
    missing=-1,
    eval_metric=['auc', 'logloss'],
    nthread=4,
    scale_pos_weight=5,
    tree_method='hist',
)
XGBoostModel = xgb.XGBClassifier(**xgb_params)

# Fit on the full noisy training frame (features only vs. the isFraud target).
XGBoostModel.fit(train.drop(columns=["isFraud"]), train["isFraud"])

# Keep the probability of the positive class (column 1) and save it to Drive.
valPredXGB = XGBoostModel.predict_proba(validation.drop(columns=["isFraud"]))[:, 1]
xgb_pred_path = f"{data_path}valPredXGB_(1% Noise Rate).csv"
pd.Series(valPredXGB).to_csv(xgb_pred_path, index=False)

In [None]:
# ROC AUC of the XGBoost positive-class probabilities against the validation labels.
roc_auc_score(validation["isFraud"],valPredXGB)

0.9659635426520911