# **Traditional Models**

According to our literature review, we decided to explore a variety of traditional ML algorithms to assist us in our problem.

**Goal:** Find the best hyperparameters and train the following models:
1. Logistic Regression
2. KNN
3. Support Vector Classifier (SVC)
4. Decision Trees
5. Random Forest

6. XGBoost will be done separately at the end, since it requires custom data loading which is a different process than the other models.

### Imports

In [1]:
import joblib

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Metrics
from sklearn.metrics import make_scorer, confusion_matrix, classification_report

In [4]:
# Tools
from sklearn.model_selection import train_test_split, GridSearchCV

In [5]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC

In [6]:
TRAIN_PATH = 'data/train_data.csv'
TEST_PATH = 'data/test_data.csv'

In [7]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Retrieve Target Variable from .csv File

In [8]:
X_train = train.drop(columns=['isFraud'], axis=1)
y_train = train['isFraud']

In [9]:
X_test = test.drop(columns=['isFraud'], axis=1)
y_test = test['isFraud']

### Sample 5% of Training Data (Dataset is too large)

In [10]:
X_train, X_unused, y_train, y_unused = train_test_split(X_train, y_train, test_size=0.95, stratify=y_train, random_state=777)

### Calculate Class Weights

In [11]:

class_counts = test['isFraud'].value_counts()
total_count = class_counts.sum()

# Calculate inverse weight (total count / class count)
weights = []
for i in range(2):
    weights.append(total_count/class_counts.loc[i])

# Get the total weight
total = pd.DataFrame(weights).sum()

# Create the class weighting dictionary
class_weighting = {}
for i in range(2):
    class_weighting[i] = weights[i]/total

print ( class_weighting )

{0: 0    0.001291
dtype: float64, 1: 0    0.998709
dtype: float64}


### Define Models and Hyperparameter Search Range

In [None]:
models = {


    #"Logistic Regression": {
    #    "instance": LogisticRegression(penalty='l2',
    #                                   solver='lbfgs',
    #                                   class_weight=class_weighting
    #                                   ),
    #    "params": {
    #        "C": [0.001, 0.01, 0.1, 1, 10],
    #        'max_iter': [100, 200, 300],
    #        }
    #    },
    #    
    #    "K-Nearest Neighbors": {
    #    "instance": KNeighborsClassifier(n_jobs=-1),
    #    "params": {
    #        "n_neighbors": list(range(5, 50, 2)),
    #        "weights": ["uniform", "distance"]
    #        }
    #    },

    #"Decision Tree": {
    #    "instance": DecisionTreeClassifier(class_weight=class_weighting),
    #    "params": {
    #        "max_depth": [10, 20, 30, 40],
    #        "min_samples_split": [2, 5, 10]
    #        }
    #    },

    "Random Forest": {
        "instance": RandomForestClassifier(n_jobs=-1,
                                           class_weight=class_weighting),
        "params": {
            "n_estimators": [50, 100],
            "max_depth": [None, 10, 20]
            }
        },
    



        
    #"Support Vector Classifier": {
    #    "instance": SVC(kernel='linear',
    #                    class_weight=class_weighting),
    #    "params": {
    #        "C": [0.01]
    #        }
    #    },


    }

### Perform Grid Search
Find the best parameters for each of the five models and then save best model info, like estimator, params, and score to a file for later use.

In [13]:

for model_name, model_info in models.items():

        # Set up grid search
        grid_search = GridSearchCV(estimator=model_info['instance'],
                                   param_grid=model_info['params'],
                                   verbose=2,
                                   n_jobs=-1,
                                   cv=5
                                   )

        # Fit search to training data
        grid_search.fit(X_train, y_train)

        # Retrieve the best scores and params
        model_data = {
        'best estimator': grid_search.best_estimator_,
        'best params': grid_search.best_params_,
        'best score': grid_search.best_score_
       }
        
        joblib.dump(model_data, 'models/' + model_name + '.pkl')
        





Fitting 5 folds for each of 12 candidates, totalling 60 fits


  weight[i] = class_weight[c]


Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 