# Fraud Analysis Classification

## Import Dependencies and load data

In [1]:
#Import necessary dependencies
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from re import sub
from decimal import Decimal
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

fraudInstanceData = pd.read_csv("FraudInstanceData.csv", header=0, index_col=0)

## Treat categorical data

In [2]:
maritalStatuses = pd.get_dummies(fraudInstanceData["Marital Status"])
accomodationTypes = pd.get_dummies(fraudInstanceData["Accomodation Type"])
fraudInstanceData = fraudInstanceData.drop('Marital Status', axis=1)
fraudInstanceData = fraudInstanceData.drop('Accomodation Type', axis=1)
fraudInstanceData = fraudInstanceData.join(maritalStatuses)
fraudInstanceData = fraudInstanceData.join(accomodationTypes)

## Convert currency to numerical data

In [3]:
currencyToMoney = lambda c: Decimal(sub(r'[^\d.]', '', c))
fraudInstanceData['Claim Amount'] = fraudInstanceData["Claim Amount"].apply(currencyToMoney)

## Split training and testing data

In [4]:
y = fraudInstanceData.iloc[:, 1]
X = fraudInstanceData.iloc[:, 1:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

## Create the pipeline

In [5]:
feature_selection = SelectFromModel(LassoCV())
pipeline = Pipeline([('feature_selection', feature_selection),
                     ('regression', LogisticRegression())])
grid_cv = GridSearchCV(pipeline, {}, cv=10)

_ = grid_cv.fit(X_train, y_train)

fs = grid_cv.best_estimator_.named_steps['feature_selection']
selected_features = X_train.columns[fs.get_support()]

The following features were selected by the Lasso selector:

In [6]:
selected_features

Index(['Damaged Item'], dtype='object')

## Do the test data predictions and calculates the score

In [7]:
y_pred = grid_cv.predict(X_test)
print(grid_cv.score(X_test, y_pred))

1.0


## Displays the confusion matrix

In [8]:
print(confusion_matrix(y_test, y_pred))

[[439   0]
 [  0 866]]


The source code can be found at: [https://github.com/fredcaram/FraudInstance]