# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Importing Dataset

In [2]:
dataset = pd.read_csv("/content/Fraud_check.csv")

In [3]:
dataset.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [4]:
dataset.tail(20)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
580,YES,Divorced,31721,123335,1,YES
581,YES,Divorced,31085,57473,10,YES
582,YES,Married,84931,32907,12,YES
583,NO,Married,69036,28874,19,NO
584,NO,Divorced,42679,155684,4,YES
585,NO,Single,84557,137622,2,YES
586,YES,Single,54126,106230,30,NO
587,NO,Single,87685,154677,26,YES
588,YES,Married,85076,93242,27,YES
589,YES,Married,31633,30774,10,NO


In [5]:
dataset['Taxable.Income']=dataset['Taxable.Income'].apply(lambda x:'Good' if x >=30000 else 'Risky')

In [6]:
dataset

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,Good,39492,7,YES
596,YES,Divorced,Good,55369,2,YES
597,NO,Divorced,Good,154058,0,YES
598,YES,Married,Good,180083,17,NO


In [7]:
dataset.tail(20)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
580,YES,Divorced,Good,123335,1,YES
581,YES,Divorced,Good,57473,10,YES
582,YES,Married,Good,32907,12,YES
583,NO,Married,Good,28874,19,NO
584,NO,Divorced,Good,155684,4,YES
585,NO,Single,Good,137622,2,YES
586,YES,Single,Good,106230,30,NO
587,NO,Single,Good,154677,26,YES
588,YES,Married,Good,93242,27,YES
589,YES,Married,Good,30774,10,NO


In [8]:
print(dataset['Marital.Status'].unique())
print(dataset['Urban'].unique())
print(dataset['Taxable.Income'].unique())

['Single' 'Divorced' 'Married']
['YES' 'NO']
['Good' 'Risky']


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    object
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(2), object(4)
memory usage: 28.2+ KB


#Encoding Categorical Data

In [10]:
#give no. to the Taxable income name.
order={'Taxable.Income':{'Risky':0,'Good':1}}
dataset=dataset.replace(order)

In [11]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [12]:
dataset=pd.get_dummies(dataset,columns=['Marital.Status','Urban'],drop_first=True)

In [13]:
dataset=pd.get_dummies(dataset,columns=['Undergrad'],drop_first=True)

In [18]:
dataset

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Marital.Status_Married,Marital.Status_Single,Urban_YES,Undergrad_YES
0,1,50047,10,0,1,1,0
1,1,134075,18,0,0,1,1
2,1,160205,30,1,0,1,0
3,1,193264,15,0,1,1,1
4,1,27533,28,1,0,0,0
...,...,...,...,...,...,...,...
595,1,39492,7,0,0,1,1
596,1,55369,2,0,0,1,1
597,1,154058,0,0,0,1,0
598,1,180083,17,1,0,0,1


In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Taxable.Income          600 non-null    int64
 1   City.Population         600 non-null    int64
 2   Work.Experience         600 non-null    int64
 3   Marital.Status_Married  600 non-null    uint8
 4   Marital.Status_Single   600 non-null    uint8
 5   Urban_YES               600 non-null    uint8
 6   Undergrad_YES           600 non-null    uint8
dtypes: int64(3), uint8(4)
memory usage: 16.5 KB


In [20]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Taxable.Income          600 non-null    int64
 1   City.Population         600 non-null    int64
 2   Work.Experience         600 non-null    int64
 3   Marital.Status_Married  600 non-null    uint8
 4   Marital.Status_Single   600 non-null    uint8
 5   Urban_YES               600 non-null    uint8
 6   Undergrad_YES           600 non-null    uint8
dtypes: int64(3), uint8(4)
memory usage: 16.5 KB


# Independent and Dependent Variables

In [32]:
y=dataset.iloc[:,0].values
X=dataset.iloc[:,1:7].values

In [19]:
#X1 = dataset.iloc[:,0:2].values
#X2 = dataset.iloc[:,3:6].values
#X=np.concatenate((X1,X2),axis=1)
#y = dataset.iloc[:, 2].values

# Splitting the Dataset

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25, random_state = 0)

In [34]:
X_train

array([[137346,     27,      1,      0,      1,      1],
       [ 69386,     21,      1,      0,      1,      1],
       [110892,     18,      0,      1,      1,      1],
       ...,
       [157299,     11,      0,      1,      0,      1],
       [126556,     10,      1,      0,      1,      1],
       [180424,     26,      0,      0,      0,      1]])

In [35]:
y_train

array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,

# Feature Scaling

In [36]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

# MACHINE LEARNING ALGORITHM

DECISION TREE

In [38]:
#Fitting the model
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [39]:
#Predict my test results
y_pred = classifier.predict(X_test)

In [57]:
classifier.score(X_train,y_train)

1.0

In [58]:
classifier.score(X_test,y_test)

0.6133333333333333

In [40]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[ 9, 28],
       [30, 83]])

### Model is 61.3% Accurate

#Validation

In [48]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth':[2,4,8,10,None],
    'max_features':[0.25,0.5,1.0],
    'min_samples_split':[0.25,0.5,1.0]
}

In [49]:
classifier2 = GridSearchCV(DecisionTreeClassifier(),param_grid=param_grid)

In [50]:
classifier2.fit(X_train,y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 4, 8, 10, None],
                         'max_features': [0.25, 0.5, 1.0],
                         'min_samples_split': [0.25, 0.5, 1.0]})

In [52]:
classifier2.best_params_

{'max_depth': 4, 'max_features': 0.25, 'min_samples_split': 0.5}

In [53]:
dtc2=DecisionTreeClassifier(max_depth=4,max_features=0.25,min_samples_split=0.5)
dtc2.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=4, max_features=0.25, min_samples_split=0.5)

In [55]:
dtc2.score(X_train,y_train)

0.8066666666666666

In [56]:
dtc2.score(X_test,y_test)

0.7533333333333333

In [59]:
from prettytable import PrettyTable
scoreTable =PrettyTable(['Model','train_score','test_score'])
scoreTable.add_row(['MOdel1','1.0','0.6133'])
scoreTable.add_row(['Tuned','0.8066','0.7533'])

print(scoreTable)

+--------+-------------+------------+
| Model  | train_score | test_score |
+--------+-------------+------------+
| MOdel1 |     1.0     |   0.6133   |
| Tuned  |    0.8066   |   0.7533   |
+--------+-------------+------------+


In model 1 gap between the train score and test is very high on the other hand in Tuned this very less ,so, it  definely will give good prediction as compared to without Tuned. 