### Problem Statement: Use a random forest to build a model on the fraud data, treating records with taxable_income <= 30000 as "Risky" and all others as "Good".

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
#from sklearn import datasets  
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn import preprocessing

In [5]:
# Read the fraud-check records from the CSV next to the notebook and preview them.
data = pd.read_csv(filepath_or_buffer='Fraud_check.csv')
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [6]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [7]:
# Show the column labels of the loaded frame.
data.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban'],
      dtype='object')

In [8]:
# Turn the income column into the class label: values at or below 30000 become
# 'Risky', everything else 'Good'. The column is overwritten in place, so after
# this runs it holds strings and the original incomes are only available by
# reloading `data`.
data['Taxable.Income'] = np.where(data['Taxable.Income'].le(30000), 'Risky', 'Good')

In [9]:
# Preview the frame now that 'Taxable.Income' holds the 'Risky'/'Good' labels.
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,Good,39492,7,YES
596,YES,Divorced,Good,55369,2,YES
597,NO,Divorced,Good,154058,0,YES
598,YES,Married,Good,180083,17,NO


In [10]:
# Encode the categorical columns as integer codes. LabelEncoder numbers the
# distinct values in sorted order, so the target becomes 'Good' -> 0, 'Risky' -> 1.
#
# 'Work.Experience' is already numeric and is deliberately NOT encoded:
# label-encoding a numeric column replaces each value with its rank among the
# distinct values, which silently alters the feature whenever some value in the
# range does not occur in the data.
label_encoder = preprocessing.LabelEncoder()
for column in ['Undergrad', 'Marital.Status', 'Taxable.Income', 'Urban']:
    # fit_transform refits the encoder on every call, so one instance is reused.
    data[column] = label_encoder.fit_transform(data[column])

In [11]:
# Preview the frame after the categorical columns were replaced by integer codes.
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,0,50047,10,1
1,1,0,0,134075,18,1
2,0,1,0,160205,30,1
3,1,2,0,193264,15,1
4,0,1,0,27533,28,0
...,...,...,...,...,...,...
595,1,0,0,39492,7,1
596,1,0,0,55369,2,1
597,0,0,0,154058,0,1
598,1,1,0,180083,17,0


In [12]:
# Pull the frame's contents out as a plain 2-D NumPy array and display it.
array = data.to_numpy()
array

array([[     0,      2,      0,  50047,     10,      1],
       [     1,      0,      0, 134075,     18,      1],
       [     0,      1,      0, 160205,     30,      1],
       ...,
       [     0,      0,      0, 154058,      0,      1],
       [     1,      1,      0, 180083,     17,      0],
       [     0,      0,      0, 158137,     16,      0]], dtype=int64)

In [13]:
# Feature matrix: columns 0, 1, 3, 4 and 5 of `array`, i.e. everything except
# column 2, which is used as the target.
X = np.take(array, [0, 1, 3, 4, 5], axis=1)

In [14]:
# Target vector: column 2 of `array` (the encoded 'Taxable.Income' label).
Y = array[:,2]

In [15]:
# Display the feature matrix.
X

array([[     0,      2,  50047,     10,      1],
       [     1,      0, 134075,     18,      1],
       [     0,      1, 160205,     30,      1],
       ...,
       [     0,      0, 154058,      0,      1],
       [     1,      1, 180083,     17,      0],
       [     0,      0, 158137,     16,      0]], dtype=int64)

In [16]:
# Display the target vector.
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
# Random Forest Classification
# Fit a 100-tree forest on the full X, Y. oob_score=True keeps an out-of-bag
# accuracy estimate, and max_features=3 is the number of features considered
# when searching for each split.
# random_state is fixed (7, the seed the cross-validation cells also use) so the
# fitted trees, the OOB score and the predictions are the same on every re-run.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, oob_score=True, max_features=3, random_state=7)
model.fit(X, Y)

RandomForestClassifier(max_features=3, oob_score=True)

In [46]:
# Accuracy
# The score is computed on the same X, Y the forest was fitted on, so this is
# training-set accuracy rather than a held-out estimate.
accuracy = model.score(X, Y)
print(100 * accuracy)

100.0


In [47]:
# List the individual fitted decision trees that make up the forest.
model.estimators_

[DecisionTreeClassifier(max_features=3, random_state=476743167),
 DecisionTreeClassifier(max_features=3, random_state=1624307107),
 DecisionTreeClassifier(max_features=3, random_state=724382014),
 DecisionTreeClassifier(max_features=3, random_state=1058918115),
 DecisionTreeClassifier(max_features=3, random_state=544050649),
 DecisionTreeClassifier(max_features=3, random_state=1153648316),
 DecisionTreeClassifier(max_features=3, random_state=580371395),
 DecisionTreeClassifier(max_features=3, random_state=1523007509),
 DecisionTreeClassifier(max_features=3, random_state=391054320),
 DecisionTreeClassifier(max_features=3, random_state=687441171),
 DecisionTreeClassifier(max_features=3, random_state=997261276),
 DecisionTreeClassifier(max_features=3, random_state=846060088),
 DecisionTreeClassifier(max_features=3, random_state=2005507509),
 DecisionTreeClassifier(max_features=3, random_state=340255129),
 DecisionTreeClassifier(max_features=3, random_state=1629157261),
 DecisionTreeClassi

In [48]:
# Out-of-bag accuracy estimate; available because the forest was built with oob_score=True.
model.oob_score_

0.7466666666666667

In [49]:
# Class labels the fitted model knows about (the encoded target values).
model.classes_

array([0, 1], dtype=int64)

In [50]:
# Attach the model's predictions for X to the frame as 'model_pred', then show
# them next to the true labels.
data['model_pred'] = model.predict(X)
data.loc[:, ['Taxable.Income', 'model_pred']]

Unnamed: 0,Taxable.Income,model_pred
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
595,0,0
596,0,0
597,0,0
598,0,0


In [51]:
# Confusion table: true labels as rows, predicted labels as columns.
pd.crosstab(index=data['Taxable.Income'], columns=data['model_pred'])

model_pred,0,1
Taxable.Income,Unnamed: 1_level_1,Unnamed: 2_level_1
0,476,0
1,0,124


In [52]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [53]:
# Bagging
# 10-fold cross-validated accuracy of 500 bagged decision trees.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
cart = DecisionTreeClassifier()
num_trees = 500
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, so the positional form runs on both older and newer releases.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=7)
results1 = cross_val_score(model, X, Y, cv=kfold)
# Mean accuracy across the folds, as a percentage; the per-fold scores follow.
print(results1.mean()*100)
results1

74.66666666666666


array([0.73333333, 0.7       , 0.83333333, 0.83333333, 0.73333333,
       0.7       , 0.73333333, 0.65      , 0.78333333, 0.76666667])

In [54]:
# AdaBoost Classification
# 10-fold cross-validated accuracy of an AdaBoost ensemble with n_estimators=100,
# using a shuffled KFold seeded with 7.
num_trees = 100
kfold = KFold(shuffle=True, n_splits=10, random_state=7)
model = AdaBoostClassifier(random_state=7, n_estimators=num_trees)
results2 = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
# Mean accuracy across the folds, as a percentage; the per-fold scores follow.
print(100 * results2.mean())
results2

77.66666666666666


array([0.75      , 0.73333333, 0.83333333, 0.88333333, 0.75      ,
       0.76666667, 0.81666667, 0.63333333, 0.83333333, 0.76666667])