In [1]:
import pandas as pd

In [2]:
# Loading the required data 
fdf = pd.read_csv('Fraud_check.csv')
fdf.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [3]:
# creating risk column based on taxable income value 
fdf['risk']=fdf['Taxable.Income']
fdf.loc[fdf['risk']<30000, 'risk'] = 1
fdf.loc[fdf['risk']>30000, 'risk'] = 0

In [4]:
# dropping the city population column has no affect on outcome

fdf = fdf.drop(['City.Population'], axis = 1)
fdf.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,Work.Experience,Urban,risk
0,NO,Single,68833,10,YES,0
1,YES,Divorced,33700,18,YES,0
2,NO,Married,36925,30,YES,0
3,YES,Single,50190,15,YES,0
4,NO,Married,81002,28,NO,0


In [5]:
# labeling the necessary  data where ever required
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

fdf['Undergrad']= label_encoder.fit_transform(fdf['Undergrad']) 
fdf['Marital.Status']= label_encoder.fit_transform(fdf['Marital.Status'])
fdf['Urban']= label_encoder.fit_transform(fdf['Urban'])

fdf.head()


Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,Work.Experience,Urban,risk
0,0,2,68833,10,1,0
1,1,0,33700,18,1,0
2,0,1,36925,30,1,0
3,1,2,50190,15,1,0
4,0,1,81002,28,0,0


In [6]:
# As the taxable income alredy labelled to another column we can drop this
df = fdf.drop(['Taxable.Income'], axis=1)

In [7]:
# Checking for risk column data distribution which will be used for balancing the data in the later part 
from collections import Counter
counter = Counter(df.risk)
counter

Counter({0: 476, 1: 124})

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Undergrad        600 non-null    int32
 1   Marital.Status   600 non-null    int32
 2   Work.Experience  600 non-null    int64
 3   Urban            600 non-null    int32
 4   risk             600 non-null    int64
dtypes: int32(3), int64(2)
memory usage: 16.5 KB


In [9]:
df.head()

Unnamed: 0,Undergrad,Marital.Status,Work.Experience,Urban,risk
0,0,2,10,1,0
1,1,0,18,1,0
2,0,1,30,1,0
3,1,2,15,1,0
4,0,1,28,0,0


In [10]:
# Assigning the values to x and y variables
array = df.values
X = array[:,:-1]
Y = array[:,-1]

In [11]:
# Splitting data into training and testing data set
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,Y, test_size=0.2,random_state=7)

In [30]:
# For balancing the data using SMOTE technic import smote
#pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

oversample = SMOTE(sampling_strategy = 'minority')
X_o , Y_o = oversample.fit_resample(X_train, y_train)


In [31]:
# Checking the balance of the training data which will be used in the later part
Counter(Y_o)

Counter({0: 385, 1: 385})

In [16]:
# Building model using bagging classifier 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


kfold = KFold(n_splits=10)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=7)
results = cross_val_score(model, X,Y ,cv=kfold)
print("The Accuracy using Decision Tree Classifier with Bagging:", results.mean())

The Accuracy using Decision Tree Classifier with Bagging: 0.7


In [None]:
# the efficiency we got is 0.7 

In [17]:
# Builiding the model with Random Forest

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

num_trees = 100
max_features = 5
kfold = KFold(n_splits=10)
model == RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, X, Y, cv=kfold)
print("The Accuracy using Decision Tree Classifier with Random Forest:", results.mean())

The Accuracy using Decision Tree Classifier with Random Forest: 0.7


In [None]:
# Here also the efficieny is 0.7

In [38]:
# building the model with adaboosting classifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

num_trees = 10
kfold = KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print("The Accuracy using Decision Tree Classifier with AdaBoostong:", results.mean())


The Accuracy using Decision Tree Classifier with AdaBoostong: 0.7933333333333332


In [None]:
# The efficiency we got here is 0.793

In [62]:
# using ensemple technic building model
# here we are using Decision Tree, SVM and Random  Forest classifiers

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier


kfold = KFold(n_splits=10)

# create the sub models
estimators = []
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = AdaBoostClassifier(n_estimators=10, random_state=7)
estimators.append(('svm', model3))
model4 = RandomForestClassifier(n_estimators=100, max_features = 2)
estimators.append(('RFM', model4))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

0.7066666666666668


In [None]:
# here we got an average accuracy of .706

In [None]:
# now lets fit the ensemble model and check with test data

In [68]:
ensemble.fit( X_train, y_train)

VotingClassifier(estimators=[('cart', DecisionTreeClassifier()),
                             ('svm',
                              AdaBoostClassifier(n_estimators=10,
                                                 random_state=7)),
                             ('RFM', RandomForestClassifier(max_features=2))])

In [69]:
# predicting using test data
predict = ensemble.predict(X_test)

In [70]:
# cross tabbing for finding out accuracy
pd.crosstab(y_test, predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,83,8
1,27,2


In [71]:
# we got an accuracy of .708. But it is an imbalanced data let us check for f1 score
import numpy as np
np.mean(predict==y_test)

0.7083333333333334

In [72]:
# The f1 score is very low
from sklearn.metrics import f1_score
f1_score(y_test, predict, zero_division=1)

0.10256410256410257

In [None]:
# let us now try ensemble model with train data as earlier we built with complete data

In [56]:
# using x_train and y_train
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier


kfold = KFold(n_splits=10)

# create the sub models
estimators = []
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = AdaBoostClassifier(n_estimators=10, random_state=7)
estimators.append(('svm', model3))
model4 = RandomForestClassifier(n_estimators=100, max_features = 2)
estimators.append(('RFM', model4))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X_train, y_train, cv=kfold)
print(results.mean())

0.71875


In [73]:
ensemble.fit( X_train, y_train)

VotingClassifier(estimators=[('cart', DecisionTreeClassifier()),
                             ('svm',
                              AdaBoostClassifier(n_estimators=10,
                                                 random_state=7)),
                             ('RFM', RandomForestClassifier(max_features=2))])

In [74]:
predict = ensemble.predict(X_test)

In [75]:
pd.crosstab(y_test, predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,83,8
1,27,2


In [76]:
import numpy as np
np.mean(predict==y_test)

0.7083333333333334

In [77]:
from sklearn.metrics import f1_score
f1_score(y_test, predict, zero_division=1)

0.10256410256410257

In [None]:
# here also the f1 score is very low 

In [None]:
# let us try the ensemple model with balance data

In [78]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier


kfold = KFold(n_splits=10)

# create the sub models
estimators = []
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = AdaBoostClassifier(n_estimators=10, random_state=7)
estimators.append(('svm', model3))
model4 = RandomForestClassifier(n_estimators=100, max_features = 2)
estimators.append(('RFM', model4))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X_o, Y_o, cv=kfold)
print(results.mean())

0.6584415584415584


In [79]:
ensemble.fit( X_o, Y_o)

VotingClassifier(estimators=[('cart', DecisionTreeClassifier()),
                             ('svm',
                              AdaBoostClassifier(n_estimators=10,
                                                 random_state=7)),
                             ('RFM', RandomForestClassifier(max_features=2))])

In [80]:
predict = ensemble.predict(X_test)

In [81]:
pd.crosstab(y_test, predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,63,28
1,20,9


In [82]:
import numpy as np
np.mean(predict==y_test)

0.6

In [83]:
from sklearn.metrics import f1_score
f1_score(y_test, predict, zero_division=1)

0.27272727272727276

In [None]:
# here the f1 score is slighly increased

In [None]:
# now try the ensemple model with data balanced using under sampling technic

In [86]:
# converting to bslsnced data using under sampling

from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')
X_u, Y_u = undersample.fit_resample(X_train,y_train)

In [None]:
# now fit the ensemple model with  balanced data using under sampling

In [87]:

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier


kfold = KFold(n_splits=10)

# create the sub models
estimators = []
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = AdaBoostClassifier(n_estimators=10, random_state=7)
estimators.append(('svm', model3))
model4 = RandomForestClassifier(n_estimators=100, max_features = 2)
estimators.append(('RFM', model4))

# create the ensemble model
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X_u, Y_u, cv=kfold)
print(results.mean())

0.41052631578947363


In [89]:
ensemble.fit( X_u, Y_u)

VotingClassifier(estimators=[('cart', DecisionTreeClassifier()),
                             ('svm',
                              AdaBoostClassifier(n_estimators=10,
                                                 random_state=7)),
                             ('RFM', RandomForestClassifier(max_features=2))])

In [90]:
predict = ensemble.predict(X_test)

In [91]:
pd.crosstab(y_test, predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,50,41
1,15,14


In [92]:
import numpy as np
np.mean(predict==y_test)

0.5333333333333333

In [93]:
from sklearn.metrics import f1_score
f1_score(y_test, predict, zero_division=1)

0.33333333333333337

In [None]:
# Here we observe the overall accuracy  reduced but f1 score got increased

In [None]:
# Now by just using random forest let us try build the model

In [16]:
# Builiding the model with Random Forest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, max_features=2)

In [17]:
# fitting the model with train data

In [24]:
model.fit(X_train, y_train)

RandomForestClassifier(max_features=2)

In [25]:
# predicting for test data
y_predict = model.predict(X_test)

In [26]:
pd.crosstab(y_test, y_predict)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,82,9
1,26,3


In [27]:
import numpy as np
np.mean(y_predict==y_test)

0.7083333333333334

In [None]:
# like wise we can try other models with different scenarios

In [None]:
# ----x-----