In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

In [4]:
# Load the preprocessed dataset used for training and evaluation.
df = pd.read_csv(filepath_or_buffer='preprocessed_data_Assignment_2.csv')

In [5]:
# Split the frame into target and features by column name, so the result does
# not depend on 'y' happening to be the last column of the CSV (a positional
# slice would leak the target into the features if the column order changed).
y = df['y']
x = df.drop(columns='y')

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the support column of the classification report) — confirm this is intended.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=20,
)

In [7]:
# Fit the normalizer on the training split only, then apply it to both splits.
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm,
# not each feature (column) — confirm that is the scaling wanted here.
n = Normalizer()
n.fit(xtrain)
x_train, x_test = n.transform(xtrain), n.transform(xtest)

In [8]:
# Oversample the minority class on the training split only (the test split is
# left untouched). SMOTE draws synthetic samples at random, so it is seeded to
# make the resampled training set — and every metric computed from the model
# fitted on it — repeatable across kernel restarts.
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(x_train, ytrain)

In [9]:
# Base learners for the ensemble; seeds are fixed on the estimators that take one.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(
    n_estimators=40,
    criterion='gini',
    max_depth=10,
    min_samples_split=15,
    min_samples_leaf=12,
    max_features='log2',
    random_state=1,
)
clf3 = GaussianNB()
# NOTE(review): CategoricalNB is documented for integer-encoded categorical
# features, but the ensemble is fitted on Normalizer output (floats) — confirm
# this learner contributes a meaningful vote.
clf4 = CategoricalNB()
clf5 = DecisionTreeClassifier(random_state=1)

# Majority ('hard') vote over the class labels predicted by the five learners.
eclf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('gnb', clf3),
        ('cnb', clf4),
        ('dtc', clf5),
    ],
    voting='hard',
)

In [10]:
# Train the voting ensemble on the oversampled training data.
eclf.fit(X=X_resampled, y=y_resampled)


VotingClassifier(estimators=[('lr', LogisticRegression(random_state=1)),
                             ('rf',
                              RandomForestClassifier(max_depth=10,
                                                     max_features='log2',
                                                     min_samples_leaf=12,
                                                     min_samples_split=15,
                                                     n_estimators=40,
                                                     random_state=1)),
                             ('gnb', GaussianNB()), ('cnb', CategoricalNB()),
                             ('dtc', DecisionTreeClassifier(random_state=1))])

In [11]:
# Class-label predictions for the held-out (normalized, not resampled) test split.
ypred = eclf.predict(X=x_test)

In [12]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.96      0.82      0.89      5845
           1       0.35      0.74      0.48       745

    accuracy                           0.81      6590
   macro avg       0.66      0.78      0.68      6590
weighted avg       0.89      0.81      0.84      6590



In [13]:
# NOTE(review): the score is computed from hard class labels (ypred), not from
# probabilities or decision scores, so it reflects a single operating point
# rather than a full ROC curve — confirm this is the metric intended.
roc_auc_score(y_true=ytest, y_score=ypred)

0.7837881284411043

In [14]:
# Hand-entered feature values for a single example row. They are kept as
# numeric strings; the frame built from them is cast with astype('int64')
# before it is passed to the normalizer.
age, job, marital, education = '15', '2', '1', '5'
default, housing, loan = '1', '1', '0'
contact, month, day_of_week = '1', '10', '5'
duration, campaign, poutcome = '240', '3', '1'

In [15]:
# One-row frame (index 0) holding the example values, one column per feature.
# Assumes this column order matches the feature columns used for training — TODO confirm.
df1 = pd.DataFrame(
    dict(
        age=age,
        job=job,
        marital=marital,
        education=education,
        default=default,
        housing=housing,
        loan=loan,
        contact=contact,
        month=month,
        day_of_week=day_of_week,
        duration=duration,
        campaign=campaign,
        poutcome=poutcome,
    ),
    index=[0],
)

In [18]:
import pickle

In [22]:
# Persist the fitted voting ensemble to disk.
with open('model_pkl', mode='wb') as model_file:
    pickle.dump(eclf, model_file)

In [20]:
# Reload the saved ensemble from disk.
# Only unpickle files you produced or trust: pickle.load can execute arbitrary code.
with open('model_pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

In [23]:
# Persist the fitted normalizer so the same preprocessing can be reapplied at inference time.
with open('normalizer_pkl', mode='wb') as normalizer_file:
    pickle.dump(n, normalizer_file)

In [21]:
# Reload the saved normalizer from disk.
# Only unpickle files you produced or trust: pickle.load can execute arbitrary code.
with open('normalizer_pkl', mode='rb') as normalizer_file:
    norm = pickle.load(normalizer_file)

In [22]:
# Cast the example row to integers, then apply the reloaded normalizer to it.
var = norm.transform(X=df1.astype(dtype='int64'))

array([[0.06228787, 0.00830505, 0.00415252, 0.02076262, 0.00415252,
        0.00415252, 0.        , 0.00415252, 0.04152525, 0.02076262,
        0.9966059 , 0.01245757, 0.00415252]])

In [23]:
# Predicted class label for the example row (first element of the prediction array).
model.predict(X=var)[0]

array([0], dtype=int64)