In [351]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [38]:
#import datasets
dataset = pd.read_csv("trainingset.csv")
queries = pd.read_csv("queries.csv")

In [352]:
#Create feature headers
dataset.columns = ["ID","Age","Job","MartialStatus","Education","Default","Balance","Housing","Loan","Contact","Day","Month","Duration","Campaign","Pdays","Previous","Poutcome","output"]
queries.columns = ["ID","Age","Job","MartialStatus","Education","Default","Balance","Housing","Loan","Contact","Day","Month","Duration","Campaign","Pdays","Previous","Poutcome","output"]

In [40]:
dataset.head()
queries.head()

Unnamed: 0,ID,Age,Job,MartialStatus,Education,Default,Balance,Housing,Loan,Contact,Day,Month,Duration,Campaign,Pdays,Previous,Poutcome,output
0,TEST2,30,JobCat3,single,primary,no,23,yes,yes,unknown,5,may,0,1,-1,0,unknown,?
1,TEST3,53,JobCat9,married,secondary,no,-3,no,no,unknown,5,may,0,1,-1,0,unknown,?
2,TEST4,36,JobCat6,single,tertiary,no,424,yes,no,unknown,5,may,0,1,-1,0,unknown,?
3,TEST5,53,JobCat9,married,secondary,no,384,yes,no,unknown,5,may,0,1,-1,0,unknown,?
4,TEST6,28,JobCat9,married,secondary,no,152,yes,yes,unknown,5,may,0,2,-1,0,unknown,?


In [202]:
#Create contonuous features
cont = dataset.filter(['Age','Balance','Day','Duration', 'Campaign','Previous','Pdays','output'])
queries_cont = dataset.filter(['Age','Balance','Day','Duration', 'Campaign','Previous','Pdays'])

In [203]:
#convert binary values to continuous values
cont["Poutcome"] = pd.factorize(dataset['Poutcome'])[0]
cont["Housing"] = pd.factorize(dataset['Housing'])[0]
cont["Loan"] = pd.factorize(dataset['Loan'])[0]
queries_cont["Poutcome"] = pd.factorize(dataset['Poutcome'])[0]
queries_cont["Housing"] = pd.factorize(dataset['Housing'])[0]
queries_cont["Loan"] = pd.factorize(dataset['Loan'])[0]

In [204]:
queries_cont

Unnamed: 0,Age,Balance,Day,Duration,Campaign,Previous,Pdays,Poutcome,Housing,Loan
0,31,2,5,0,1,0,-1,0,0,0
1,42,2,5,0,1,0,-1,0,0,1
2,58,121,5,0,1,0,-1,0,0,1
3,43,593,5,0,1,0,-1,0,0,1
4,57,162,5,0,1,0,-1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
24312,36,557,16,0,4,0,-1,0,0,1
24313,53,583,17,0,1,4,184,3,1,1
24314,23,505,17,0,2,0,-1,0,1,0
24315,51,825,17,0,3,0,-1,0,1,1


In [520]:
#Convert output to a continuous feature
Y = pd.factorize(cont['output'])[0]

In [521]:
#create X values from dataframes while dropping output for Y value
X = cont.drop(['output'], axis=1)
X2 = queries_cont.drop(['output'], axis=1)

In [522]:
#Shape of Dataframes
Y.shape,X.shape

((24317,), (24317, 10))

In [523]:
#Create training/testing set with an 80-20 split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [524]:
#Shape of training dataset
X_train.shape, Y_train.shape

((19453, 10), (19453,))

In [544]:
#Shape of test. dataset
X_test.shape, Y_test.shape

((4864, 10), (4864,))

In [538]:
#Create linear model using logistical Regression, 3000 iterations, classweight assigns negitive weight to wrongly predicted value
model = linear_model.LogisticRegression(max_iter=3000, class_weight = 'balanced')

In [539]:
#Fit or train our model with datasets
model.fit(X_train, Y_train)

LogisticRegression(class_weight='balanced', max_iter=3000)

In [541]:
#Use model to make prediction
Y_pred = model.predict(X_test)
Y_pred

array([0, 0, 1, ..., 0, 1, 0])

In [542]:
#np.round(Y_pred.transpose()).astype(int)

In [572]:
#Create confusion matrix to examine output
cnf_matrix = metrics.confusion_matrix(Y_test, Y_pred)

In [573]:
#statistics of output
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))
print("Precision:",metrics.precision_score(Y_test, Y_pred))
print("Recall:",metrics.recall_score(Y_test, Y_pred))

Accuracy: 0.6700246710526315
Precision: 0.1961231470923603
Recall: 0.6382189239332097


In [564]:
#Use the query set to get prediction using our model
query_pred= model.predict(X2)
query_pred

array([0, 0, 0, ..., 1, 0, 1])

In [565]:
#Create predictions dataframe
prediction= pd.DataFrame(columns = ['ID','Result'])
prediction.ID = queries.ID

In [566]:
#Building dataframe with predictions
prediction.Result = pd.Series(query_pred)
prediction

Unnamed: 0,ID,Result
0,TEST2,0
1,TEST3,0
2,TEST4,0
3,TEST5,0
4,TEST6,0
...,...,...
2697,TEST2699,0
2698,TEST2700,0
2699,TEST2701,1
2700,TEST2702,0


In [567]:
#replace 0,1 with typeA and TypeB in results column
prediction['Result'] = prediction['Result'].replace([0,1],["TypeA","TypeB"])
prediction

Unnamed: 0,ID,Result
0,TEST2,TypeA
1,TEST3,TypeA
2,TEST4,TypeA
3,TEST5,TypeA
4,TEST6,TypeA
...,...,...
2697,TEST2699,TypeA
2698,TEST2700,TypeA
2699,TEST2701,TypeB
2700,TEST2702,TypeA


In [569]:
#Create predictions.csv file
prediction.to_csv('queries.txt',index=False)