In [1]:
# --- Imports (all third-party) ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# --- Plot configuration ---
# NOTE(review): sns.set() re-applies seaborn's own rcParams (including font
# sizes), so this font size is presumably overridden by the call below —
# confirm, and move this line after sns.set() if 14pt text is intended.
plt.rc("font", size=14)
# One call is sufficient: each sns.set() re-applies the whole theme, so an
# earlier style="white" call would be replaced by this one anyway.
sns.set(style="whitegrid", color_codes=True)

In [7]:
# Load the cleaned dataset, discard every row that has a missing value,
# and rename the target column `y` to the more descriptive `subscribed`.
data = (
    pd.read_csv("./data/clean_data.csv")
    .dropna()
    .rename(columns={'y': 'subscribed'})
)

# Column dtypes / non-null counts, then a preview of the first rows.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13216 entries, 0 to 13215
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             13216 non-null  float64
 1   job             13216 non-null  object 
 2   marital         13216 non-null  object 
 3   education       13216 non-null  object 
 4   default         13216 non-null  object 
 5   housing         13216 non-null  object 
 6   loan            13216 non-null  object 
 7   contact         13216 non-null  object 
 8   month           13216 non-null  object 
 9   day_of_week     13216 non-null  object 
 10  campaign        13216 non-null  float64
 11  pdays           13216 non-null  float64
 12  previous        13216 non-null  float64
 13  poutcome        13216 non-null  object 
 14  emp.var.rate    13216 non-null  float64
 15  cons.price.idx  13216 non-null  float64
 16  cons.conf.idx   13216 non-null  float64
 17  euribor3m       13216 non-null 

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
0,38.0,services,married,high.school,no,yes,no,cellular,apr,wed,...,0.0,nonexistent,-1.8,93.075,-47.1,1.498,5099.1,no,2009,2009-04-01
1,33.0,blue-collar,married,basic.6y,no,yes,no,cellular,apr,wed,...,0.0,nonexistent,-1.8,93.075,-47.1,1.498,5099.1,yes,2009,2009-04-01
2,43.0,self-employed,married,high.school,unknown,yes,no,cellular,apr,wed,...,0.0,nonexistent,-1.8,93.075,-47.1,1.498,5099.1,no,2009,2009-04-01
3,31.0,technician,married,professional.course,no,yes,no,cellular,apr,wed,...,0.0,nonexistent,-1.8,93.075,-47.1,1.498,5099.1,no,2009,2009-04-01
4,43.0,self-employed,married,high.school,unknown,no,no,cellular,apr,wed,...,0.0,nonexistent,-1.8,93.075,-47.1,1.498,5099.1,no,2009,2009-04-01


In [9]:
# Unknown marital status is rolled into 'married'.
data.loc[data['marital'] == 'unknown', 'marital'] = 'married'

# Kept as the last expression of the cell so the per-category counts are
# actually displayed; an expression in the middle of a cell is evaluated
# but its result is never shown.
data.groupby('marital').count()


In [12]:
# Unknown job category is rolled into 'admin.'.
data.loc[data['job'] == 'unknown', 'job'] = 'admin.'

# Kept as the last expression of the cell so the per-category counts are
# actually displayed; 'unknown' should no longer appear among the groups.
data.groupby('job').count()


Unnamed: 0_level_0,age,marital,education,default,housing,loan,contact,month,day_of_week,campaign,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
job,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
admin.,3591,3591,3591,3591,3591,3591,3591,3591,3591,3591,...,3591,3591,3591,3591,3591,3591,3591,3591,3591,3591
blue-collar,2760,2760,2760,2760,2760,2760,2760,2760,2760,2760,...,2760,2760,2760,2760,2760,2760,2760,2760,2760,2760
entrepreneur,389,389,389,389,389,389,389,389,389,389,...,389,389,389,389,389,389,389,389,389,389
housemaid,233,233,233,233,233,233,233,233,233,233,...,233,233,233,233,233,233,233,233,233,233
management,877,877,877,877,877,877,877,877,877,877,...,877,877,877,877,877,877,877,877,877,877
retired,890,890,890,890,890,890,890,890,890,890,...,890,890,890,890,890,890,890,890,890,890
self-employed,416,416,416,416,416,416,416,416,416,416,...,416,416,416,416,416,416,416,416,416,416
services,1245,1245,1245,1245,1245,1245,1245,1245,1245,1245,...,1245,1245,1245,1245,1245,1245,1245,1245,1245,1245
student,657,657,657,657,657,657,657,657,657,657,...,657,657,657,657,657,657,657,657,657,657
technician,1812,1812,1812,1812,1812,1812,1812,1812,1812,1812,...,1812,1812,1812,1812,1812,1812,1812,1812,1812,1812


In [17]:
# Combine 'illiterate' with 'unknown': 'illiterate' is not significant by itself.
data.loc[data['education'] == 'illiterate', 'education'] = 'unknown'

# Displayed result: per-category counts after the merge
# ('illiterate' should no longer appear among the groups).
data.groupby('education').count()

Unnamed: 0_level_0,age,job,marital,default,housing,loan,contact,month,day_of_week,campaign,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
basic.4y,1209,1209,1209,1209,1209,1209,1209,1209,1209,1209,...,1209,1209,1209,1209,1209,1209,1209,1209,1209,1209
basic.6y,674,674,674,674,674,674,674,674,674,674,...,674,674,674,674,674,674,674,674,674,674
basic.9y,1878,1878,1878,1878,1878,1878,1878,1878,1878,1878,...,1878,1878,1878,1878,1878,1878,1878,1878,1878,1878
high.school,3258,3258,3258,3258,3258,3258,3258,3258,3258,3258,...,3258,3258,3258,3258,3258,3258,3258,3258,3258,3258
professional.course,1550,1550,1550,1550,1550,1550,1550,1550,1550,1550,...,1550,1550,1550,1550,1550,1550,1550,1550,1550,1550
university.degree,4060,4060,4060,4060,4060,4060,4060,4060,4060,4060,...,4060,4060,4060,4060,4060,4060,4060,4060,4060,4060
unknown,587,587,587,587,587,587,587,587,587,587,...,587,587,587,587,587,587,587,587,587,587


In [27]:
# Unknown personal-loan status is rolled into 'no'.
data.loc[data['loan'] == 'unknown', 'loan'] = 'no'

# Displayed result: per-category counts after the recode
# (only 'no' and 'yes' should remain).
data.groupby('loan').count()

Unnamed: 0_level_0,age,job,marital,education,default,housing,contact,month,day_of_week,campaign,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
loan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
no,11205,11205,11205,11205,11205,11205,11205,11205,11205,11205,...,11205,11205,11205,11205,11205,11205,11205,11205,11205,11205
yes,2011,2011,2011,2011,2011,2011,2011,2011,2011,2011,...,2011,2011,2011,2011,2011,2011,2011,2011,2011,2011


In [30]:
# Unknown housing-loan status is rolled into 'no'.
data.loc[data['housing'] == 'unknown', 'housing'] = 'no'

# Displayed result: per-category counts after the recode
# (only 'no' and 'yes' should remain).
data.groupby('housing').count()

Unnamed: 0_level_0,age,job,marital,education,default,loan,contact,month,day_of_week,campaign,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
housing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
no,5785,5785,5785,5785,5785,5785,5785,5785,5785,5785,...,5785,5785,5785,5785,5785,5785,5785,5785,5785,5785
yes,7431,7431,7431,7431,7431,7431,7431,7431,7431,7431,...,7431,7431,7431,7431,7431,7431,7431,7431,7431,7431


In [31]:
data.groupby('default').count()

# 'default' is only inspected here: no recode is applied in this cell, so
# 'unknown' remains its own category alongside 'no'.
# NOTE(review): presumably left as-is because 'unknown' is a sizeable group
# (1409 of 13216 rows in the recorded output) — confirm this is intended.

Unnamed: 0_level_0,age,job,marital,education,housing,loan,contact,month,day_of_week,campaign,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
no,11807,11807,11807,11807,11807,11807,11807,11807,11807,11807,...,11807,11807,11807,11807,11807,11807,11807,11807,11807,11807
unknown,1409,1409,1409,1409,1409,1409,1409,1409,1409,1409,...,1409,1409,1409,1409,1409,1409,1409,1409,1409,1409


In [None]:
def separate_variables(df):
    """Split a DataFrame into numeric and categorical column subsets.

    Columns are picked purely by dtype: float64 and int64 columns count as
    numeric, object columns count as categorical. A column of any other
    dtype ends up in neither result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be partitioned.

    Returns
    -------
    tuple
        ``(numeric_vars, categorical_vars)`` — two DataFrames holding the
        matching columns of ``df``.
    """
    numeric_dtypes = ['float64', 'int64']
    categorical_dtypes = ['object']
    return (
        df.select_dtypes(include=numeric_dtypes),
        df.select_dtypes(include=categorical_dtypes),
    )

# Partition `data` by dtype: float64/int64 columns vs. object columns.
# NOTE(review): numeric_df is not referenced in the visible cells — confirm it is used later.
numeric_df, categorical_df = separate_variables(data)

def plot_histogram(data, column, y):
    """Render a count plot of ``column`` with bars split by ``y``.

    Parameters
    ----------
    data : DataFrame passed to seaborn as the data source.
    column : name of the column placed on the x axis.
    y : name of the column used as the hue (bar grouping).

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=data, x=column, hue=y, ax=ax)

    ax.set_title(f'Histogram of {column} by {y}', fontsize=14)
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)

    # Tick labels: x labels are turned vertical, both axes use 10pt text.
    plt.setp(ax.get_xticklabels(), rotation='vertical', fontsize=10)
    plt.setp(ax.get_yticklabels(), fontsize=10)

    plt.show()

# One count plot per object-dtype column, bars split by the `subscribed` target.
# Iterating a DataFrame yields its column labels, same as iterating `.columns`.
for column in categorical_df:
    plot_histogram(data=data, column=column, y='subscribed')

In [44]:
# At this point, data is ready for sampling.

# Validation set: every row dated 2010-09-01 or later.
# NOTE(review): `date` is compared against a string; read_csv was called
# without parse_dates, so the column is presumably text and the comparison
# relies on zero-padded YYYY-MM-DD ordering — confirm.

data.info()
in_validation_window = data['date'] >= '2010-09-01'
validation_data = data[in_validation_window]

# Persist the hold-out rows (without the index), then display them.
validation_data.to_csv('./data/validation_set.csv', index=False)
validation_data


<class 'pandas.core.frame.DataFrame'>
Int64Index: 13216 entries, 0 to 13215
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             13216 non-null  float64
 1   job             13216 non-null  object 
 2   marital         13216 non-null  object 
 3   education       13216 non-null  object 
 4   default         13216 non-null  object 
 5   housing         13216 non-null  object 
 6   loan            13216 non-null  object 
 7   contact         13216 non-null  object 
 8   month           13216 non-null  object 
 9   day_of_week     13216 non-null  object 
 10  campaign        13216 non-null  float64
 11  pdays           13216 non-null  float64
 12  previous        13216 non-null  float64
 13  poutcome        13216 non-null  object 
 14  emp.var.rate    13216 non-null  float64
 15  cons.price.idx  13216 non-null  float64
 16  cons.conf.idx   13216 non-null  float64
 17  euribor3m       13216 non-null 

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,subscribed,year,date
12581,64.0,retired,married,professional.course,no,yes,no,cellular,sep,wed,...,0.0,nonexistent,-1.1,94.199,-37.5,0.886,4963.6,yes,2010,2010-09-01
12582,72.0,retired,married,professional.course,no,no,no,cellular,sep,wed,...,1.0,failure,-1.1,94.199,-37.5,0.886,4963.6,no,2010,2010-09-01
12583,34.0,admin.,married,university.degree,no,yes,no,cellular,sep,wed,...,2.0,failure,-1.1,94.199,-37.5,0.886,4963.6,no,2010,2010-09-01
12584,69.0,retired,married,high.school,no,yes,yes,cellular,sep,wed,...,2.0,success,-1.1,94.199,-37.5,0.886,4963.6,yes,2010,2010-09-01
12585,33.0,technician,married,professional.course,no,yes,no,cellular,sep,wed,...,0.0,nonexistent,-1.1,94.199,-37.5,0.886,4963.6,yes,2010,2010-09-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13211,72.0,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,0.0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes,2010,2010-11-01
13212,46.0,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,0.0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no,2010,2010-11-01
13213,56.0,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,0.0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no,2010,2010-11-01
13214,44.0,technician,married,professional.course,no,no,no,cellular,nov,fri,...,0.0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes,2010,2010-11-01


In [45]:
# Sampling train/test

# Already imported in the notebook's first cell; repeated in this cell as well.
from sklearn.model_selection import train_test_split

# Modelling population: rows dated strictly before 2010-09-01.
before_cutoff = data['date'] < '2010-09-01'
model_data = data[before_cutoff]

# Random split with 30% of the rows held out for testing; the fixed
# random_state makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so the balance of `subscribed` is
# not guaranteed to match between the two parts — confirm this is intended.
training_data, testing_data = train_test_split(
    model_data,
    random_state=42,
    test_size=0.3,
)

# Report the size of each part.
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

# Persist both parts (index omitted).
# TODO: apply sampling techniques to the training data; it is currently
# written exactly as split.
training_data.to_csv('./data/training_set.csv', index=False)
testing_data.to_csv('./data/testing_set.csv', index=False)


No. of training examples: 8806
No. of testing examples: 3775
