In [8]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Render matplotlib figures inline in the notebook output and apply
# seaborn's default visual theme to every plot created afterwards.
%matplotlib inline
sns.set()

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Load the bank marketing dataset straight from the course repository.
BANK_CSV_URL = 'https://raw.githubusercontent.com/cmattcasey/ISDS7070/main/bank-additional-full.csv'
df_bank = pd.read_csv(BANK_CSV_URL)

In [None]:
# Sanity check: display the first ten rows of the freshly loaded frame.
df_bank.head(n=10)

In [None]:
# Summarise each column's dtype and non-null count to spot missing values.
# Missing entries coded as strings (handled in a later cell) are not
# reported as null here.
df_bank.info()

In [None]:
# Column names containing "." are awkward for attribute access; swap each "." for "_".
dotted_to_snake = {
    'emp.var.rate': 'emp_var_rate',
    'cons.price.idx': 'cons_price_idx',
    'cons.conf.idx': 'cons_conf_idx',
    'nr.employed': 'nr_employed',
}
df_bank = df_bank.rename(columns=dotted_to_snake)

In [None]:
# Missing values are coded as the string 'unknown'; convert them to NaN so
# pandas counts them as nulls.
# np.nan is used rather than the np.NaN alias, which was removed in NumPy 2.0
# and raises AttributeError there.
df_bank = df_bank.replace('unknown', np.nan)

We dropped two columns because they carried little useful information. `default` contained only 3 instances where someone had defaulted on a loan, and the statistics for `pdays` were heavily skewed because every new client was coded with the placeholder value 999. `pdays` also felt redundant given the information already provided by the `campaign` and `previous` columns.

In [None]:
# Remove the two low-information columns discussed above.
low_value_columns = ['default', 'pdays']
df_bank = df_bank.drop(columns=low_value_columns)

In [None]:
# Re-inspect the frame: the renamed columns should appear, 'default' and 'pdays'
# should be gone, and columns that held 'unknown' should now report nulls.
df_bank.info()

In [None]:
# Viewing the descriptive statistics of the numeric columns
df_bank.describe()

In [None]:
# For categorical columns, missing values will be replaced with the most common value.
# value_counts() skips NaN by default, so idxmax() returns the most frequent
# non-null 'job' category.
df_bank['job'].value_counts().idxmax()

In [None]:
# Most frequent non-null 'education' category
df_bank['education'].value_counts().idxmax()

In [None]:
# Most frequent non-null 'housing' value
df_bank['housing'].value_counts().idxmax()

In [None]:
# Most frequent non-null 'loan' value
df_bank['loan'].value_counts().idxmax()

In [None]:
# Most frequent non-null 'marital' category
df_bank['marital'].value_counts().idxmax()

In [None]:
# Replacing missing values with the most common value for each column.
#
# The previous form, df_bank[col].replace(np.nan, value, inplace=True), mutates a
# temporary column object: pandas 2.x emits a chained-assignment warning for it and
# under Copy-on-Write it leaves df_bank unchanged. Filling through the DataFrame
# and re-assigning always takes effect.
#
# The fill values are computed from the data (same value_counts().idxmax() rule as
# the cells above) rather than hard-coded, so they stay correct if the data changes.
columns_to_impute = ['loan', 'housing', 'education', 'job', 'marital']
most_common = {
    col: df_bank[col].value_counts().idxmax()  # value_counts() ignores NaN
    for col in columns_to_impute
}
df_bank = df_bank.fillna(value=most_common)

In [None]:
# Verifying we no longer have any missing values: every column's non-null
# count should now equal the number of rows.
df_bank.info()

In [None]:
# Several binary columns hold 'yes'/'no'; recode them as 1/0 for models that
# require numeric data. The replacement is applied across the whole frame.
yes_no_codes = {'yes': 1, 'no': 0}
df_bank = df_bank.replace(yes_no_codes)

In [None]:
# Cast the 1/0 'housing' and 'loan' columns to the bool dtype, then print the
# resulting dtype of each as confirmation.
binary_cols = ["housing", "loan"]
df_bank[binary_cols] = df_bank[binary_cols].astype("bool")
for col in binary_cols:
    print(df_bank[col].dtypes)

In [None]:
# Make sure the target column y is stored with a numeric dtype, and show it.
y_numeric = pd.to_numeric(df_bank["y"])
df_bank["y"] = y_numeric
print(df_bank["y"].dtypes)

In [None]:
# Verifying the replacements were successful: the former yes/no columns
# should now show the recoded values.
df_bank.head()

In [None]:
# Collapse the detailed education levels into three groups:
# higher, some and little education.
education_groups = {
    'higher': ['university.degree', 'professional.course'],
    'some': ['high.school', 'basic.9y', 'basic.6y'],
    'little': ['basic.4y', 'illiterate'],
}
# Invert to {original level: group} so a single replace call handles every level.
education_lookup = {
    level: group
    for group, levels in education_groups.items()
    for level in levels
}
df_bank['education'] = df_bank['education'].replace(education_lookup)

In [None]:
# Checking counts after replacements: only 'higher', 'some' and 'little' should remain
df_bank['education'].value_counts()

In [None]:
# Histogram of client age to inspect the shape of its distribution.
# `pyplot` is the name imported directly at the top of the notebook.
pyplot.hist(df_bank["age"])

# Axis labels and plot title
pyplot.xlabel("age")
pyplot.ylabel("count")
pyplot.title("age distribution")

In [None]:
# Split age into quartile bins; labels=False stores the integer bin index
# rather than an interval label.
age_quartile = pd.qcut(df_bank['age'], q=4, labels=False)
df_bank['bin_age'] = age_quartile

In [None]:
# Number of clients falling into each age bin
df_bank['bin_age'].value_counts()

In [None]:
# Verifying creation of age bins: list the distinct bin indices present
df_bank['bin_age'].unique()

In [None]:
# Group jobs by expected income into three employment classes.
job_classes = {
    # Lower class employment status
    'lower': ['student', 'unemployed', 'housemaid'],
    # Middle class employment status
    'middle': ['blue-collar', 'technician', 'retired', 'services', 'admin.'],
    # Upper class employment status
    'upper': ['management', 'entrepreneur', 'self-employed'],
}
# Invert to {job title: class} so one replace call recodes the whole column.
job_lookup = {
    title: job_class
    for job_class, titles in job_classes.items()
    for title in titles
}
df_bank['job'] = df_bank['job'].replace(job_lookup)

In [None]:
# Verifying job replacements and counts: only 'lower', 'middle' and 'upper' should remain
df_bank['job'].value_counts()

In [None]:
# Number of clients in each marital category who subscribed to a term deposit
# (y is coded 1 for yes, so the sum is the count of subscribers).
df_bank.groupby('marital')['y'].sum()

In [None]:
# Proportion of "yes" among married clients: subscribers divided by group size.
married_y = df_bank.loc[df_bank['marital'] == 'married', 'y']
print(married_y.sum() / married_y.count())

In [None]:
# Proportion of "yes" among single clients
single_y = df_bank.loc[df_bank['marital'] == 'single', 'y']
print(single_y.sum() / single_y.count())

In [None]:
# Proportion of "yes" among divorced clients
divorced_y = df_bank.loc[df_bank['marital'] == 'divorced', 'y']
print(divorced_y.sum() / divorced_y.count())

In [None]:
# Subscription outcome (y) counts, one panel per marital status
sns.catplot(
    data=df_bank,
    x='y',
    col='marital',
    col_wrap=3,
    kind='count',
)

In [None]:
# Subscription outcome (y) counts, one panel per education group
sns.catplot(
    data=df_bank,
    x='y',
    col='education',
    col_wrap=3,
    kind='count',
)

In [None]:
# Subscription outcome (y) counts, one panel per job class
sns.catplot(
    data=df_bank,
    x='y',
    col='job',
    col_wrap=3,
    kind='count',
)

In [None]:
# Subscription outcome (y) counts, one panel per previous-campaign outcome
sns.catplot(
    data=df_bank,
    x='y',
    col='poutcome',
    col_wrap=3,
    kind='count',
)

In [None]:
# Subscription outcome (y) counts, one panel per day of the week
sns.catplot(
    data=df_bank,
    x='y',
    col='day_of_week',
    col_wrap=3,
    kind='count',
)

In [None]:
# Subscription outcome (y) counts, one panel per month
sns.catplot(
    data=df_bank,
    x='y',
    col='month',
    col_wrap=3,
    kind='count',
)

In [None]:
# One-hot encode the remaining categorical columns so every feature is numeric.
# drop_first=True omits one level per category.
bank_dum = pd.get_dummies(data=df_bank, drop_first=True)
bank_dum.head()

In [None]:
# Verifying the creation of the new dummy columns and their dtypes
bank_dum.info()

In [None]:
# Target is the subscription flag; features are every other encoded column.
y = bank_dum['y']
X = bank_dum.drop(columns='y')

In [None]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.1, random_state=9)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Verifying the split functioned correctly: preview the training features
X_train.head()

In [None]:
# Preview the last ten training targets
y_train.tail(10)

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Fit a random forest on all encoded features.
# random_state is pinned (same seed as the train/test split) so that re-running
# the notebook rebuilds the same forest and reports the same metrics; without it
# the bootstrap samples and feature draws change on every run.
model = RandomForestRegressor(n_estimators=200, max_depth=7, bootstrap=True,
                              random_state=9)
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the fitted forest
y_pred = model.predict(X_test)

In [None]:
# Score the predictions on the test set.
# MSE: lower is better. R-squared (coefficient of determination): 1 is perfect prediction.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print('Mean squared error:', mse)
print('R-Squared:', r2)

In [None]:
# Second experiment: restrict the features to a hand-picked subset.
# Note this rebinds X and y used by the first model.
subset_features = ['previous', 'euribor3m', 'poutcome_nonexistent', 'poutcome_success']
y = bank_dum['y']
X = bank_dum[subset_features]

In [None]:
# Re-split using the reduced feature set: 10% test, same seed as the first split.
split_parts = train_test_split(X, y, test_size=0.1, random_state=9)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a second random forest on the reduced feature set.
# random_state is pinned (same seed as the train/test split) so the comparison
# with the first model is reproducible rather than dependent on a random draw.
model2 = RandomForestRegressor(n_estimators=200, max_depth=7, bootstrap=True,
                               random_state=9)
model2.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows with the reduced-feature forest
# (overwrites y_pred from the first model)
y_pred = model2.predict(X_test)

In [None]:
# Score the reduced-feature model on the test set.
# MSE: lower is better. R-squared (coefficient of determination): 1 is perfect prediction.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print('Mean squared error:', mse)
print('R-Squared:', r2)

In [None]:
# Sequential feature selection: greedily pick the two most useful columns of X,
# once by adding features one at a time (forward) and once by removing them
# (backward), and time each pass.
#
# X and y are whatever the preceding cells last assigned. The original cell
# referenced `lasso`, `feature_names`, `sfs_forward`, `tic_fwd` and `toc_fwd`
# without defining them, so it failed with NameError; they are defined here.
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LassoCV
from time import time

# Unfitted estimator; the selector clones and fits it during cross-validation.
lasso = LassoCV()
# Array of column names so the boolean support mask can index it directly.
feature_names = np.array(X.columns)

tic_fwd = time()
sfs_forward = SequentialFeatureSelector(lasso, n_features_to_select=2,
                                        direction='forward').fit(X, y)
toc_fwd = time()

tic_bwd = time()
sfs_backward = SequentialFeatureSelector(lasso, n_features_to_select=2,
                                         direction='backward').fit(X, y)
toc_bwd = time()

print("Features selected by forward sequential selection: "
      f"{feature_names[sfs_forward.get_support()]}")
print(f"Done in {toc_fwd - tic_fwd:.3f}s")
print("Features selected by backward sequential selection: "
      f"{feature_names[sfs_backward.get_support()]}")
print(f"Done in {toc_bwd - tic_bwd:.3f}s")