# <center> Logistic Regression, Kmeans Clustering</center>
<center> University of Denver </center>
<center> Eric Browne </center>

# Uploading the Data

In [None]:
## import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.discrete.discrete_model import Logit
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from scipy.cluster.hierarchy import dendrogram
from sklearn.model_selection import GridSearchCV

In [None]:
## Load the bank marketing data
# The smaller of the datasets is used first.
df = pd.read_csv('bank.csv')
df.head(5)

In [None]:
## Column dtypes, per-column missing-value counts and summary statistics,
## separated by '---' lines.
print(df.dtypes, "---", df.isnull().sum(), '---', df.describe(), sep='\n')

In [None]:
# Show the distinct values of a handful of columns
# (default, housing, contact, campaign, poutcome, previous).
columns_to_inspect = [
    ("Default", "default"),
    ("Housing", "housing"),
    ("Contact", "contact"),
    ("Campaign", "campaign"),
    ("poutcome", "poutcome"),
    ("previous", "previous"),
]
for display_name, column in columns_to_inspect:
    print(f"{display_name}: {df[column].unique()}")

In [None]:
# Show every column and every row when a frame is displayed.
# The fully-qualified option names are required: the short forms
# ('max_columns', 'max_rows') are pattern-matched and raise OptionError on
# pandas versions where they also match the 'styler.render.*' options.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Width 0 is meant to let pandas adjust to the window width
# (NOTE(review): confirm this behaves as intended in the notebook frontend).
pd.options.display.width = 0


# Data Preprocessing 

In [None]:
# Integer-encode every non-numeric column (including the target 'y').
# Codes follow the sorted order of the category values, so they carry no
# domain ordering (e.g. 'month' codes are alphabetical, not chronological).
categorical_cols = ['job', 'marital', 'education', 'default', 'contact',
                    'month', 'poutcome', 'housing', 'loan', 'y']
for col in categorical_cols:
    df[col] = df[col].astype('category').cat.codes

# errors='ignore' keeps this cell re-runnable once 'pdays' is already gone.
df = df.drop(columns='pdays', errors='ignore')

# No column is normalized in this cell, so the message only reports what
# actually happened.
print("\n\nAfter converting to numeric and dropping pdays:")
print(df.head())



# Data Splitting 

In [None]:
## Hold out 30% of the encoded frame for testing; 'y' is the target
features = df.drop(columns='y')
label = df['y']
Xtrain, Xtest, ytrain, ytest = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=420,
)
# Report the shape of each partition, one per line.
for partition in (Xtrain, Xtest, ytrain, ytest):
    print(partition.shape)

In [None]:
# Clustering is unsupervised, so work on a copy of the data without the target
# (equivalent to the 'features' frame built for the split).
df2 = df.drop(columns='y')
df2.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize 'age' and 'duration' in the clustering frame.
# NOTE(review): all other columns are left on their original scale — confirm
# this is intended, since PCA and KMeans are scale-sensitive.
numericCols = ['age', 'duration']
scaler = StandardScaler()
df2[numericCols] = scaler.fit_transform(df2[numericCols])


In [None]:
## Perform PCA
# NOTE: these names alias Xtrain/Xtest; they are not copies.
save_xtrain = Xtrain
save_xtest = Xtest

# Learn the principal axes from the training split only, then project the
# test split onto those same axes. Re-fitting on the test split would put the
# two projections in different bases and leak test information.
pca = PCA()
df_pca_train = pca.fit_transform(save_xtrain)
df_pca_test = pca.transform(save_xtest)

# Variance ratios of the training fit, one entry per component.
explained_variance = pca.explained_variance_ratio_
print("explained_variance = ")
print(explained_variance)

print(f'Params: {pca.get_params()}')

In [None]:
## Plot the amount of explained_variance:
%matplotlib inline
plt.plot(range(1,len(explained_variance)+1),explained_variance)
plt.title('Explained Variance using PCA')
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')
plt.show()

In [None]:
## Reduce the clustering frame to 2 components so it can be drawn in a plane.
## Note that this rebinds `pca` to the 2-component model.
pca = PCA(n_components=2)
df_PCA = pca.fit_transform(df2)
df_PCA  # numpy array with one row per sample and two columns

In [None]:
## Use Agglomerative clustering to get optimal number of clusters

def plot_dendrogram(model, **kwargs):
    """
    Plot the dendrogram of a fitted agglomerative clustering model.

    Builds a SciPy-style linkage matrix with one row per merge
    ``[child_a, child_b, distance, n_leaves]`` from the model's
    ``children_``, ``distances_`` and ``labels_`` attributes and hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_``, ``distances_`` and ``labels_``.
    **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``
        (e.g. ``truncate_mode``, ``p``, ``no_plot``).

    Returns
    -------
    dict
        The data structure returned by ``dendrogram``.

    Raises
    ------
    AttributeError
        If the model has no ``distances_`` attribute.
    """
    if not hasattr(model, 'distances_'):
        raise AttributeError(
            "model has no 'distances_' attribute; fit it with "
            "distance_threshold set (or compute_distances=True) first"
        )

    children = np.asarray(model.children_)
    n_samples = len(model.labels_)

    # counts[i] = number of original samples under the node created by merge i.
    # Child indices below n_samples are leaves; larger indices refer to the
    # node created by merge (index - n_samples), whose count is already known.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage = np.column_stack([children, model.distances_,
                               counts]).astype(float)

    # dendrogram plot (the computed structure is returned to the caller)
    return dendrogram(linkage, **kwargs)


In [None]:
# distance_threshold=0 together with n_clusters=None makes the model build the
# complete merge tree.
agglomodel = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
agglomodel.fit(df2)

In [None]:
# The model was fitted with distance_threshold=0, so the full tree is available.
plt.title('Hierarchical Clustering Dendrogram')
# Only the top three levels of the tree are drawn.
plot_dendrogram(
    agglomodel,
    truncate_mode='level',
    p=3,
)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

# Model Building 

In [None]:
# Two-cluster KMeans on the 2-component PCA projection; seeded for repeatability.
kmeans = KMeans(n_clusters=2, random_state=420).fit(df_PCA)
print('means for the 2 clusters are:')
print(kmeans.cluster_centers_)

In [None]:
## Assign every projected point to its nearest KMeans centroid
clusterPreds = kmeans.predict(df_PCA)
# Sanity check: more than one distinct value means the points were not all
# assigned to the same cluster.
np.unique(clusterPreds)


In [None]:
# The projection and the cluster assignments should have the same number of rows.
for array in (df_PCA, clusterPreds):
    print(array.shape)

In [None]:
## Visualize the clusters
def plot_clustering(df_kmeans, kmeanLabels):
    """
    Scatter-plot 2-D points colored by their cluster label.

    Parameters
    ----------
    df_kmeans : array-like
        Points to draw; column 0 is used as x and column 1 as y.
    kmeanLabels : array-like of int
        Cluster label per point. Labels 0-3 map to red, green, blue, purple.

    Raises
    ------
    ValueError
        If a label falls outside the four-color palette (the per-point
        version would have reused a stale color or failed on an unbound one).
    """
    palette = np.array(['red', 'green', 'blue', 'purple'])
    points = np.asarray(df_kmeans)
    labels = np.asarray(kmeanLabels, dtype=int)

    if labels.size and (labels.min() < 0 or labels.max() >= len(palette)):
        raise ValueError(
            f"cluster labels must be in 0..{len(palette) - 1}; "
            f"got range {labels.min()}..{labels.max()}"
        )

    fig, ax = plt.subplots()
    # A single vectorized scatter call replaces one call per point, which is
    # very slow for thousands of rows.
    if len(points):
        ax.scatter(points[:, 0], points[:, 1], s=9.5, alpha=1.0,
                   c=palette[labels])
    ax.set_title('kmeans')

    plt.show()

# Call the plotting function
plot_clustering(df_PCA, clusterPreds)

In [None]:
## Repeat the fit/predict/plot cycle for k = 2, 3 and 4 clusters
for n_clusters in range(2, 5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=420).fit(df_PCA)
    clusterPreds = kmeans.predict(df_PCA)
    plot_clustering(df_PCA, clusterPreds)
    
    

In [None]:
## Attach the 2-cluster KMeans assignments to the clustering frame as 'label'
# The model is refit here because the preceding loop left `kmeans` at k=4.
kmeans = KMeans(n_clusters=2, random_state=420).fit(df_PCA)
clusterPreds = kmeans.predict(df_PCA)
np.unique(clusterPreds)  # sanity check that the predictions are not all identical
df2['label'] = clusterPreds
df2.head()


In [None]:
# Logistic regression that predicts the KMeans cluster label from the features.
logit = LogisticRegression(max_iter=800)

# NOTE: this rebinds Xtrain/Xtest/ytrain/ytest; from here on they refer to the
# cluster-label split of df2, not the earlier split on 'y'.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df2.drop('label', axis=1),
    df2['label'],
    test_size=0.3,
    random_state=42,
)
logit.fit(Xtrain, ytrain)
logit_train_preds = logit.predict(Xtrain)

# scikit-learn metrics take (y_true, y_pred) in that order.
print(f'Training F1 Score: {f1_score(ytrain, logit_train_preds)}')

In [None]:
# Tune the regularization strength C with 3-fold cross-validation.
params = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1.0]}
grid_logit = GridSearchCV(logit, params, cv=3, scoring='accuracy',
                          return_train_score=True)

# Search on the training split of the cluster-label problem only. Fitting on
# `features, label` would train on a different target ('y') and different
# (unscaled) inputs than the Xtest/ytest the tuned model is evaluated on, and
# would include the held-out rows.
grid_logit.fit(Xtrain, ytrain)
best_params = grid_logit.best_params_
print(best_params)

In [None]:
# Keep the estimator refit with the best parameters found by the grid search.
# NOTE(review): the original note here read "Optimal cut off was 0.3" —
# presumably it refers to the selected C value; confirm against the printed
# best_params.
final_logit = grid_logit.best_estimator_
print(final_logit)

# Model Evaluation 

In [None]:
# Accuracy of the tuned model on the held-out split (y_true first, y_pred second).
test_predictions = final_logit.predict(Xtest)
print(f'Testing Accuracy: {accuracy_score(ytest, test_predictions)}')

In [None]:
## Load the rain dataset (target column: 'RainTomorrow') and standardize its
## numeric measurements.
rainDF = pd.read_csv('RainOrNot.csv')
numericRain = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test rows influence the scaling statistics — confirm this is
# acceptable.
scale_rain = StandardScaler()
scale_rain.fit(rainDF[numericRain])
rainDF[numericRain] = scale_rain.transform(rainDF[numericRain])
rainDF.head()

In [None]:
## Train Test Split
Xtrain, Xtest, ytrain, ytest = train_test_split(
    rainDF.drop('RainTomorrow', axis=1),
    rainDF['RainTomorrow'],
    test_size=0.3,
    random_state=420,
)

## Make the Logit
# `Logit` is the class imported from statsmodels.discrete.discrete_model at
# the top of the notebook; the `sm` alias is never imported, so `sm.Logit`
# raises NameError on a fresh kernel.
# NOTE(review): no intercept column is added to Xtrain, and every column is
# assumed to be numeric — confirm both.
log_reg = Logit(ytrain, Xtrain).fit()

In [None]:
## Coefficient table and fit statistics of the statsmodels Logit
logit_summary = log_reg.summary()
print(logit_summary)

In [None]:
## Predicted probabilities on the held-out split, rounded to class labels
yhat = log_reg.predict(Xtest)
prediction = [round(probability) for probability in yhat]

## Confusion matrix and accuracy of the rounded predictions
cm = confusion_matrix(ytest, prediction)
print("Confusion Matrix : \n", cm)

test_accuracy = accuracy_score(ytest, prediction)
print('Test accuracy = ', test_accuracy)

# Conclusion 

**Using `LogisticRegression()` from sklearn and logistic regression from the statsmodels API, we achieved pretty good results from using the cluster labels from our KMeans algorithm to classify accurately.**