In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install PySpark, FindSpark, and Keras Tuner
!pip install pyspark
!pip install -q findspark
!pip install keras-tuner

In [None]:
# Standard Libraries
import os
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Spark and PySpark
import findspark
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Imbalanced-learn
from imblearn.over_sampling import SMOTE

# TensorFlow and Keras
import tensorflow as tf
from keras import callbacks
import keras_tuner as kt

# Spark release label, also exported to the process environment as SPARK_VERSION
spark_version = 'spark-3.5.1'
os.environ.update(SPARK_VERSION=spark_version)

Spark Session

In [None]:
# Let findspark locate the Spark installation before any session is created
findspark.init()

# Create (or reuse) the Spark session
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

# Read the semicolon-delimited CSV file from the Resources folder.
# The file is registered with the SparkContext and then looked up by its file name.
path = "/content/drive/MyDrive/project-4-group-6/Resources/bank-full.csv"
spark.sparkContext.addFile(path)
df = spark.read.csv(SparkFiles.get(Path(path).name), sep=";", header=True, inferSchema=True)

# Review the DataFrame
df.show()

In [None]:
# Register the DataFrame as a SQL temporary view.
# In this view the deposit outcome is still stored in the raw column `y`.
df.createOrReplaceTempView("bank_data")

# Retrieve a sample of 35-year-olds whose outcome `y` is "yes"
print("Query to retrieve 35-year-olds who made a deposit:")
query_1 = """
SELECT *
FROM bank_data
WHERE age = 35 AND y = 'yes'
LIMIT 5
"""
spark.sql(query_1).show()

# Count "yes" outcomes for each campaign value, ordered by campaign
print("\nQuery to count 'yes' occurrences by campaign:")
query_2 = """
SELECT campaign,
COUNT(*) AS yes_count
FROM bank_data
WHERE y = 'yes'
GROUP BY campaign
ORDER BY campaign ASC
"""
spark.sql(query_2).show()

# Count "yes" outcomes for each job, sorted by that same count, descending.
# Ordering by the alias keeps the sort key identical to the displayed column.
print("\nQuery to count 'yes' occurrences by job, sorted by count descending:")
query_3 = """
SELECT job,
COUNT(*) AS yes_count
FROM bank_data
WHERE y = 'yes'
GROUP BY job
ORDER BY yes_count DESC
"""
spark.sql(query_3).show()

Pandas

In [None]:
# Convert Spark DF to Pandas
bank_full_df = df.toPandas()

# Rename the column 'y' to 'deposit_made'
bank_full_df = bank_full_df.rename(columns={'y': 'deposit_made'})

# Encode "no"/"yes" as 0/1.
# NOTE: replace() acts on the whole DataFrame, so every column holding these
# strings is converted, not only deposit_made. Narrowing this to one column would
# change which columns the later one-hot encoding expands — confirm before doing so.
bank_full_df = bank_full_df.replace({"no": 0, "yes": 1})

# Preview the first 20 rows
bank_full_df.head(20)

Figures

In [None]:
# Lay out a 2x2 grid and give each panel its own name
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_age, ax_job), (ax_job_split, ax_contacts) = axes

# Rows where a deposit was made (deposit_made == 1); three panels share this subset
made_deposit = bank_full_df[bank_full_df['deposit_made'] == 1]

# Panel 1: deposits per age, drawn as a line
age_deposit_counts = made_deposit.groupby('age').size()
age_deposit_counts.plot(kind='line', color='g', ax=ax_age)
ax_age.set(
    ylabel='Count of Deposit Made',
    xlabel='Age',
    title='Deposits Made by Age',
)

# Panel 2: deposits per job, smallest count first, as horizontal bars
Job_deposit_counts = made_deposit.groupby('job').size().sort_values(ascending=True)
Job_deposit_counts.plot(kind='barh', color='b', ax=ax_job)
ax_job.set(
    xlabel='Count of Deposit Made',
    ylabel='Job',
    title='Deposits Made by Job Category',
)

# Panel 3: per-job counts for both outcomes (0 and 1), sorted by the count of 1s
Job_total_deposit_counts = (
    bank_full_df.groupby(['job', 'deposit_made'])
    .size()
    .unstack()
    .sort_values(by=1, ascending=True)
)
Job_total_deposit_counts.plot(kind='barh', color=('r','b'), ax=ax_job_split)
ax_job_split.set(
    title='Job Categories vs Deposits Made',
    xlabel='Number of Deposits',
    ylabel='Job',
)

# Panel 4: deposits per campaign contact count, as a filled area with a grid
contacts_deposits = made_deposit.groupby('campaign').size()
contacts_deposits.plot(kind='area', color='b', grid=True, ax=ax_contacts)
ax_contacts.set(
    ylabel='Number of Customers who made Deposits',
    xlabel='Number of Times Customer was Contacted before deposit was made',
    title='Deposits Made by Number of Campaign Contacts',
)

# Tighten spacing between panels, then render
plt.tight_layout()
plt.show()

Pre-processing

In [None]:
# Target: the deposit_made column
y = bank_full_df['deposit_made']

# Features: every column except the target
X = bank_full_df.drop('deposit_made', axis=1)

# Preview the target Series
print("y variable Series:", y.head(), sep="\n")

In [None]:
# Preview the feature DataFrame
print("\nX variable DataFrame:", X.head(), sep="\n")

In [None]:
# One-hot encode the categorical columns as integer (0/1) indicator columns
X = pd.get_dummies(data=X, dtype=int)

# Show the first five rows of the encoded features
X.head(n=5)

Evaluations

In [None]:
# Stratified train/test split with a fixed seed (random_state=13)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=13)

# Fit the scaler on the training features only, then apply it to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

# Logistic regression baseline using the lbfgs solver
model = LogisticRegression(solver='lbfgs')

# Train on the scaled training data
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict labels for the scaled test features
predictions = model.predict(X=X_test_scaled)
# Build and print the classification report for those predictions
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

In [None]:
# Confusion matrix of the logistic regression predictions against the test labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the matrix in a DataFrame for tabular printing
cm_df = pd.DataFrame(data=cm)

# Print the heading and the matrix
print("Confusion Matrix:", cm_df, sep="\n")

In [None]:
# Random forest with 500 trees and a fixed seed
rf_model = RandomForestClassifier(random_state=42, n_estimators=500)

# Fit on the scaled training data
rf_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict labels for the scaled test features with the fitted random forest
rf_predictions = rf_model.predict(X=X_test_scaled)

# Build and print the random forest classification report
rf_report = classification_report(y_true=y_test, y_pred=rf_predictions)
print(rf_report)

Neural Networks

In [None]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp, n_features=None):
    """Build and compile a tunable binary classifier for Keras Tuner.

    Args:
        hp: Keras Tuner hyperparameter container; it supplies the hidden-layer
            activation, the width of the first layer, the number of extra
            hidden layers and the width of each of them.
        n_features: Number of input features. When omitted, the column count of
            the notebook-level ``X_train_scaled`` is used, so the input width
            follows the encoded feature matrix instead of a hard-coded value.

    Returns:
        A compiled ``tf.keras`` Sequential model ending in one sigmoid unit,
        using binary cross-entropy loss, the Adam optimizer and accuracy.
    """
    if n_features is None:
        n_features = X_train_scaled.shape[1]

    nn_model = tf.keras.models.Sequential()
    # Declare the input width explicitly
    nn_model.add(tf.keras.Input(shape=(n_features,)))
    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=1, max_value=10, step=2),
        activation=activation))
    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=1, max_value=10, step=2),
            activation=activation))
    # Single sigmoid unit for the binary outcome
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    return nn_model

# Path kept from the original notebook; it is not passed to the tuner
tuner1_file_path = './untitled_project/tuner1.json'

# Define a Hyperband tuner to search for the best hyperparameters.
# directory/project_name give tuner1 its own results folder, as tuner2 has.
tuner1 = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    directory='./untitled_project',
    project_name='tuner1',
)

# Search on the scaled (not oversampled) training data, so the training features
# are on the same scale as the scaled validation features.
# NOTE(review): the test split doubles as validation data here, so the accuracy
# reported below is likely optimistic — consider a separate validation split.
tuner1.search(X_train_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test))

# Get best model hyperparameters and show them
best_hyper1 = tuner1.get_best_hyperparameters(1)[0]
print(best_hyper1.values)

# Evaluate best model against full test data
best_model1 = tuner1.get_best_models(1)[0]
model_loss1, model_accuracy1 = best_model1.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss1}, Accuracy: {model_accuracy1}")

Oversampling

In [None]:
# Target size of class 1, expressed as a fraction of the class-0 count
desired_ratio = .5

# Count the class-0 training labels and derive the class-1 target count from them
num_majority = y_train.value_counts().loc[0]
num_minority_desired = int(desired_ratio * num_majority)

# Configure SMOTE to bring class 1 up to that count, with a fixed seed
smote = SMOTE(random_state=42, sampling_strategy={1: num_minority_desired})

# Resample the scaled training data
X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train_scaled, y_train)

# Show the class distribution after resampling
print("After oversampling:", y_train_oversampled.value_counts(), sep="\n")

# Fit a logistic regression on the resampled training data
model_smote = LogisticRegression(solver='lbfgs').fit(X_train_oversampled, y_train_oversampled)

# Predict on the (unresampled) scaled test features
predictions_smote = model_smote.predict(X_test_scaled)

# Report the performance of the SMOTE-trained model
report_smote = classification_report(y_true=y_test, y_pred=predictions_smote)
print("Classification Report with SMOTE:\n", report_smote)

In [None]:
# Create a method that creates a new Sequential model with hyperparameter options.
# NOTE: create_model is also defined in an earlier cell; this later definition
# replaces it and is the one in effect from here on.
def create_model(hp, n_features=None):
    """Build and compile a tunable binary classifier for Keras Tuner.

    Args:
        hp: Keras Tuner hyperparameter container; it supplies the hidden-layer
            activation, the width of the first layer, the number of extra
            hidden layers and the width of each of them.
        n_features: Number of input features. When omitted, the column count of
            the notebook-level ``X_train_oversampled`` is used, so the input
            width follows the training matrix instead of a hard-coded value.

    Returns:
        A compiled ``tf.keras`` Sequential model ending in one sigmoid unit,
        using binary cross-entropy loss, the Adam optimizer and accuracy.
    """
    if n_features is None:
        n_features = X_train_oversampled.shape[1]

    nn_model = tf.keras.models.Sequential()
    # Declare the input width explicitly
    nn_model.add(tf.keras.Input(shape=(n_features,)))
    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=1, max_value=10, step=2),
        activation=activation))
    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=1, max_value=10, step=2),
            activation=activation))
    # Single sigmoid unit for the binary outcome
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    return nn_model

# Path kept from the original notebook; it is not passed to the tuner
tuner2_file_path = './untitled_project/tuner2.json'

# Define a Hyperband tuner to search for the best hyperparameters
tuner2 = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    directory='./untitled_project',  # Parent folder for tuner2's results
    project_name='tuner2',  # Sub-folder holding tuner2's trials
)

# Search for the best hyperparameters using the oversampled training data.
# NOTE(review): the test split doubles as validation data here, so the accuracy
# reported below is likely optimistic — consider a separate validation split.
tuner2.search(X_train_oversampled, y_train_oversampled, epochs=20, validation_data=(X_test_scaled, y_test))

# Get best model hyperparameters and show them
best_hyper2 = tuner2.get_best_hyperparameters(1)[0]
print(best_hyper2.values)

# Evaluate best model against full test data
best_model2 = tuner2.get_best_models(1)[0]
model_loss2, model_accuracy2 = best_model2.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss2}, Accuracy: {model_accuracy2}")