In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

# Import and install other necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import tensorflow as tf
!pip install keras-tuner

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/110 kB 13%] [Waiting                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 338 kB in 2s (163 kB/s)
Reading package li

In [2]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, mean, round
import time
import keras_tuner as kt

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

# Dataset Preparation

In [3]:
# Read in the csv
from pyspark import SparkFiles
url = "https://raw.githubusercontent.com/ejmatthe/project-four/main/resources/master.csv"
spark.sparkContext.addFile(url)
suicides_df = spark.read.csv(SparkFiles.get("master.csv"), sep=",", header=True)

# Print head to confirm it was read in
suicides_df.show()

+-------+----+------+-----------+-----------+----------+---------------------+----------------+------------------+---------------+------------+
|country|year|   sex|        age|suicides_no|population|suicides_per_100k_pop|gdp_for_year_USD|gdp_per_capita_USD|     generation|suicide_tier|
+-------+----+------+-----------+-----------+----------+---------------------+----------------+------------------+---------------+------------+
|Albania|1987|  male|15-24 years|         21|    312900|                 6.71|      2156624900|               796|   Generation X|           0|
|Albania|1987|  male|35-54 years|         16|    308000|                 5.19|      2156624900|               796|         Silent|           0|
|Albania|1987|female|15-24 years|         14|    289700|                 4.83|      2156624900|               796|   Generation X|           0|
|Albania|1987|  male|  75+ years|          1|     21800|                 4.59|      2156624900|               796|G.I. Generation|      

In [4]:
# Create a temporary view of the DataFrame.
suicides_df.createOrReplaceTempView('suicides')

In [5]:
# Make dataframe to compare suicide rate and GDP per capita
suicides_gdp_df = spark.sql("""
SELECT gdp_for_year_USD, suicide_tier
FROM suicides
""")


suicides_gdp_df.show()

+----------------+------------+
|gdp_for_year_USD|suicide_tier|
+----------------+------------+
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2156624900|           0|
|      2126000000|           0|
|      2126000000|           0|
|      2126000000|           0|
|      2126000000|           0|
|      2126000000|           0|
|      2126000000|           0|
|      2126000000|           0|
|      2126000000|           0|
+----------------+------------+
only showing top 20 rows



In [6]:
# Convert PySpark dataframes to Pandas dataframes then check data types
suicides_gdp_df = suicides_gdp_df.toPandas()
suicides_df = suicides_df.toPandas()

suicides_df.info()
suicides_gdp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   country                27820 non-null  object
 1   year                   27820 non-null  object
 2   sex                    27820 non-null  object
 3   age                    27820 non-null  object
 4   suicides_no            27820 non-null  object
 5   population             27820 non-null  object
 6   suicides_per_100k_pop  27820 non-null  object
 7   gdp_for_year_USD       27820 non-null  object
 8   gdp_per_capita_USD     27820 non-null  object
 9   generation             27820 non-null  object
 10  suicide_tier           27820 non-null  object
dtypes: object(11)
memory usage: 2.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            ----------

In [7]:
# Convert datatypes
suicides_df = suicides_df.astype({'year': 'int64', 'suicides_no': 'int64', 'population': 'int64', 'suicides_per_100k_pop': 'float64',
                                  'gdp_for_year_USD': 'float64', 'gdp_per_capita_USD': 'int64', 'suicide_tier': 'int64'})
suicides_gdp_df = suicides_gdp_df.astype('int64')

# Confirm new datatypes
suicides_gdp_df.info()
suicides_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   gdp_for_year_USD  27820 non-null  int64
 1   suicide_tier      27820 non-null  int64
dtypes: int64(2)
memory usage: 434.8 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   country                27820 non-null  object 
 1   year                   27820 non-null  int64  
 2   sex                    27820 non-null  object 
 3   age                    27820 non-null  object 
 4   suicides_no            27820 non-null  int64  
 5   population             27820 non-null  int64  
 6   suicides_per_100k_pop  27820 non-null  float64
 7   gdp_for_year_USD       27820 non-null  float64
 8   gdp_per_capita_USD     27820 non-null 

In [8]:
# Get dummies
country_dummies = pd.get_dummies(suicides_df["country"])
sex_dummies = pd.get_dummies(suicides_df["sex"])
age_dummies = pd.get_dummies(suicides_df["age"])

# Add dummies
suicides_x_df = pd.concat([suicides_gdp_df, country_dummies, sex_dummies, age_dummies], axis=1)
suicides_x_df.head()

Unnamed: 0,gdp_for_year_USD,suicide_tier,Albania,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,Austria,Azerbaijan,...,Uruguay,Uzbekistan,female,male,15-24 years,25-34 years,35-54 years,5-14 years,55-74 years,75+ years
0,2156624900,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,2156624900,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,2156624900,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
3,2156624900,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,2156624900,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


In [9]:
# Set target and feature sets
# Set target and feature sets
y = suicides_gdp_df["suicide_tier"].values
x = suicides_x_df.drop(columns="suicide_tier").values

In [10]:
# Split training/test datasets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [11]:
# Preprocess numerical data for neural network

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
x_scaler = scaler.fit(x_train)

# Scale the data
x_train_scaled = x_scaler.transform(x_train)
x_test_scaled = x_scaler.transform(x_test)

# Prepare and run hyperparameter tuning with Keras Tuner

In [12]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=110,
        step=2), activation=activation, input_dim=110))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=110,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [13]:
# Set up the tuner
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [14]:
# Run the kerastuner search for best hyperparameters
tuner.search(
    x_train_scaled,
    y_train,epochs=20,
    validation_data=(x_test_scaled,y_test)
)

Trial 60 Complete [00h 00m 25s]
val_accuracy: 0.8632638454437256

Best val_accuracy So Far: 0.8670021295547485
Total elapsed time: 00h 13m 23s


In [15]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 99,
 'num_layers': 3,
 'units_0': 75,
 'units_1': 87,
 'units_2': 37,
 'units_3': 43,
 'units_4': 9,
 'units_5': 109,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [16]:
# Review summary of most accurate model
best_model = tuner.get_best_models(1)[0]
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 99)                10989     
                                                                 
 dense_1 (Dense)             (None, 75)                7500      
                                                                 
 dense_2 (Dense)             (None, 87)                6612      
                                                                 
 dense_3 (Dense)             (None, 37)                3256      
                                                                 
 dense_4 (Dense)             (None, 1)                 38        
                                                                 
Total params: 28,395
Trainable params: 28,395
Non-trainable params: 0
_________________________________________________________________


# Create, fit and test model based on prior parameters

In [17]:
# Define the deep learning model
nn_model_tuned = tf.keras.models.Sequential()
nn_model_tuned.add(tf.keras.layers.Dense(units=99, activation="tanh"))
nn_model_tuned.add(tf.keras.layers.Dense(units=75, activation="tanh"))
nn_model_tuned.add(tf.keras.layers.Dense(units=87, activation="tanh"))
nn_model_tuned.add(tf.keras.layers.Dense(units=37, activation="tanh"))
nn_model_tuned.add(tf.keras.layers.Dense(units=1, activation="tanh"))


# Compile the Sequential model together and customize metrics
nn_model_tuned.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [18]:
# Train the model
fit_model_tuned = nn_model_tuned.fit(x_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [19]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model_tuned.evaluate(x_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

218/218 - 0s - loss: -1.5272e+00 - accuracy: 0.8651 - 314ms/epoch - 1ms/step
Loss: -1.5271735191345215, Accuracy: 0.8651329874992371
