# Smoker Dataset
## Module 10.13 Assignment 10: Final Project
Dec. 2024

Authors: Tyler Earps, Ryan Smith, Basil Mullings, & Ean Vandergraaf

### Abstract and Data Description

https://www.kaggle.com/competitions/playground-series-s3e24/data?select=train.csv

### Stage 0: Import Data

In [None]:
# Download the data
# https://www.kaggle.com/competitions/playground-series-s3e24/data?select=train.csv

# Extract train.csv & test.csv to folder "./data" in the same directory as repo

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import count, countDistinct, format_number, when, col, explode, lower

# Obtain the Spark session used by every cell below.
spark = SparkSession.builder.getOrCreate()


def csvToDF(fileName):
    """Load the CSV file at ``fileName`` into a Spark DataFrame.

    The reader is configured with the "header" option set to "true" and the
    double-quote character as the "escape" option.
    """
    csv_reader = spark.read.format("csv")
    csv_reader = csv_reader.option("header", "true")
    csv_reader = csv_reader.option('escape', '"')
    return csv_reader.load(fileName)
    
# Read the Kaggle train and test splits.
df_train = csvToDF("data/train.csv")
df_test = csvToDF("data/test.csv")

# Cast every column of both splits to double.
df_train = df_train.select(*(col(name).cast('double') for name in df_train.columns))
df_test = df_test.select(*(col(name).cast('double') for name in df_test.columns))

# Summary statistics for the training split.
print('#=> Summary of statistics for our Training data :')
training_summary = df_train.summary()
training_summary.show()
print("\n")

# Schema of the training split.
print('#=> Training data schema:')
df_train.printSchema()
print("\n")

# Plain list of the column names.
print('#=> Column names:')
training_columns = df_train.columns
print(training_columns)
print("\n")

# Per-column count of null entries in the training split.
print('#=> Training dataset with Null values:')
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_train.columns
]
df_train.select(null_count_exprs).show()
print("\n")

### Stage 1: Data Preparation

<i>EDA & Any adjustments to clean the data</i>

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Find most important features
# Do some box and whisker charts
# Decide if any transformations need to be made

# Print the describe() statistics table for the training data.
basic_stats = df_train.describe()
basic_stats.show()

def box_chart(df, title):
    """Draw a seaborn box plot of ``df`` with the given ``title`` and show it.

    The figure is 6 units tall and 2 units wide per column of ``df``.
    """
    figure_width = len(df.columns) * 2
    plt.figure(figsize=(figure_width, 6))
    sns.boxplot(data=df)
    plt.title(title)
    plt.show()

## Distributions
# Each group of related columns is converted to pandas and drawn with box_chart.
def _training_columns_to_pandas(*column_names):
    """Return the named training columns as a pandas DataFrame."""
    return df_train.select(*column_names).toPandas()


print('Distributions within the dataset')
df_train_age = _training_columns_to_pandas('age')
box_chart(df_train_age, "Distributions of Age")

df_train_measurements = _training_columns_to_pandas(
    'height(cm)', 'weight(kg)', 'waist(cm)'
)
box_chart(df_train_measurements, "Distributions of Measurements")

df_train_senses = _training_columns_to_pandas(
    'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)'
)
box_chart(df_train_senses, "Distributions of Senses")

df_train_lifestyle = _training_columns_to_pandas(
    'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride'
)
box_chart(df_train_lifestyle, "Distributions of Lifestyle Factors")

## Binary Analysis
# Count the rows for each value of the smoking flag and plot the split as a pie.
labels = ['Smokers', 'Non-Smokers']
df_train_smokers = df_train.filter('smoking == 1').count()
df_train_non_smokers = df_train.filter('smoking == 0').count()
y = np.array([df_train_smokers, df_train_non_smokers])  # same order as labels
plt.title('Breakdown of Smokers vs Non-Smokers in Data')
plt.pie(y, labels=labels, startangle=90)
plt.show()


### Stage 2: Feature Extraction

In [None]:
from pyspark.ml.feature import FeatureHasher
from pyspark.sql.functions import concat

# Split data from label
x_features = ['age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic', 'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride', 'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST', 'ALT', 'Gtp', 'dental caries']
y_features = ['smoking']

# df_train_x = df_train.select(x_features)
# df_train_y = df_train.select(y_features)
# Scale the data

# Use FeatureHasher to combine features into one column
# NOTE(review): every input column here is numeric; VectorAssembler may be a
# more direct way to build the vector -- confirm before switching, since it
# would change the feature vectors the models are trained on.
hasher = FeatureHasher(inputCols=x_features, outputCol="features")

# The "smoking" column is copied to "label" so the ML estimators can find it
# under the column name they are configured with in the next stage.
df_train_x = hasher.transform(df_train.withColumn("label", df_train.smoking))
df_train_x = df_train_x.select(['features', 'label'])

# Now display 2 rows of the processed train data.
# Preview with show() rather than collect(): only the requested rows are
# brought back for display instead of the whole dataset.
print("\n")
print('#=> Displaying processed train data:')
df_train_x.show(n=2, truncate=False)

# Use RandomForestClassifier for feature importance ranking

### Stage 3: Machine Learning Algorithm Preparation and Parameter Tuning

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, NaiveBayes

#=> Stage 3.7 - Perform our train/test split
##Split the training data into training and test sets (80% training, 20% test)
X_train, X_test = df_train_x.randomSplit([0.8, 0.2], seed=42)

##Our list of ML models
ml_models = {
    "randomForest_model": RandomForestClassifier(),
    "decisionTree_model": DecisionTreeClassifier(),
    "logisticReg_model": LogisticRegression(),
    "naive_bayes_model": NaiveBayes(smoothing=0.5),
}

#=> Stage 3.8
##The evaluator does not depend on the model, so build it once for all of them.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

##Catalog the "smoking" prediction results of our models.
##Maps model name -> {"accuracy": float, "y_pred": predictions DataFrame}.
results = {}
for model_name, model in ml_models.items():
    ##Proceed to train the model.
    trained_model = model.fit(X_train)

    ##Do the prediction
    y_pred = trained_model.transform(X_test)

    ##Do the evaluation.
    accuracy = evaluator.evaluate(y_pred)
    results[model_name] = {"accuracy": accuracy, "y_pred": y_pred}
    print(f"{model_name}: {accuracy=}")

    ##Check the predictions.
    y_pred.select("label", "prediction").show(5)

##Pick the model with the highest accuracy. max() returns the first entry on
##ties, and always binds a name even when every accuracy is 0.
best_model = max(results, key=lambda name: results[name]["accuracy"])
##Keep both names in sync so either one can be used afterwards.
best_performing_model = best_model
highest_accuracy = results[best_model]["accuracy"]

print(f'The Best performing model: {best_model}={highest_accuracy}')


### Stage 4: Model Evaluation

### Stage 5: Visualization

### Limitations, Future Work, and Conclusion

### Overall Performance and Documentation