In [None]:
# /---------------------------------------------------------------
# Course            : Big Data Analytics
# Course Code       : CDB3034
# Assignment        : 2
# Group             : 1
# Student Name 1    : Chan Seow Fen / 0207368
# Student Name 2    : Cheah Pin Chee / 0197637
# Student Name 3    : Ong Yi Wen / 0207333
# Student Name 4    : Saw Keat Loon / 0207778
# /---------------------------------------------------------------
# Data source: https://openlearning.uowmkdu.edu.my/courses/pg-cbd-3034n-big-data-analysis-jjoshua/data_a2/?cl=1
# Original Data Files: T_Data_C1.csv, T_Data_C2.csv, T_Data_C3.csv
# /---------------------------------------------------------------
from pyspark.sql.functions import col
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler, OneHotEncoder
import matplotlib.pyplot as plt

default_file_store_path = "dbfs:/FileStore/"

# Task 2: Customer Retention Prediction and Display Results
# Load the cleaned data. The location is built from default_file_store_path so
# the storage root is configured in exactly one place.
final_data_file = "BDA_T_data_Final.csv"
df_final = spark.read.csv(default_file_store_path + "tables/" + final_data_file, header=True, inferSchema=True)
# Keep a single row per customer: rows that share a CustomerID are collapsed
# into one (dropDuplicates does not let the caller choose which row is kept).
df_final = df_final.dropDuplicates(subset=["CustomerID"])
# Report the size of the file that was actually loaded; the message previously
# named a different file (BDA_T_data_Merged.csv) than the one read above.
print("Data size of " + final_data_file + ": ", df_final.count(), " rows", " and ", len(df_final.columns), " columns")
display(df_final)

In [None]:
# Task 2.1 Find the number of churn users in the dataset and, based on the output, discuss whether the given dataset is balanced or imbalanced. What are the descriptive statistics for tenure, TotalCharges, and MonthlyCharges?

# Find the total user, churn user, and no churn user in the dataset
total_user = df_final.count()
churn_count = df_final.filter(col("Churn") == "Yes").count()
no_churn_count = df_final.filter(col("Churn") == "No").count()
print("Total number of users in the dataset     : ", total_user)
print("Number of churn users in the dataset     : ", churn_count)
print("Number of no churn users in the dataset  : ", no_churn_count)

# Calculate ratio (as a percentage of all users). An empty dataset yields 0.0
# instead of raising ZeroDivisionError.
churn_ratio = churn_count / total_user * 100 if total_user else 0.0
no_churn_ratio = no_churn_count / total_user * 100 if total_user else 0.0
print(f"Churn ratio in the dataset               :  {churn_ratio:.2f}%")
print(f"No churn ratio in the dataset            :  {no_churn_ratio:.2f}%")


# Plot pie chart with value labeled to visualize the churn ratio with matplotlib
labels = 'Churn', 'No Churn'
sizes = [churn_count, no_churn_count]
colors = ['royalblue', 'lightcoral']
explode = (0.1, 0)  # explode 1st slice
labelled_total = sum(sizes)  # hoisted so it is not recomputed for every slice


def format_pie_label(pct):
    """Return the label for one pie slice: its percentage and its absolute count.

    Args:
        pct: Share of the slice in percent, as passed by matplotlib's ``autopct``.

    Returns:
        A two-line string such as ``"26.5%\\n(1869)"``.
    """
    # The count is rebuilt from a floating-point percentage, so it can land just
    # below the true integer; round to nearest instead of truncating with int().
    slice_count = int(round(float(pct) / 100 * labelled_total))
    return f"{pct:.1f}%\n({slice_count})"


if labelled_total > 0:
    plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct=format_pie_label, startangle=140)
    plt.axis('equal')
    plt.title("Churn Ratio")
    plt.show()
else:
    # A pie chart of all-zero slices is undefined, so skip it.
    print("No rows with Churn equal to 'Yes' or 'No'; skipping the churn ratio pie chart")

# Descriptive statistics for tenure, totalcharges, and monthly charges
display(df_final.describe(["tenure", "TotalCharges", "MonthlyCharges"]))


def collect_column_values(df, column):
    """Collect the non-null values of one DataFrame column to the driver.

    Args:
        df: Spark DataFrame to read from.
        column: Name of the column to collect.

    Returns:
        A Python list holding the column's values, with null entries dropped
        (``plt.hist`` cannot bin ``None``).
    """
    rows = df.select(column).na.drop().collect()
    return [row[0] for row in rows]


def plot_distribution(values, feature_label, bins=30):
    """Draw a histogram of ``values`` and annotate each bar with its count.

    Args:
        values: Sequence of values to bin.
        feature_label: Text used for the x-axis label and the chart title.
        bins: Number of histogram bins.

    Returns:
        The ``(counts, bin_edges, patches)`` tuple returned by ``plt.hist``.
    """
    counts, bin_edges, patches = plt.hist(values, bins=bins, color='royalblue', edgecolor='white', linewidth=0.2)
    plt.title(f"{feature_label} Distribution")
    plt.xlabel(feature_label)
    plt.ylabel("Frequency")
    # Display the count for each bin, anchored at the bin's left edge.
    # bin_edges has one more entry than counts; zip stops at the shorter one.
    for count, left_edge in zip(counts, bin_edges):
        plt.text(left_edge, count, str(int(count)), fontsize=8, color='black', weight='bold')
    plt.show()
    return counts, bin_edges, patches


# Insight with histogram for each feature and visualize the distribution of tenure, totalcharges, and monthly charges
# For Tenure
tenure = collect_column_values(df_final, "tenure")
counts, bins, patches = plot_distribution(tenure, "Tenure")
# For TotalCharges
totalcharges = collect_column_values(df_final, "TotalCharges")
counts, bins, patches = plot_distribution(totalcharges, "TotalCharges")
# For MonthlyCharges
monthlycharges = collect_column_values(df_final, "MonthlyCharges")
counts, bins, patches = plot_distribution(monthlycharges, "MonthlyCharges")



In [None]:
# Task 2.2 Analyse a) gender (Male, Female) against churn, and b) SeniorCitizen against churn. Of the two, which is the more suitable attribute to keep for further analysis? Does tenure correlate with customers churning? Pivot the values and plot them for better representation.

In [None]:
# Task 2.3 Use a model-building process with Databricks for the given data: create suitable training and evaluation datasets, then build a pipeline with suitable transformers (OneHotEncoder, StringIndexer, VectorAssembler).
# Build the modelling frame from the de-duplicated data loaded at the top of the
# notebook. spark_dr was previously derived from itself without ever being
# defined, so this cell failed on a fresh kernel and was not safe to re-run.
# NOTE(review): df_final is assumed to be the intended source - confirm.
spark_dr = df_final.drop("CustomerID")

# Cast the numeric features to double so the Imputer and later ML stages can
# consume them.
numeric_columns = ["tenure", "MonthlyCharges", "TotalCharges"]
for numeric_column in numeric_columns:
    spark_dr = spark_dr.withColumn(numeric_column, col(numeric_column).cast("double"))

# Replace missing TotalCharges values in place with the column mean.
imputer = Imputer(inputCols=["TotalCharges"], outputCols=["TotalCharges"], strategy="mean")  # Or "median"

# NOTE(review): the imputer is fitted on the full dataset before the split, so
# test rows contribute to the imputed mean; consider fitting on trainDF only.
spark_dr = imputer.fit(spark_dr).transform(spark_dr)
# Fixed seed keeps the 80/20 split reproducible across runs.
trainDF, testDF = spark_dr.randomSplit([0.8, 0.2], seed=42)
print(trainDF.cache().count())  # Cache because accessing training data multiple times
print(testDF.count())

In [None]:
# Task 2.4 Perform the feature engineering using the transformers Imputer, StringIndexer, and QuantileDiscretizer. Use VectorAssembler to provide the input to the model; take note to use suitable numerical columns to end the feature engineering stage.

In [None]:
# Task 2.5 Combine the pipeline fit, configure the train and test data, and apply a suitable regression model for the label and features columns. Execute the model and print the accuracy results, and plot the output with the area under the ROC curve. Evaluate the model with BinaryClassificationMetrics to analyze the predictions.