In [8]:
import pyspark

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [10]:
spark = SparkSession.builder.appName('ChurnPrediction').getOrCreate()

In [11]:
df = spark.read.option('header', 'true').csv('../data/raw/data.csv', inferSchema=True)

In [12]:
df.show()

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|     OnlineSecurity|       OnlineBackup|   DeviceProtection|        TechSupport|        StreamingTV|    StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|  

In [13]:
#Drop column ID
df = df.drop('customerID')

In [14]:
#Drop Missing Values
df = df.na.drop()

In [15]:
#Drop Duplicates
df = df.dropDuplicates()

In [16]:
from pyspark.sql.functions import when, col
df = df.withColumn("gender", when(col("gender") == "Male", 1).otherwise(0))
df = df.withColumn("Partner", when(col("Partner") == "Yes", 1).otherwise(0))
df = df.withColumn("PhoneService", when(col("PhoneService") == "Yes", 1).otherwise(0))
df = df.withColumn("OnlineSecurity", when(col("OnlineSecurity") == "Yes", 1).otherwise(0))
df = df.withColumn("OnlineBackup", when(col("OnlineBackup") == "Yes", 1).otherwise(0))
df = df.withColumn("TechSupport", when(col("TechSupport") == "Yes", 1).otherwise(0))
df = df.withColumn("StreamingTV", when(col("TechSupport") == "Yes", 1).otherwise(0))
df = df.withColumn("StreamingMovies", when(col("StreamingMovies") == "Yes", 1).otherwise(0))
df = df.withColumn("PaperlessBilling", when(col("PaperlessBilling") == "Yes", 1).otherwise(0))
df = df.withColumn("Churn", when(col("Churn") == "Yes", 1).otherwise(0))

In [19]:
df = df.withColumn("Dependents", when(col("Dependents") == "Yes", 1).otherwise(0))

In [20]:
df.show()

+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+-------------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|OnlineSecurity|OnlineBackup|   DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+-------------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|     1|            0|      1|         1|    10|           1|              No|            DSL|             0|           1|                 No|          0|          0|              0|      On

In [21]:
# Generate timestamp
from datetime import datetime
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Output path
output_path = f"../data/processed/data_{timestamp}.csv"

# Save DataFrame as single CSV
df.toPandas().to_csv(output_path)

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [22]:
tc_folder = "../data/processed/"
df = pd.read_csv(tc_folder + "data_20241231_153937.csv")

In [23]:
df

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1,0,1,1,10,1,No,DSL,0,...,No,0,0,0,One year,0,Mailed check,49.55,475.7,0
1,1,0,0,0,0,31,0,No phone service,DSL,1,...,No,1,0,1,Two year,1,Bank transfer (automatic),49.85,1520.1,0
2,2,0,0,1,1,38,1,No,DSL,1,...,No,1,0,1,One year,0,Credit card (automatic),80.30,3058.65,1
3,3,0,0,0,0,68,1,Yes,DSL,1,...,Yes,1,0,1,Two year,1,Electronic check,88.15,6148.45,0
4,4,0,1,1,0,72,1,Yes,Fiber optic,0,...,Yes,1,0,1,Two year,1,Credit card (automatic),110.80,7882.25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7016,7016,1,0,1,0,23,1,No,DSL,1,...,No,0,0,1,Month-to-month,1,Bank transfer (automatic),59.95,1406,0
7017,7017,0,1,1,0,42,1,Yes,Fiber optic,0,...,Yes,0,0,0,Month-to-month,1,Electronic check,95.55,3930.6,0
7018,7018,0,0,0,0,10,1,No,No,0,...,No internet service,0,0,0,One year,1,Mailed check,19.80,198.25,0
7019,7019,1,0,0,0,1,1,No,No,0,...,No internet service,0,0,0,Month-to-month,0,Electronic check,20.65,20.65,0


In [3]:
# demi kesehatan mental, mager ngubah datasetnya
X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_redundant=8, 
                           weights=[0.9, 0.1], flip_y=0, random_state=42)

np.unique(y, return_counts=True)

(array([0, 1]), array([900, 100], dtype=int64))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [5]:
# Define the model hyperparameters
params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "multi_class": "auto",
    "random_state": 8888,
}

# Train the model
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       270
           1       0.62      0.50      0.56        30

    accuracy                           0.92       300
   macro avg       0.79      0.73      0.76       300
weighted avg       0.91      0.92      0.92       300



In [6]:
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_dict

{'0': {'precision': 0.9456521739130435,
  'recall': 0.9666666666666667,
  'f1-score': 0.956043956043956,
  'support': 270.0},
 '1': {'precision': 0.625,
  'recall': 0.5,
  'f1-score': 0.5555555555555556,
  'support': 30.0},
 'accuracy': 0.92,
 'macro avg': {'precision': 0.7853260869565217,
  'recall': 0.7333333333333334,
  'f1-score': 0.7557997557997558,
  'support': 300.0},
 'weighted avg': {'precision': 0.9135869565217392,
  'recall': 0.92,
  'f1-score': 0.915995115995116,
  'support': 300.0}}

In [7]:
import mlflow

In [9]:
mlflow.set_experiment("First Experiment")
mlflow.set_tracking_uri("http://localhost:5000")

with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'recall_class_0': report_dict['0']['recall'],
        'recall_class_1': report_dict['1']['recall'],
        'f1_score_macro': report_dict['macro avg']['f1-score']
    })
    mlflow.sklearn.log_model(lr, "Logistic Regression") 

UnsupportedModelRegistryStoreURIException:  Model registry functionality is unavailable; got unsupported URI 'localhost:5000' for model registry data storage. Supported URI schemes are: ['', 'file', 'databricks', 'databricks-uc', 'uc', 'http', 'https', 'postgresql', 'mysql', 'sqlite', 'mssql']. See https://www.mlflow.org/docs/latest/tracking.html#storage for how to run an MLflow server against one of the supported backend storage locations.