<a href="https://colab.research.google.com/github/delecena/-IA-Your-Own-Static-Site-Pt.-1---Writing-HTML/blob/main/Lab2_Data_Representationons_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supervised Learning with RRC

## Loading Dataset


In [None]:
import pandas as pd

# Read the telecom customer table into a dataframe.
DATA_PATH = 'Telecom.csv'          # resolved relative to the current working directory
df_raw = pd.read_csv(DATA_PATH)
df_raw

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


## Data Cleaning

In [None]:
# Feature frame: every column except the label ('Churn') and the
# per-customer identifier ('customerID').
df_clean = df_raw.drop(columns=['Churn', 'customerID'])

# Find which feature has missing values.
# TotalCharges is converted to numeric; errors='coerce' turns any entry that
# cannot be parsed as a number into NaN so that it is counted below.
df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')

# Number of missing values per feature column.
df_clean.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [None]:
# Median imputation: replace every missing TotalCharges with the median of
# the rows where TotalCharges is present.
has_total_charges = df_clean["TotalCharges"].notna()
df_notmissing = df_clean.loc[has_total_charges]
total_charges_median = df_notmissing["TotalCharges"].median()

df_clean["TotalCharges"] = df_clean["TotalCharges"].fillna(total_charges_median)

# Re-check: counts of missing values per column after imputation.
df_clean.isnull().sum()




Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [None]:
# One-hot encode the categorical features. Each listed column is replaced by
# one integer 0/1 indicator column per category; unlisted columns pass through.
# The order of this list determines the order of the generated columns.
categorical_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'DeviceProtection',
    'TechSupport',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'OnlineBackup',
    'StreamingTV',
]
df_encoded = pd.get_dummies(df_clean, columns=categorical_columns, dtype=int)

df_encoded

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes
0,0,1,29.85,29.85,1,0,0,1,1,0,...,0,0,1,0,0,0,1,1,0,0
1,0,34,56.95,1889.50,0,1,1,0,1,0,...,0,0,0,1,1,0,0,1,0,0
2,0,2,53.85,108.15,0,1,1,0,1,0,...,0,0,0,1,0,0,1,1,0,0
3,0,45,42.30,1840.75,0,1,1,0,1,0,...,1,0,0,0,1,0,0,1,0,0
4,0,2,70.70,151.65,1,0,1,0,1,0,...,0,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,0,1,0,1,0,1,...,0,0,0,1,1,0,0,0,0,1
7039,0,72,103.20,7362.90,1,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,1
7040,0,11,29.60,346.45,1,0,0,1,0,1,...,0,0,1,0,1,0,0,1,0,0
7041,1,4,74.40,306.60,0,1,0,1,1,0,...,0,0,0,1,1,0,0,1,0,0


## Data Preparation

### Normalization

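We normalize the features so that they all lie in a similar range. Without this, features with large magnitudes (such as `TotalCharges`) would dominate features with small ones, which makes training a neural network slower and less stable.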

In [None]:
# Min-max scaling of the encoded feature frame.
# The scaler is applied to *every* column of df_encoded, not only tenure,
# MonthlyCharges and TotalCharges.
# NOTE(review): the scaler is fitted on all rows, before the train/validation
# split that happens later in the notebook — confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

df_features = list(df_encoded.columns)      # names of all feature columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_encoded[df_features])

scaled_df = df_encoded.copy()               # leave df_encoded itself untouched
scaled_df[df_features] = scaled_values
X_minmax = scaled_df

X_minmax

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes
0,0.0,0.013889,0.115423,0.001275,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.472222,0.385075,0.215867,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.027778,0.354229,0.010310,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,0.625000,0.239303,0.210241,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.027778,0.521891,0.015330,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.227521,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
7039,0.0,1.000000,0.845274,0.847461,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
7040,0.0,0.152778,0.112935,0.037809,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
7041,1.0,0.055556,0.558706,0.033210,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


## Neural Network

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Single seed shared by the notebook; seeds PyTorch's global random generator.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7a8156672cb0>

In [None]:
class MyNetwork(nn.Module):
    """Fully connected binary classifier that returns one raw logit per sample.

    Architecture: ``in_features -> 35 -> 25 -> 15 -> 1`` with a ReLU after each
    of the three hidden layers. No sigmoid is applied to the output, so the
    result is suitable for a logits-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        in_features: Number of input features per sample. Defaults to 45, the
            width this network was originally hard-coded to.
    """

    def __init__(self, in_features: int = 45):
        super().__init__()
        # Layers are created in forward order (fc1..fc4) and keep their
        # original attribute names, so state_dict keys stay the same.
        self.fc1 = nn.Linear(in_features, 35)  # Input layer to first hidden layer
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(35, 25)  # First hidden layer to second hidden layer
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(25, 15)  # Second hidden layer to third hidden layer
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(15, 1)   # Third hidden layer to output layer

    def forward(self, x):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, 1)`` logits."""
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        return self.fc4(x)

In [None]:
class TelecomDataset(Dataset):
    """Map-style dataset pairing feature rows with binary 'No'/'Yes' labels.

    Args:
        features: pandas DataFrame of numeric features; its ``.values`` are
            converted to a float32 tensor.
        labels: pandas Series containing only the strings 'No' and 'Yes';
            encoded as 0 and 1 respectively in an int64 tensor.

    Raises:
        ValueError: if ``features`` and ``labels`` differ in length, or if
            ``labels`` contains anything other than 'No' / 'Yes' (including
            missing values).
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )

        encoded = labels.map({'No': 0, 'Yes': 1})
        # Series.map yields NaN for values missing from the mapping; casting
        # NaN to an integer tensor would silently produce garbage targets.
        unmapped = encoded.isna()
        if unmapped.any():
            bad_values = sorted({str(v) for v in labels[unmapped]})
            raise ValueError(
                f"labels must be 'No' or 'Yes'; found unexpected values: {bad_values}"
            )

        self.features = torch.tensor(features.values, dtype=torch.float32)
        self.labels = torch.tensor(encoded.values, dtype=torch.long)

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` tensor pair at position ``idx``."""
        return self.features[idx], self.labels[idx]

# Create the dataset
full_dataset = TelecomDataset(X_minmax, df_raw['Churn'])

# Split the dataset into training (80%) and validation (20%) sets
train_dataset, val_dataset = train_test_split(full_dataset, test_size=0.2, random_state=SEED)

# Create DataLoaders for training and validation sets
batch_size = 32
# Training batches are reshuffled every epoch so the optimizer does not see the
# same batches in the same order each time. A dedicated, seeded generator keeps
# the shuffle order reproducible without consuming the global RNG.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
# Validation order does not affect the metrics, so it stays fixed.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")

Number of training samples: 5634
Number of validation samples: 1409


In [None]:
# Components for training: the network, the loss and the optimizer.

learning_rate = 0.001

model = MyNetwork()

# BCEWithLogitsLoss takes raw logits, matching the network's un-activated output.
loss_function = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def train_fn(model, optimizer, loader, loss_fn):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module mapping a feature batch to outputs of shape ``(batch, 1)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        loader: Iterable of ``(x, y)`` batches; ``y`` is a 1-D label tensor.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.

    Returns:
        float: Sum of the per-batch loss values divided by the number of batches.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0  # counted here so loaders without __len__ also work

    for x, y in loader:
        optimizer.zero_grad()
        outputs = model(x)
        # Reshape target to match output shape and convert to float
        loss = loss_fn(outputs, y.float().unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_fn received an empty loader; there is nothing to train on")

    return total_loss / num_batches

In [None]:
# Assuming model, optimizer, loss_function, train_dataloader, and val_dataloader are defined

epochs = 100

for epoch in range(epochs):
    # --- Training: one full pass over the training batches ---
    ave_loss = train_fn(model, optimizer, train_dataloader, loss_function)
    print(f"Epoch ({epoch+1}/{epochs}): Training Loss: {ave_loss:.4f}", end=" ")

    # --- Validation: same loss, evaluation mode, no gradient tracking ---
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for val_x, val_y in val_dataloader:
            val_logits = model(val_x)
            # Targets become float with shape (batch, 1) to match the logits.
            val_targets = val_y.float().unsqueeze(1)
            total_val_loss += loss_function(val_logits, val_targets).item()

    # Mean of the per-batch validation losses.
    ave_val_loss = total_val_loss / len(val_dataloader)
    print(f"Validation Loss: {ave_val_loss:.4f}")

    # Back to training mode before the next epoch.
    model.train()

Epoch (1/100): Training Loss: 0.5266 Validation Loss: 0.4373
Epoch (2/100): Training Loss: 0.4325 Validation Loss: 0.4250
Epoch (3/100): Training Loss: 0.4264 Validation Loss: 0.4208
Epoch (4/100): Training Loss: 0.4234 Validation Loss: 0.4179
Epoch (5/100): Training Loss: 0.4211 Validation Loss: 0.4157
Epoch (6/100): Training Loss: 0.4191 Validation Loss: 0.4137
Epoch (7/100): Training Loss: 0.4174 Validation Loss: 0.4127
Epoch (8/100): Training Loss: 0.4161 Validation Loss: 0.4122
Epoch (9/100): Training Loss: 0.4147 Validation Loss: 0.4123
Epoch (10/100): Training Loss: 0.4134 Validation Loss: 0.4119
Epoch (11/100): Training Loss: 0.4123 Validation Loss: 0.4120
Epoch (12/100): Training Loss: 0.4111 Validation Loss: 0.4123
Epoch (13/100): Training Loss: 0.4101 Validation Loss: 0.4128
Epoch (14/100): Training Loss: 0.4089 Validation Loss: 0.4129
Epoch (15/100): Training Loss: 0.4079 Validation Loss: 0.4131
Epoch (16/100): Training Loss: 0.4069 Validation Loss: 0.4135
Epoch (17/100): T

In [None]:
import torch
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np

# Collect hard 0/1 predictions and true labels over the validation set.
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for xb, yb in val_dataloader:
        probabilities = torch.sigmoid(model(xb))   # logits -> probabilities
        hard_preds = probabilities.round()         # round to the nearest class
        true_labels.extend(yb.cpu().numpy().ravel())
        predicted_labels.extend(hard_preds.cpu().numpy().ravel())

# Convert to numpy arrays
y_true = np.asarray(true_labels)
y_pred = np.asarray(predicted_labels)

# Confusion matrix and F1 score (positive class = 1)
cm = confusion_matrix(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='binary', pos_label=1)

print("Confusion Matrix:\n", cm)
print(f"F1 Score on Validation Set: {f1:.4f}")


Confusion Matrix:
 [[932 104]
 [181 192]]
F1 Score on Validation Set: 0.5740
