In [10]:
import pandas as pd

In [11]:
def numerize_csv(path: str):
    ''' Takes in a path to a project csv and converts its entries to numerical '''
    df = pd.read_csv(path)
    df['gender'] = (df['gender'] == 'Female').astype(int)

    for header in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', \
                'Discontinued']:
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', \
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']:
        # some here have the value no phone/internet service, which are casted to 0
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MonthlyCharges', 'TotalCharges', 'tenure']:
        # lines that need normalization
        df[header] /= (max(df[header]) - min(df[header]))

    df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No': 0})
    df['Contract'] = df['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month': 0})
    # Note that the PaymentMethod column contains some entries that are marked automatic
    # that's probably correlated with discontinuation in some way.
    df['PaymentMethod'] = df['PaymentMethod'].map({
        'Credit card (automatic)': 3,
        'Electronic check': 2,
        'Bank transfer (automatic)': 1,
        'Mailed check': 0})
    df.drop('customerID', axis=1, inplace=True)
    df.dropna(inplace=True)
    return df
    
raw = numerize_csv('train.csv')
raw

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Discontinued
0,1,0,0,0,0.013889,1,0,2,0,0,0,0,0,0,0,0,2,0.701493,0.008135,1
1,1,0,0,0,0.208333,1,0,1,0,0,0,0,1,0,1,0,0,0.558706,0.107536,0
2,0,0,1,1,0.166667,1,0,1,0,0,0,0,0,1,1,0,3,0.533831,0.080355,1
3,1,0,0,0,0.305556,1,0,2,0,1,0,0,0,1,0,1,0,0.838806,0.214131,1
4,0,0,0,0,0.027778,0,0,1,0,0,1,0,1,1,0,1,2,0.490050,0.010426,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,1,1,0,1,0.013889,1,0,2,0,0,0,0,0,0,0,1,2,0.706468,0.008193,1
5339,1,0,1,0,1.000000,1,1,2,1,0,1,1,1,0,2,1,1,0.977114,0.809594,0
5340,0,1,0,0,0.069444,1,0,2,0,0,0,0,0,0,0,1,0,0.699005,0.038299,1
5341,0,0,1,1,0.638889,1,0,0,0,0,0,0,0,0,1,1,1,0.200995,0.097577,0


In [12]:
def combine_related_columns(df: pd.DataFrame):
    ''' takes in a project dataframe and combines its related rows '''
    df_cpy = df
    PHONE_SERVICE_WEIGHT = 0.7
    TV_STREAM_WEIGHT = 0.5
    SECURITY_WEIGHTS = {
        'security': 0.25,
        'backup': 0.25,
        'protection': 0.25,
        'support': 0.25
    }
    df_cpy['PhoneUsageScore'] = df_cpy.pop('PhoneService').values * PHONE_SERVICE_WEIGHT \
                                + df_cpy.pop('MultipleLines').values * (1 - PHONE_SERVICE_WEIGHT)
    df_cpy['InternetSecurityScore'] = df_cpy.pop('OnlineSecurity').values * SECURITY_WEIGHTS.get('security') \
                                        + df_cpy.pop('OnlineBackup').values * SECURITY_WEIGHTS.get('backup') \
                                        + df_cpy.pop('DeviceProtection').values * SECURITY_WEIGHTS.get('protection') \
                                        + df_cpy.pop('TechSupport').values * SECURITY_WEIGHTS.get('support')
    df_cpy['InternetStreamingScore'] = df_cpy.pop('StreamingTV').values * TV_STREAM_WEIGHT \
                                        + df_cpy.pop('StreamingMovies').values * (1 - TV_STREAM_WEIGHT)
    df_cpy.insert(0, 'Discontinued', df_cpy.pop('Discontinued'))
    return df_cpy

combined=combine_related_columns(raw)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torchvision
import torch.nn as nn


# Define the model
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(14, 100),  
    nn.Sigmoid(),
    nn.Dropout(0.5),
    nn.Linear(100, 256),   
    nn.Sigmoid(),
    nn.Dropout(0.5),
    nn.Linear(256, 128),
    nn.Linear(128, 64),   
    nn.Sigmoid(),
    nn.Dropout(0.5),
    nn.Linear(64, 40),   
    nn.Sigmoid(),
    nn.Dropout(0.5),
    nn.Linear(40, 1),   
)


print(model)



# Parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

#Data
Y = combined['Discontinued']
X = combined.drop('Discontinued', axis=1)  
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, train_size = .75)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)  
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32) 

y_train_tensor = y_train_tensor.view(-1, 1) 
y_test_tensor = y_test_tensor.view(-1, 1)

print(len(y_test_tensor))

# Training
num_epochs = 10000
for epoch in range(num_epochs):
    model.train()  
    optimizer.zero_grad()  

    outputs = model(X_train_tensor) 
    loss = loss_fn(outputs, y_train_tensor)  
    loss.backward()  
    optimizer.step() 
    model.eval() 
    with torch.no_grad():
        outputs = model(X_test_tensor)
        test_loss = loss_fn(outputs, y_test_tensor)
        print(f'Accuracy: {1-test_loss.item()}')





Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=14, out_features=100, bias=True)
  (2): Sigmoid()
  (3): Dropout(p=0.5, inplace=False)
  (4): Linear(in_features=100, out_features=256, bias=True)
  (5): Sigmoid()
  (6): Dropout(p=0.5, inplace=False)
  (7): Linear(in_features=256, out_features=128, bias=True)
  (8): Linear(in_features=128, out_features=64, bias=True)
  (9): Sigmoid()
  (10): Dropout(p=0.5, inplace=False)
  (11): Linear(in_features=64, out_features=40, bias=True)
  (12): Sigmoid()
  (13): Dropout(p=0.5, inplace=False)
  (14): Linear(in_features=40, out_features=1, bias=True)
)
1334
Accuracy: 0.7442610859870911
Accuracy: 0.772470161318779
Accuracy: 0.7932082861661911
Accuracy: 0.8063072264194489
Accuracy: 0.8119241893291473
Accuracy: 0.8112303614616394
Accuracy: 0.8068308234214783
Accuracy: 0.8017328977584839
Accuracy: 0.7981366366147995
Accuracy: 0.7971846610307693
Accuracy: 0.7984972596168518
Accuracy: 0.8012115806341171
Accuracy: 0.80454529

KeyboardInterrupt: 