In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def numerize_csv(path: str):
    ''' Takes in a path to a project csv and converts its entries to numerical

    Encoding applied to the frame read from ``path``:
      * ``gender``: 1 for 'Female', 0 for anything else.
      * Yes/No style columns: 1 for 'Yes', 0 for anything else (this includes
        the "No phone service" / "No internet service" values and missing
        entries, because only an exact 'Yes' compares equal).
      * ``MonthlyCharges``, ``TotalCharges``, ``tenure``: divided by the
        column's range (max - min). The minimum is not subtracted, so values
        are scaled but not shifted to start at 0.
      * ``InternetService``, ``Contract``, ``PaymentMethod``: mapped to small
        integer codes; unknown categories become NaN.

    The ``customerID`` column is removed and every row that still contains a
    missing value is dropped before the frame is returned.
    '''
    df = pd.read_csv(path)
    df['gender'] = (df['gender'] == 'Female').astype(int)

    for header in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
                   'Discontinued']:
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']:
        # some here have the value no phone/internet service, which are casted to 0
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MonthlyCharges', 'TotalCharges', 'tenure']:
        # lines that need normalization
        # Coerce first so stray non-numeric cells become NaN (and are dropped
        # below) instead of breaking the arithmetic.
        column = pd.to_numeric(df[header], errors='coerce')
        # Series.max()/min() skip NaN. The builtin max()/min() return NaN when
        # the first element is NaN, which would turn the whole column into NaN
        # and make dropna() discard every row.
        span = column.max() - column.min()
        # A constant column has zero range; leave it unscaled rather than
        # dividing by zero.
        df[header] = column / span if span != 0 else column

    df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No': 0})
    df['Contract'] = df['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month': 0})
    # Note that the PaymentMethod column contains some entries that are marked automatic
    # that's probably correlated with discontinuation in some way.
    df['PaymentMethod'] = df['PaymentMethod'].map({
        'Credit card (automatic)': 3,
        'Electronic check': 2,
        'Bank transfer (automatic)': 1,
        'Mailed check': 0})
    return df.drop(columns='customerID').dropna()
    
# Load the training CSV as an all-numeric frame and show it as the cell output.
raw = numerize_csv(path='train.csv')
raw

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Discontinued
0,1,0,0,0,0.013889,1,0,2,0,0,0,0,0,0,0,0,2,0.701493,0.008135,1
1,1,0,0,0,0.208333,1,0,1,0,0,0,0,1,0,1,0,0,0.558706,0.107536,0
2,0,0,1,1,0.166667,1,0,1,0,0,0,0,0,1,1,0,3,0.533831,0.080355,1
3,1,0,0,0,0.305556,1,0,2,0,1,0,0,0,1,0,1,0,0.838806,0.214131,1
4,0,0,0,0,0.027778,0,0,1,0,0,1,0,1,1,0,1,2,0.490050,0.010426,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,1,1,0,1,0.013889,1,0,2,0,0,0,0,0,0,0,1,2,0.706468,0.008193,1
5339,1,0,1,0,1.000000,1,1,2,1,0,1,1,1,0,2,1,1,0.977114,0.809594,0
5340,0,1,0,0,0.069444,1,0,2,0,0,0,0,0,0,0,1,0,0.699005,0.038299,1
5341,0,0,1,1,0.638889,1,0,0,0,0,0,0,0,0,1,1,1,0.200995,0.097577,0


In [3]:
def combine_related_columns(df: pd.DataFrame):
    ''' takes in a project dataframe and combines its related columns

    Returns a new frame in which groups of related 0/1 columns are replaced
    by a single weighted score:
      * ``PhoneUsageScore``: PhoneService and MultipleLines.
      * ``InternetSecurityScore``: OnlineSecurity, OnlineBackup,
        DeviceProtection and TechSupport, equally weighted.
      * ``InternetStreamingScore``: StreamingTV and StreamingMovies.

    The source columns are removed from the result and ``Discontinued`` is
    moved to the first position. The frame passed in is left unmodified.
    '''
    # Work on a real copy: a plain alias would pop the columns out of the
    # caller's frame, so a second call on the same frame raised KeyError.
    df_cpy = df.copy()
    PHONE_SERVICE_WEIGHT = 0.7
    TV_STREAM_WEIGHT = 0.5
    SECURITY_WEIGHTS = {
        'security': 0.25,
        'backup': 0.25,
        'protection': 0.25,
        'support': 0.25
    }

    # pop() removes each source column as it is read; .values takes the raw
    # arrays so the sums are positional.
    phone_score = (
        df_cpy.pop('PhoneService').values * PHONE_SERVICE_WEIGHT
        + df_cpy.pop('MultipleLines').values * (1 - PHONE_SERVICE_WEIGHT)
    )
    df_cpy['PhoneUsageScore'] = phone_score

    security_score = (
        df_cpy.pop('OnlineSecurity').values * SECURITY_WEIGHTS['security']
        + df_cpy.pop('OnlineBackup').values * SECURITY_WEIGHTS['backup']
        + df_cpy.pop('DeviceProtection').values * SECURITY_WEIGHTS['protection']
        + df_cpy.pop('TechSupport').values * SECURITY_WEIGHTS['support']
    )
    df_cpy['InternetSecurityScore'] = security_score

    streaming_score = (
        df_cpy.pop('StreamingTV').values * TV_STREAM_WEIGHT
        + df_cpy.pop('StreamingMovies').values * (1 - TV_STREAM_WEIGHT)
    )
    df_cpy['InternetStreamingScore'] = streaming_score

    # Put the target column first.
    df_cpy.insert(0, 'Discontinued', df_cpy.pop('Discontinued'))
    return df_cpy

# Collapse the related service columns of the numeric frame into score columns.
combined = combine_related_columns(df=raw)

In [11]:
# Data
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Separate the target column from the feature columns.
X = combined.drop(columns=['Discontinued'])
Y = combined.loc[:, 'Discontinued']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.75,
    random_state=0,
)

In [8]:
# Sweep the depth of the AdaBoost base tree and report train/test metrics for each depth.
max_depths = [1, 2, 3, 4, 5]
for depth in max_depths:
    print(f"Depth: {depth}")

    # 100 boosted entropy-criterion trees of the current depth.
    base_estimator = DecisionTreeClassifier(criterion="entropy", max_depth=depth)
    clf = AdaBoostClassifier(estimator=base_estimator, n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)

    # Hard labels, then class probabilities, for both splits.
    y_test_preds, y_train_preds = clf.predict(X_test), clf.predict(X_train)
    y_test_prob_preds, y_train_prob_preds = clf.predict_proba(X_test), clf.predict_proba(X_train)

    # Test-split metrics (column 1 of predict_proba is passed as the score).
    roc_auc_testing_accuracy = roc_auc_score(y_test, y_test_preds)
    prob_roc_auc_testing_accuracy = roc_auc_score(y_test, y_test_prob_preds[:, 1])
    testing_accuracy = accuracy_score(y_test, y_test_preds)

    # Train-split metrics, for comparison against the test split.
    roc_auc_training_accuracy = roc_auc_score(y_train, y_train_preds)
    prob_roc_auc_training_accuracy = roc_auc_score(y_train, y_train_prob_preds[:, 1])
    training_accuracy = accuracy_score(y_train, y_train_preds)

    for line in (
        f'ROC AUC Testing Accuracy: {roc_auc_testing_accuracy}',
        f'ROC AUC Training Accuracy: {roc_auc_training_accuracy}',
        f'ROC AUC Testing Accuracy with Probabilities: {prob_roc_auc_testing_accuracy}',
        f'ROC AUC Training Accuracy with Probabilities: {prob_roc_auc_training_accuracy}',
        f'Testing Accuracy: {testing_accuracy}',
        f'Training Accuracy: {training_accuracy}',
    ):
        print(line)

# Depth 1 seems to be best; deeper base trees overfit quickly, so maybe RandomForest is the way to go

Depth: 1
ROC AUC Testing Accuracy: 0.7119640718562874
ROC AUC Training Accuracy: 0.7317032978252296
ROC AUC Testing Accuracy with Probabilities: 0.844504491017964
ROC AUC Training Accuracy with Probabilities: 0.8640462264716805
Testing Accuracy: 0.7998500749625187
Training Accuracy: 0.8145463634091478
Depth: 2
ROC AUC Testing Accuracy: 0.6895
ROC AUC Training Accuracy: 0.7799230324524584
ROC AUC Testing Accuracy with Probabilities: 0.8038473053892214
ROC AUC Training Accuracy with Probabilities: 0.9175427239441689
Testing Accuracy: 0.7841079460269865
Training Accuracy: 0.8407898025493626
Depth: 3
ROC AUC Testing Accuracy: 0.6899640718562874
ROC AUC Training Accuracy: 0.8797249664040675
ROC AUC Testing Accuracy with Probabilities: 0.7759491017964073
ROC AUC Training Accuracy with Probabilities: 0.973400667707194
Testing Accuracy: 0.7668665667166417
Training Accuracy: 0.9112721819545113
Depth: 4
ROC AUC Testing Accuracy: 0.6615089820359281
ROC AUC Training Accuracy: 0.9937879429845438
RO

In [12]:
# Sweep the number of boosting rounds (depth-1 base trees) and report train/test metrics.
num_estimators = [10, 25, 50, 100, 200, 400, 700, 1000]
for num in num_estimators:
    print(f"Number of Estimators: {num}")

    # Entropy-criterion stumps boosted for `num` rounds.
    base_estimator = DecisionTreeClassifier(criterion="entropy", max_depth=1)
    clf = AdaBoostClassifier(estimator=base_estimator, n_estimators=num, random_state=0)
    clf.fit(X_train, y_train)

    # Hard labels, then class probabilities, for both splits.
    y_test_preds, y_train_preds = clf.predict(X_test), clf.predict(X_train)
    y_test_prob_preds, y_train_prob_preds = clf.predict_proba(X_test), clf.predict_proba(X_train)

    # Test-split metrics (column 1 of predict_proba is passed as the score).
    roc_auc_testing_accuracy = roc_auc_score(y_test, y_test_preds)
    prob_roc_auc_testing_accuracy = roc_auc_score(y_test, y_test_prob_preds[:, 1])
    testing_accuracy = accuracy_score(y_test, y_test_preds)

    # Train-split metrics, for comparison against the test split.
    roc_auc_training_accuracy = roc_auc_score(y_train, y_train_preds)
    prob_roc_auc_training_accuracy = roc_auc_score(y_train, y_train_prob_preds[:, 1])
    training_accuracy = accuracy_score(y_train, y_train_preds)

    for line in (
        f'ROC AUC Testing Accuracy: {roc_auc_testing_accuracy}',
        f'ROC AUC Training Accuracy: {roc_auc_training_accuracy}',
        f'ROC AUC Testing Accuracy with Probabilities: {prob_roc_auc_testing_accuracy}',
        f'ROC AUC Training Accuracy with Probabilities: {prob_roc_auc_training_accuracy}',
        f'Testing Accuracy: {testing_accuracy}',
        f'Training Accuracy: {training_accuracy}',
    ):
        print(line)

# 200 seems to be best; beyond that the model is overfitting on the training data

Number of Estimators: 10
ROC AUC Testing Accuracy: 0.7029640718562875
ROC AUC Training Accuracy: 0.7155324237439332
ROC AUC Testing Accuracy with Probabilities: 0.8277500000000001
ROC AUC Training Accuracy with Probabilities: 0.8404403779079784
Testing Accuracy: 0.7863568215892054
Training Accuracy: 0.8012996750812297
Number of Estimators: 25
ROC AUC Testing Accuracy: 0.7124520958083832
ROC AUC Training Accuracy: 0.7270501176017162
ROC AUC Testing Accuracy with Probabilities: 0.8396946107784431
ROC AUC Training Accuracy with Probabilities: 0.8530741075861082
Testing Accuracy: 0.7946026986506747
Training Accuracy: 0.8055486128467882
Number of Estimators: 50
ROC AUC Testing Accuracy: 0.7049790419161677
ROC AUC Training Accuracy: 0.7248919059645702
ROC AUC Testing Accuracy with Probabilities: 0.8412604790419163
ROC AUC Training Accuracy with Probabilities: 0.8588252973224073
Testing Accuracy: 0.7968515742128935
Training Accuracy: 0.8080479880029993
Number of Estimators: 100
ROC AUC Testin