In [45]:
import pandas as pd

In [46]:
def numerize_csv(path: str):
    '''
    Load a project csv and convert its entries to numerical values.

    Encoding applied:
      * gender: 1 for 'Female', 0 otherwise.
      * Yes/No style columns: 1 for 'Yes', 0 for anything else. The service
        add-on columns also contain "no phone/internet service" values,
        which therefore become 0 as well.
      * MonthlyCharges, TotalCharges, tenure: divided by the column's
        (max - min) range. The minimum is NOT subtracted, so values are not
        guaranteed to lie in [0, 1].
      * InternetService, Contract, PaymentMethod: mapped to small integer
        codes; values not listed in a mapping become NaN.
      * customerID is dropped, then remaining NaNs are replaced by the
        column mean.

    Args:
        path: location of the csv file to read.

    Returns:
        A new, fully numeric DataFrame.

    Raises:
        KeyError: if one of the expected columns is missing from the csv.
    '''
    df = pd.read_csv(path)
    df['gender'] = (df['gender'] == 'Female').astype(int)

    yes_no_headers = [
        'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Discontinued',
        # some here have the value no phone/internet service, which are casted to 0
        'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies',
    ]
    for header in yes_no_headers:
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MonthlyCharges', 'TotalCharges', 'tenure']:
        # Series.max()/min() skip NaN. The builtin max()/min() do not: a NaN
        # early in the column makes the range NaN and wipes out every value.
        span = df[header].max() - df[header].min()
        # A constant (or all-NaN) column has no usable range; leave it alone
        # rather than producing inf/NaN from a division by zero.
        if pd.notna(span) and span != 0:
            df[header] = df[header] / span

    df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No': 0})
    df['Contract'] = df['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month': 0})
    # Note that the PaymentMethod column contains some entries that are marked automatic
    # that's probably correlated with discontinuation in some way.
    df['PaymentMethod'] = df['PaymentMethod'].map({
        'Credit card (automatic)': 3,
        'Electronic check': 2,
        'Bank transfer (automatic)': 1,
        'Mailed check': 0})

    df = df.drop(columns='customerID')
    # Fill whatever is still missing (e.g. blank numeric cells, unmapped
    # categories) with the per-column mean.
    return df.fillna(df.mean(numeric_only=True))
    
# Build the numeric training frame, then display the absolute value of the
# pairwise column correlations (last expression is the cell's output).
raw = numerize_csv('train.csv')
raw.corr().abs()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Discontinued
gender,1.0,0.008058,0.009926,0.012358,0.001276,0.0021,0.001972,0.013781,0.021603,0.026409,0.000435,0.006752,0.012226,0.021835,0.005884,0.00935,0.013126,0.018596,0.008715,0.00961
SeniorCitizen,0.008058,1.0,0.019705,0.217911,0.013427,0.003393,0.132697,0.258504,0.036485,0.083121,0.066034,0.060911,0.114115,0.128221,0.144865,0.152825,0.124163,0.221413,0.101402,0.152664
Partner,0.009926,0.019705,1.0,0.456296,0.388151,0.011418,0.131955,0.010862,0.145272,0.14458,0.164363,0.130111,0.128223,0.127838,0.293836,0.006516,0.072057,0.106155,0.32863,0.150313
Dependents,0.012358,0.217911,0.456296,1.0,0.170039,0.006,0.025398,0.17226,0.088248,0.02738,0.019564,0.072248,0.016426,0.035688,0.248846,0.107969,0.044406,0.107569,0.073181,0.165446
tenure,0.001276,0.013427,0.388151,0.170039,1.0,0.006648,0.322456,0.033743,0.321803,0.354553,0.363212,0.313915,0.269105,0.288481,0.670743,0.009223,0.179691,0.244038,0.82115,0.344429
PhoneService,0.0021,0.003393,0.011418,0.006,0.006648,1.0,0.282921,0.091643,0.102215,0.047848,0.077713,0.111825,0.036102,0.044218,0.007187,0.017776,0.007016,0.242753,0.110471,0.015895
MultipleLines,0.001972,0.132697,0.131955,0.025398,0.322456,0.282921,1.0,0.335583,0.09409,0.202748,0.188941,0.093279,0.241986,0.247713,0.103415,0.159566,0.174553,0.480175,0.456905,0.042595
InternetService,0.013781,0.258504,0.010862,0.17226,0.033743,0.091643,0.335583,1.0,0.160173,0.317616,0.319025,0.163913,0.433022,0.425073,0.291289,0.369531,0.2798,0.907717,0.433378,0.315576
OnlineSecurity,0.021603,0.036485,0.145272,0.088248,0.321803,0.102215,0.09409,0.160173,1.0,0.268132,0.280324,0.353521,0.173911,0.186616,0.238484,0.002107,0.076551,0.295334,0.409077,0.166174
OnlineBackup,0.026409,0.083121,0.14458,0.02738,0.354553,0.047848,0.202748,0.317616,0.268132,1.0,0.296703,0.282705,0.283671,0.273234,0.148977,0.126865,0.139234,0.446769,0.510063,0.079801


In [47]:
def combine_related_columns(df: pd.DataFrame):
    '''
    Take a project dataframe and combine its related columns into scores.

    The following source columns are removed and replaced by weighted sums:
      * PhoneService, MultipleLines                     -> PhoneUsageScore
      * OnlineSecurity, OnlineBackup, DeviceProtection,
        TechSupport                                     -> InternetSecurityScore
      * StreamingTV, StreamingMovies                    -> InternetStreamingScore
    'Discontinued' is moved to be the first column.

    Args:
        df: frame containing all of the columns named above plus
            'Discontinued'. It is left unmodified.

    Returns:
        A new DataFrame with the combined score columns.

    Raises:
        KeyError: if any of the required columns is missing.
    '''
    # Work on a real copy: the pops below would otherwise strip columns from
    # the caller's frame, so a second call on the same frame would fail.
    combined = df.copy()

    PHONE_SERVICE_WEIGHT = 0.7
    TV_STREAM_WEIGHT = 0.5
    SECURITY_WEIGHTS = {
        'security': 0.25,
        'backup': 0.25,
        'protection': 0.25,
        'support': 0.25
    }

    combined['PhoneUsageScore'] = (
        combined.pop('PhoneService').values * PHONE_SERVICE_WEIGHT
        + combined.pop('MultipleLines').values * (1 - PHONE_SERVICE_WEIGHT)
    )
    combined['InternetSecurityScore'] = (
        combined.pop('OnlineSecurity').values * SECURITY_WEIGHTS['security']
        + combined.pop('OnlineBackup').values * SECURITY_WEIGHTS['backup']
        + combined.pop('DeviceProtection').values * SECURITY_WEIGHTS['protection']
        + combined.pop('TechSupport').values * SECURITY_WEIGHTS['support']
    )
    combined['InternetStreamingScore'] = (
        combined.pop('StreamingTV').values * TV_STREAM_WEIGHT
        + combined.pop('StreamingMovies').values * (1 - TV_STREAM_WEIGHT)
    )
    # Put the label first so it is easy to find and to split off.
    combined.insert(0, 'Discontinued', combined.pop('Discontinued'))
    return combined

# Display the frame with the related service columns collapsed into scores.
combine_related_columns(raw)

Unnamed: 0,Discontinued,gender,SeniorCitizen,Partner,Dependents,tenure,InternetService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneUsageScore,InternetSecurityScore,InternetStreamingScore
0,1,1,0,0,0,0.013889,2,0,0,2,0.701493,0.008135,0.7,0.00,0.0
1,0,1,0,0,0,0.208333,1,1,0,0,0.558706,0.107536,0.7,0.00,0.5
2,1,0,0,1,1,0.166667,1,1,0,3,0.533831,0.080355,0.7,0.00,0.5
3,1,1,0,0,0,0.305556,2,0,1,0,0.838806,0.214131,0.7,0.25,0.5
4,1,0,0,0,0,0.027778,1,0,1,2,0.490050,0.010426,0.0,0.25,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,1,1,1,0,1,0.013889,2,0,1,2,0.706468,0.008193,0.7,0.00,0.0
5339,0,1,0,1,0,1.000000,2,2,1,1,0.977114,0.809594,1.0,0.75,0.5
5340,1,0,1,0,0,0.069444,2,0,1,0,0.699005,0.038299,0.7,0.00,0.0
5341,0,0,0,1,1,0.638889,0,1,1,1,0.200995,0.097577,0.7,0.00,0.0


In [48]:
def random_forest_attempt():
    '''
    Fit a random forest on the combined training features and print ROC AUC.

    Reads 'train.csv', encodes it with numerize_csv, collapses related columns
    with combine_related_columns, holds out 25% of the rows, and prints the
    ROC AUC on the held-out rows followed by the ROC AUC on the training rows.
    Returns nothing.
    '''
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, roc_auc_score  # accuracy_score is currently unused
    from sklearn.model_selection import train_test_split

    features = combine_related_columns(numerize_csv('train.csv'))
    target = features['Discontinued']
    predictors = features.drop(columns='Discontinued')
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, random_state=0, train_size=.75)

    # Hyperparameters are the results of extensive testing (author's note).
    forest = RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        criterion='entropy',
        min_samples_leaf=7,
        max_features=1,
    )
    forest.fit(X_train, y_train)

    # Score with the predicted probability of the positive class (column 1).
    test_scores = forest.predict_proba(X_test)[:, 1]
    train_scores = forest.predict_proba(X_train)[:, 1]

    # Held-out score first, then the training score.
    print(roc_auc_score(y_test, test_scores))
    print(roc_auc_score(y_train, train_scores))
# Train the forest and print the test ROC AUC followed by the train ROC AUC.
random_forest_attempt()

4007
0.855591166540383
0.9044393152430454


In [49]:
def write_submission(preds, path: str = 'submission.csv'):
    '''
    Utility to write predictions into the TARGET column of a submission csv.

    The csv at `path` is read, its TARGET column is set to `preds` (one value
    per row, in row order), and the file is written back in place with ID as
    the first column.

    Args:
        preds: sequence of predictions, one per row of the submission file.
        path: submission csv to update; must contain an 'ID' column.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in the submission file.
    '''
    df = pd.read_csv(path)
    pred_len = len(preds)
    target_len = len(df['ID'])
    if pred_len != target_len:
        raise ValueError(
            f'expected {target_len} predictions to match {path}, got {pred_len}')
    # Assign by position. Assigning a Series directly would align on its
    # index and write NaN for any prediction whose label is not 0..n-1.
    df['TARGET'] = pd.Series(preds).values
    df.set_index('ID').to_csv(path)

# Write an all-zero prediction for 1700 rows into the submission file.
write_submission([0 for _ in range(1700)])