In [11]:
import pandas as pd

In [12]:
def numerize_csv(path: str) -> pd.DataFrame:
    '''
    Takes in a path to a project csv and converts its entries to numerical.

    Encoding applied to the frame read from `path`:
      * 'gender': 1 for 'Female', 0 for any other value.
      * Yes/no style columns: 1 for 'Yes', 0 for any other value (this
        includes the "No phone service" / "No internet service" entries).
      * 'MonthlyCharges', 'TotalCharges', 'tenure': divided by the column's
        (max - min). The minimum is NOT subtracted, so values are rescaled
        but are not shifted to start at 0.
      * 'InternetService', 'Contract', 'PaymentMethod': mapped to integer
        codes; a value missing from the mapping becomes NaN.
    'customerID' is then dropped and every row containing a NaN is removed.

    Args:
        path: location of the csv file to load.

    Returns:
        The encoded DataFrame (row labels of the surviving rows are kept).
    '''
    df = pd.read_csv(path)
    df['gender'] = (df['gender'] == 'Female').astype(int)

    yes_no_columns = (
        'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
        'Discontinued',
        # some of these have the value no phone/internet service, which is cast to 0
        'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    )
    for header in yes_no_columns:
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ('MonthlyCharges', 'TotalCharges', 'tenure'):
        # Series.max()/min() skip NaN. The builtin max()/min() do not: a NaN
        # early in the column makes the range NaN, which turns the whole
        # column into NaN and lets the final dropna() discard every row.
        span = df[header].max() - df[header].min()
        # A constant column has a zero range; leave it as-is rather than
        # filling it with inf/NaN.
        if span != 0:
            df[header] = df[header] / span

    df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No': 0})
    df['Contract'] = df['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month': 0})
    # Note that the PaymentMethod column contains some entries that are marked automatic
    # that's probably correlated with discontinuation in some way.
    df['PaymentMethod'] = df['PaymentMethod'].map({
        'Credit card (automatic)': 3,
        'Electronic check': 2,
        'Bank transfer (automatic)': 1,
        'Mailed check': 0})
    return df.drop(columns='customerID').dropna()
    
# Encode the training CSV, then display the absolute pairwise correlations of its columns.
raw = numerize_csv('train.csv')
raw.corr().abs()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Discontinued
gender,1.0,0.008073,0.009551,0.012417,0.001303,0.00337,0.002732,0.013341,0.021219,0.026053,0.001623,0.005928,0.011089,0.021877,0.005647,0.009754,0.012644,0.018207,0.008722,0.009631
SeniorCitizen,0.008073,1.0,0.020239,0.217319,0.012572,0.003733,0.132927,0.258179,0.036392,0.083243,0.066141,0.060677,0.114203,0.127778,0.144183,0.152691,0.123765,0.221116,0.101417,0.152333
Partner,0.009551,0.020239,1.0,0.455667,0.390519,0.012353,0.132355,0.012196,0.145802,0.145222,0.164242,0.130221,0.128153,0.128924,0.292937,0.005536,0.073331,0.107451,0.328888,0.149772
Dependents,0.012417,0.217319,0.455667,1.0,0.173546,0.00488,0.025823,0.170642,0.088028,0.027295,0.019454,0.071395,0.01642,0.03398,0.24646,0.107256,0.042604,0.106133,0.073305,0.164423
tenure,0.001303,0.012572,0.390519,0.173546,1.0,0.005664,0.323454,0.031994,0.322909,0.355433,0.364115,0.315595,0.26962,0.287425,0.675561,0.008371,0.178442,0.243047,0.822232,0.346162
PhoneService,0.00337,0.003733,0.012353,0.00488,0.005664,1.0,0.282494,0.092001,0.100599,0.047588,0.076204,0.110404,0.034595,0.044916,0.005943,0.017809,0.006076,0.243316,0.110667,0.015475
MultipleLines,0.002732,0.132927,0.132355,0.025823,0.323454,0.282494,1.0,0.336383,0.094621,0.202651,0.189622,0.093341,0.242752,0.248214,0.103096,0.160323,0.175635,0.480858,0.457265,0.042783
InternetService,0.013341,0.258179,0.012196,0.170642,0.031994,0.092001,0.336383,1.0,0.159941,0.317497,0.318912,0.163824,0.432982,0.424584,0.290223,0.369352,0.278659,0.90761,0.433785,0.315136
OnlineSecurity,0.021219,0.036392,0.145802,0.088028,0.322909,0.100599,0.094621,0.159941,1.0,0.267796,0.280008,0.352704,0.173454,0.187105,0.238477,0.002719,0.075755,0.295424,0.409441,0.166189
OnlineBackup,0.026053,0.083243,0.145222,0.027295,0.355433,0.047588,0.202651,0.317497,0.267796,1.0,0.296432,0.281876,0.283396,0.273657,0.149015,0.127219,0.138813,0.446669,0.510463,0.079824


In [13]:
def combine_related_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Takes in a project dataframe and combines its related columns into
    weighted composite scores.

    The work is done on a copy, so the caller's frame is left untouched and
    the function can be called repeatedly on the same input.

    Args:
        df: frame holding numeric 'PhoneService', 'MultipleLines',
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies' and
            'Discontinued' columns.

    Returns:
        A new frame in which those service columns are replaced by
        'PhoneUsageScore', 'InternetSecurityScore' and
        'InternetStreamingScore' (appended in that order), with
        'Discontinued' moved to the first position.

    Raises:
        KeyError: if one of the required columns is missing.
    '''
    PHONE_SERVICE_WEIGHT = 0.7
    TV_STREAM_WEIGHT = 0.5
    # Weight of each source column in the security score (insertion order is
    # the order in which the terms are added).
    SECURITY_WEIGHTS = {
        'OnlineSecurity': 0.25,
        'OnlineBackup': 0.25,
        'DeviceProtection': 0.25,
        'TechSupport': 0.25,
    }

    # `df_cpy = df` only aliased the input, so every pop() below used to strip
    # columns from the caller's frame and a second call raised KeyError.
    combined = df.copy()

    combined['PhoneUsageScore'] = (
        combined.pop('PhoneService').values * PHONE_SERVICE_WEIGHT
        + combined.pop('MultipleLines').values * (1 - PHONE_SERVICE_WEIGHT)
    )

    security_score = 0
    for column, weight in SECURITY_WEIGHTS.items():
        security_score = security_score + combined.pop(column).values * weight
    combined['InternetSecurityScore'] = security_score

    combined['InternetStreamingScore'] = (
        combined.pop('StreamingTV').values * TV_STREAM_WEIGHT
        + combined.pop('StreamingMovies').values * (1 - TV_STREAM_WEIGHT)
    )

    # Put the label first so it is easy to spot / split off.
    combined.insert(0, 'Discontinued', combined.pop('Discontinued'))
    return combined

# Merge the related service columns of `raw` into composite scores; the returned frame is the cell output.
combine_related_columns(raw)

Unnamed: 0,Discontinued,gender,SeniorCitizen,Partner,Dependents,tenure,InternetService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneUsageScore,InternetSecurityScore,InternetStreamingScore
0,1,1,0,0,0,0.013889,2,0,0,2,0.701493,0.008135,0.7,0.00,0.0
1,0,1,0,0,0,0.208333,1,1,0,0,0.558706,0.107536,0.7,0.00,0.5
2,1,0,0,1,1,0.166667,1,1,0,3,0.533831,0.080355,0.7,0.00,0.5
3,1,1,0,0,0,0.305556,2,0,1,0,0.838806,0.214131,0.7,0.25,0.5
4,1,0,0,0,0,0.027778,1,0,1,2,0.490050,0.010426,0.0,0.25,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,1,1,1,0,1,0.013889,2,0,1,2,0.706468,0.008193,0.7,0.00,0.0
5339,0,1,0,1,0,1.000000,2,2,1,1,0.977114,0.809594,1.0,0.75,0.5
5340,1,0,1,0,0,0.069444,2,0,1,0,0.699005,0.038299,0.7,0.00,0.0
5341,0,0,0,1,1,0.638889,0,1,1,1,0.200995,0.097577,0.7,0.00,0.0


In [14]:
def random_forest_attempt():
    '''
    Fit a random forest on train.csv and print its testing and training accuracy.

    The csv is encoded with numerize_csv, its related columns are merged with
    combine_related_columns, and the result is split 75/25 (random_state=0)
    using 'Discontinued' as the label. Nothing is returned; the two accuracy
    values are printed, testing first.
    '''
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    dataset = combine_related_columns(numerize_csv('train.csv'))
    target = dataset['Discontinued']
    predictors = dataset.drop(columns='Discontinued')
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, random_state=0, train_size=0.75)

    # Hyper-parameters are the results of extensive testing (original author's note).
    forest = RandomForestClassifier(
        n_estimators=200,
        criterion='entropy',
        min_samples_leaf=7,
        max_features=1,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Score the held-out split first, then the data the model was fitted on.
    testing_accuracy = accuracy_score(y_test, forest.predict(X_test))
    training_accuracy = accuracy_score(y_train, forest.predict(X_train))
    print(f'Testing Accuracy: {testing_accuracy}')
    print(f'Training Accuracy: {training_accuracy}')

In [15]:
def write_submission(preds, path: str = 'submission.csv'):
    '''
    Utility to store predictions in the TARGET column of a submission csv.

    The file at `path` is read, its TARGET column is set to `preds` (row by
    row, in order), and the file is written back to the same location with
    'ID' as the first column.

    Args:
        preds: sized sequence of predictions (list, array or Series), one
            per row of the submission file.
        path: submission csv to update in place; must contain an 'ID' column.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in the submission file.
    '''
    df = pd.read_csv(path)
    pred_len = len(preds)
    target_len = len(df['ID'])
    if pred_len != target_len:
        raise ValueError(
            f'expected {target_len} predictions for {path}, got {pred_len}')
    # Assign by position. Assigning a Series aligns on its index, so
    # predictions carrying a non-default index would be written as NaN.
    df['TARGET'] = pd.Series(preds).to_numpy()
    df.set_index('ID').to_csv(path)

# Overwrite submission.csv with a placeholder prediction of 0 for each of 1700 rows.
write_submission([0] * 1700)