In [110]:
import pandas as pd

In [111]:
def numerize_csv(path: str):
    ''' Takes in a path to a project csv and converts its entries to numerical

    Conversions applied:
      * 'gender' becomes 1 for 'Female' and 0 for anything else.
      * Yes/no style columns become 1 for 'Yes' and 0 for anything else.
      * 'MonthlyCharges', 'TotalCharges' and 'tenure' are divided by their
        (max - min) range.
      * 'InternetService', 'Contract' and 'PaymentMethod' are mapped to small
        integer codes; values missing from a mapping become NaN.
      * 'customerID' is dropped, then every row that still contains a NaN is
        dropped.

    NOTE(review): the scaling divides by the range without subtracting the
    minimum, so scaled values are not confined to [0, 1]. That formula is kept
    as-is so existing numeric results do not shift.

    path: location of the csv file, handed straight to pd.read_csv.
    returns: a new DataFrame holding the converted columns.
    raises: KeyError if an expected column is missing from the csv.
    '''
    df = pd.read_csv(path)
    df['gender'] = (df['gender'] == 'Female').astype(int)

    for header in ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
                   'Discontinued']:
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']:
        # some here have the value no phone/internet service, which are casted to 0
        df[header] = (df[header] == 'Yes').astype(int)

    for header in ['MonthlyCharges', 'TotalCharges', 'tenure']:
        # Blank or non-numeric cells become NaN so the final dropna removes
        # those rows instead of the arithmetic below raising on strings.
        column = pd.to_numeric(df[header], errors='coerce').astype(float)
        # Series.max()/min() skip NaN. The builtin max()/min() are NaN-order
        # dependent: a leading NaN made the range NaN, which turned the whole
        # column into NaN and caused dropna to discard every row.
        spread = column.max() - column.min()
        # A constant (or entirely missing) column has no usable range; leave it
        # unscaled rather than dividing by zero / NaN.
        if pd.notna(spread) and spread != 0:
            column = column / spread
        df[header] = column

    df['InternetService'] = df['InternetService'].map({'Fiber optic': 2, 'DSL': 1, 'No': 0})
    df['Contract'] = df['Contract'].map({'Two year': 2, 'One year': 1, 'Month-to-month': 0})
    # Note that the PaymentMethod column contains some entries that are marked automatic
    # that's probably correlated with discontinuation in some way.
    df['PaymentMethod'] = df['PaymentMethod'].map({
        'Credit card (automatic)': 3,
        'Electronic check': 2,
        'Bank transfer (automatic)': 1,
        'Mailed check': 0})
    # Rebind instead of using inplace=True; the result is the same frame content.
    df = df.drop('customerID', axis=1)
    df = df.dropna()
    return df
    
# Load the training csv and convert its columns to numeric values.
raw = numerize_csv('train.csv')
# Bare last expression so the notebook renders the resulting frame.
raw

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Discontinued
0,1,0,0,0,0.013889,1,0,2,0,0,0,0,0,0,0,0,2,0.701493,0.008135,1
1,1,0,0,0,0.208333,1,0,1,0,0,0,0,1,0,1,0,0,0.558706,0.107536,0
2,0,0,1,1,0.166667,1,0,1,0,0,0,0,0,1,1,0,3,0.533831,0.080355,1
3,1,0,0,0,0.305556,1,0,2,0,1,0,0,0,1,0,1,0,0.838806,0.214131,1
4,0,0,0,0,0.027778,0,0,1,0,0,1,0,1,1,0,1,2,0.490050,0.010426,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,1,1,0,1,0.013889,1,0,2,0,0,0,0,0,0,0,1,2,0.706468,0.008193,1
5339,1,0,1,0,1.000000,1,1,2,1,0,1,1,1,0,2,1,1,0.977114,0.809594,0
5340,0,1,0,0,0.069444,1,0,2,0,0,0,0,0,0,0,1,0,0.699005,0.038299,1
5341,0,0,1,1,0.638889,1,0,0,0,0,0,0,0,0,1,1,1,0.200995,0.097577,0


In [112]:
def combine_related_columns(df: pd.DataFrame):
    ''' takes in a project dataframe and combines its related columns

    Three weighted scores replace the columns they are built from:
      * 'PhoneUsageScore' from 'PhoneService' and 'MultipleLines'
      * 'InternetSecurityScore' from 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection' and 'TechSupport'
      * 'InternetStreamingScore' from 'StreamingTV' and 'StreamingMovies'
    'Discontinued' is then moved to the first column position.

    df: frame containing the columns listed above plus 'Discontinued'.
    returns: a new DataFrame; the frame passed in is left untouched, so the
        function can be called repeatedly on the same input.
    raises: KeyError if any of the source columns is missing.
    '''
    PHONE_SERVICE_WEIGHT = 0.7
    TV_STREAM_WEIGHT = 0.5
    SECURITY_WEIGHTS = {
        'security': 0.25,
        'backup': 0.25,
        'protection': 0.25,
        'support': 0.25
    }
    # Work on a real copy: a plain assignment only aliases the caller's frame,
    # so every pop() below would strip columns from the caller's data and a
    # second call on the same frame would raise KeyError.
    df_cpy = df.copy()
    df_cpy['PhoneUsageScore'] = df_cpy.pop('PhoneService').values * PHONE_SERVICE_WEIGHT \
                                + df_cpy.pop('MultipleLines').values * (1 - PHONE_SERVICE_WEIGHT)
    df_cpy['InternetSecurityScore'] = df_cpy.pop('OnlineSecurity').values * SECURITY_WEIGHTS['security'] \
                                        + df_cpy.pop('OnlineBackup').values * SECURITY_WEIGHTS['backup'] \
                                        + df_cpy.pop('DeviceProtection').values * SECURITY_WEIGHTS['protection'] \
                                        + df_cpy.pop('TechSupport').values * SECURITY_WEIGHTS['support']
    df_cpy['InternetStreamingScore'] = df_cpy.pop('StreamingTV').values * TV_STREAM_WEIGHT \
                                        + df_cpy.pop('StreamingMovies').values * (1 - TV_STREAM_WEIGHT)
    # Put the target column first.
    df_cpy.insert(0, 'Discontinued', df_cpy.pop('Discontinued'))
    return df_cpy

# Display the frame with the related service columns collapsed into scores.
# NOTE(review): the return value is only displayed, not bound to a name.
combine_related_columns(raw)

Unnamed: 0,Discontinued,gender,SeniorCitizen,Partner,Dependents,tenure,InternetService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,PhoneUsageScore,InternetSecurityScore,InternetStreamingScore
0,1,1,0,0,0,0.013889,2,0,0,2,0.701493,0.008135,0.7,0.00,0.0
1,0,1,0,0,0,0.208333,1,1,0,0,0.558706,0.107536,0.7,0.00,0.5
2,1,0,0,1,1,0.166667,1,1,0,3,0.533831,0.080355,0.7,0.00,0.5
3,1,1,0,0,0,0.305556,2,0,1,0,0.838806,0.214131,0.7,0.25,0.5
4,1,0,0,0,0,0.027778,1,0,1,2,0.490050,0.010426,0.0,0.25,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,1,1,1,0,1,0.013889,2,0,1,2,0.706468,0.008193,0.7,0.00,0.0
5339,0,1,0,1,0,1.000000,2,2,1,1,0.977114,0.809594,1.0,0.75,0.5
5340,1,0,1,0,0,0.069444,2,0,1,0,0.699005,0.038299,0.7,0.00,0.0
5341,0,0,0,1,1,0.638889,0,1,1,1,0.200995,0.097577,0.7,0.00,0.0


In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# NOTE(review): `combined` is not assigned in any cell visible here; it has to
# exist in the kernel before this cell runs. Confirm where it comes from.
# Drop the identifier column by rebinding rather than `del`, so that re-running
# this cell (or running it on a frame that already lacks the column) does not
# raise KeyError.
combined = combined.drop(columns=["customerID"], errors="ignore")

# Split the target from the features, then hold out 25% of the rows for testing.
Y = combined['Discontinued']
X = combined.drop('Discontinued', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, train_size=.75)

# Replace NaN values with the mean of each column. The means are learned from
# the training split only.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_imputed, y_train)

# Apply the SAME fitted imputer to the test split. Fitting a second imputer on
# the test data would fill gaps with test-set statistics the model never saw
# and can yield a different set of columns than the model was trained on.
X_test_imputed = imputer.transform(X_test)

print(X_test_imputed)

# Predict on the held-out test set
y_pred_test = model.predict(X_test_imputed)

# Calculate accuracy on the held-out test set
testing_accuracy = accuracy_score(y_test, y_pred_test)

print(f'Testing Accuracy: {testing_accuracy}')

# Backward-compatible aliases for the previous (misleading) names, in case
# other cells still reference them. All three refer to test-set values.
X_imputed = X_test_imputed
y_pred_train = y_pred_test
training_accuracy = testing_accuracy






[[0.   0.   0.   ... 1.   0.   0.  ]
 [1.   0.   1.   ... 0.   1.   0.5 ]
 [1.   0.   0.   ... 0.7  0.25 0.  ]
 ...
 [0.   0.   1.   ... 0.7  0.   0.  ]
 [0.   0.   0.   ... 0.7  0.   1.  ]
 [0.   0.   0.   ... 0.7  0.25 0.  ]]
Testing Accuracy: 0.7829341317365269
