In [11]:
# Introduction to Data Science Homework 2: Churn Prediction
# Name: Jungwoo Han 
# jh5990@nyu.edu

# Score for this model on Kaggle: 0.71511

In [12]:
from __future__ import division
import pandas as pd
import numpy as np

# Load the training and test sets from CSV files in the working directory.
churn_df = pd.read_csv('train.csv')
churn_test = pd.read_csv('test.csv')

# Column names as read from disk (captured before any rows are removed).
col_names = list(churn_df.columns)
col_names_test = list(churn_test.columns)

# Categorical level -> numeric code, one table per column. The codes are kept
# as digit strings exactly as the author chose them (described in the original
# notebook as "weighted by frequency"); they are cast to float later, when the
# feature matrix is built.
CATEGORY_CODES = (
    ('COLLEGE',
     {"one": "1", "zero": "0"}),
    ('REPORTED_SATISFACTION',
     {"very_sat": "5", "sat": "2", "avg": "6", "unsat": "16", "very_unsat": "45"}),
    ('REPORTED_USAGE_LEVEL',
     {"very_little": "20", "little": "36", "avg": "3", "high": "4", "very_high": "5"}),
    ('CONSIDERING_CHANGE_OF_PLAN',
     {"never_thought": "2", "no": "8", "perhaps": "3", "considering": "36", "actively_looking_into_it": "25"}),
)

# Apply identical cleaning and encoding to both frames so that the train and
# test representations cannot drift apart.
for frame in (churn_df, churn_test):
    # Remove every row that contains at least one NaN.
    # NOTE(review): this also removes rows from the test set, so predictions
    # are only produced for the test rows that survive -- confirm this is the
    # intended behaviour for the submission file.
    frame.dropna(how='any', inplace=True)
    for column, codes in CATEGORY_CODES:
        frame[column] = frame[column].replace(codes)

In [13]:
# Compress the wide-ranging numeric columns with a natural log, applying the
# same transform to the training and the test frame.
for wide_range_col in ('INCOME', 'HANDSET_PRICE', 'HOUSE'):
    churn_df[wide_range_col] = np.log(churn_df[wide_range_col])
    churn_test[wide_range_col] = np.log(churn_test[wide_range_col])

In [14]:
# Target vector: 1 where LEAVE equals 1, 0 for every other value.
churn_result = churn_df['LEAVE']
is_leaver = churn_result == 1
y = np.where(is_leaver, 1, 0)

# Columns removed from each frame before building the feature matrices.
# Only the training frame has its LEAVE column dropped; nothing is dropped
# from the test frame.
to_drop = ['LEAVE']
to_drop_test = []

churn_feat_space = churn_df.drop(to_drop, axis='columns')
churn_feat_test = churn_test.drop(to_drop_test, axis='columns')

# Remember which feature columns the model is trained on.
features = churn_feat_space.columns

In [15]:
# Convert both feature frames to plain float64 NumPy matrices.
# `DataFrame.as_matrix()` was removed in pandas 1.0 and the `np.float` alias
# was removed in NumPy 1.24; `.values` and `np.float64` produce the same
# arrays and work on both old and current releases. The cast also turns the
# digit-string category codes into numbers.
X = churn_feat_space.values.astype(np.float64)
X_test = churn_feat_test.values.astype(np.float64)

In [16]:
# normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Estimate the per-feature mean and standard deviation from the TRAINING rows
# only, then apply that same transform to both sets. Fitting on train and test
# concatenated (as this cell did before) lets test-set statistics leak into
# preprocessing.
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

# Stacked (train, then test) scaled matrix, kept so that any later cell that
# still reads X2 continues to find it.
X2 = np.concatenate((X, X_test), axis=0)

In [17]:
# model validation
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.svm import SVC

def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold predictions from shuffled 10-fold cross-validation.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature matrix, one row per sample.
    y : array with a ``copy`` method, indexable by integer index arrays
        Target values aligned with the rows of ``X``.
    clf_class : callable
        Estimator class; a fresh instance is built for every fold.
    **kwargs
        Passed unchanged to ``clf_class`` on construction.

    Returns
    -------
    A copy of ``y`` in which every entry has been overwritten by the
    prediction made for that row while it was in the held-out fold.

    Notes
    -----
    The folds are shuffled without a fixed seed, so results can differ
    from one call to the next.
    """
    # Start from a copy of y so the result has the same shape and dtype.
    out_of_fold = y.copy()
    splitter = KFold(n_splits=10,shuffle=True)

    for fit_rows, held_out_rows in splitter.split(X):
        # New, untrained estimator for each fold.
        estimator = clf_class(**kwargs)
        estimator.fit(X[fit_rows], y[fit_rows])
        out_of_fold[held_out_rows] = estimator.predict(X[held_out_rows])

    return out_of_fold

def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    # Element-wise comparison; averaging counts True as 1 and False as 0.
    matches = y_true == y_pred
    return np.mean(matches)

# uncomment to test SVM
# print("Support Vector Machine:")
# print ("%.3f" % accuracy(y, run_cv(X,y,SVC,C=409, gamma=0.00065, kernel='rbf')))

# Cross-validated accuracy of the decision tree, printed to three decimals.
print("Decision Tree:")
dt_cv_predictions = run_cv(X, y, DT, min_samples_leaf=30, max_depth=4)
dt_cv_accuracy = accuracy(y, dt_cv_predictions)
print("%.3f" % dt_cv_accuracy)

Decision Tree:
0.701


In [18]:
# Model: Decision Tree
from sklearn import tree

# Same hyper-parameters as the cross-validated tree above.
dt_params = {'min_samples_leaf': 30, 'max_depth': 4}
model = tree.DecisionTreeClassifier(**dt_params)

# Train on the full training set; the fitted estimator is the cell's output.
model.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
# run the model on test data and make sure the labels are integers.
# The previous element-by-element `int()` loop had no effect: assigning into a
# NumPy array never changes the array's dtype. `astype(int)` performs the
# conversion on the whole array at once.
prediction_dt = model.predict(X_test).astype(int)

In [20]:
# save the predicted data as a csv file
# np.savetxt defaults to fmt='%.18e', which would write each class label in
# scientific notation (e.g. 1.000000000000000000e+00); "%d" writes plain 0/1.
np.savetxt("prediction.csv", prediction_dt, fmt="%d", delimiter=",")