In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier,BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.externals import joblib
from sklearn.metrics import  f1_score, roc_auc_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.combine import SMOTEENN, SMOTETomek
from itertools import *


# Problem Description

In 2015, a cholera outbreak was reported in Dar es Salaam in Kinondoni district; one person died and four
family members were screened and classified as suspected cholera cases. Cholera began spreading throughout the
Dar es Salaam region and twelve other regions: Arusha, Dodoma, Geita, Morogoro, Kigoma, Mara, Mwanza, Shinyanga,
Singida, Tabora, Tanga as well as the island of Zanzibar. As of 19 November 2015, there were 8,954 reported cases
and 129 deaths according to World Health Organization (WHO), and 19 out of 30 regions had detected and reported
cholera cases on mainland Tanzania and Zanzibar (United Nations Resident Coordinator's Office (UNRCO) and Ministry
of Health & Social Welfare (MoHSW). According to the MoHSW and Tanzania Red Cross Society (TRCS) volunteers,
factors that contributed to the cholera outbreak included; contaminated water sources, poor sanitation and poor hygiene
practices.

We want to develop a machine learning model that will predict whether there are going to be any other cholera outbreaks in coming years using data that was collected from the five boroughs of Dar es Salaam region.

# Load Data

In [2]:
feature_columns = ['District','Year','Rainfall', 'Temp_max', 'Temp_min', 'Temp_mean',
       'Temp_range', 'Humidity', 'Wind_Dir', 'WasteWater']

In [3]:
def load_data(file_name):
    """
    data loading
    """
    df = pd.read_csv(file_name)
    ##convert these strings into integer keys
    le = preprocessing.LabelEncoder()
    df['Year'] = le.fit_transform(df['Year'])
    df['District'] = le.fit_transform(df['District'])
    
   
    #delete column with missing values
    df.drop(['Date','Day','Month','Region'],axis=1,inplace= True)
    return df

def load_test_data(file_name, label_name):
       feature = pd.read_csv(file_name)
       label   = pd.read_csv(label_name)

       return feature, label


def get_balanced_data(X, y):
       smote = SMOTEENN(random_state=0)
       X, y = smote.fit_sample(X, y)    
       return X, y     






In [4]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues, fig_num=None):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    
    if fig_num is not None:
        plt.subplot(2,2,fig_num)
    fmt =   'd'
    cm = confusion_matrix(y_true, Y_pred)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)
    plt.title("")

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
         plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def savefig(filename, leg=None, format='.pdf', *args, **kwargs):
    """
    Save in PDF file with the given filename.
    """
    if leg:
        art=[leg]
        plt.savefig(filename + format, additional_artists=art, bbox_inches="tight", *args, **kwargs)
    else:
        plt.savefig(filename + format,  bbox_inches="tight", *args, **kwargs)
    plt.close()


In [5]:
file='../data/train_data.csv'

In [6]:
df= load_data(file)
df

Unnamed: 0,District,Year,LabResult,Rainfall,Temp_max,Temp_min,Temp_mean,Temp_range,Humidity,Wind_Dir,x_wind,y_wind,WasteWater,PoliticalParty
0,0,0,0,0.0,30.0,21.0,25.50,9.0,75,60,0,0,1,0.5
1,2,0,0,0.0,31.1,20.6,25.85,10.5,80,120,0,0,1,1.0
2,3,2,0,12.6,30.1,24.2,27.15,5.9,87,210,0,0,1,0.5
3,0,0,0,0.0,30.0,21.0,25.50,9.0,76,160,0,0,0,0.5
4,2,0,0,0.0,33.0,23.0,28.00,10.0,77,40,0,0,1,1.0
5,0,0,0,12.5,32.8,23.5,28.15,9.3,83,180,0,0,1,0.5
6,2,0,0,5.1,32.0,25.0,28.50,7.0,89,60,0,0,1,1.0
7,3,0,1,1.7,30.0,21.0,25.50,9.0,78,170,0,0,0,0.5
8,2,0,0,0.0,33.6,22.1,27.85,11.5,72,0,0,0,1,1.0
9,0,0,0,0.0,30.0,21.0,25.50,9.0,81,0,0,0,0,0.5


In [7]:
#feature selection
y=df['LabResult']
X = df.drop(['LabResult'],axis=1) 

In [8]:
# split data into train and test set 
import random
random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)  

In [9]:
parameters =  { "KNN":[{"n_neighbors":[2,3,4,5], "weights":['uniform', 'distance']}],
               "GB" :[{'learning_rate': [0.1,0.01,0.001], "n_estimators":[50,100]}],
               "RF":[{'n_estimators':[10,30,50,100], 'max_features': ['auto','log2',None], 
                      'min_samples_leaf': [0.2,0.4,1]}],
               "BG":[{'base_estimator':[None], 'n_estimators':[10,20,40,50]
                      }]
              }

In [10]:
models = {
          "KNN": KNeighborsClassifier(),
           "GB":GradientBoostingClassifier(),
           "RF":RandomForestClassifier(),
          "BG":BaggingClassifier() }

In [11]:
skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True) 

In [12]:
for model_name, model in models.items():
            clf = GridSearchCV(model, parameters[model_name], cv=5)
            clf.fit(X_train, y_train)
            be = clf.best_estimator_
            Y_pred = be.predict(X_test)
            accuracy=accuracy_score(y_test, Y_pred)
            print("best estimator:{}".format(be))
            print(accuracy)  
            joblib.dump(clf.best_estimator_, '../models/{}.pkl'.format(model_name))

best estimator:KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=2,
           weights='uniform')
0.8864292589027911
best estimator:GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
0.8864292589027911
best estimator:RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=N

In [17]:
class_name = ["Health", "Cholera"]
for model_name in models.keys():
    model=joblib.load('../models/{}.pkl'.format(model_name))
    Y_pred = model.predict(X_test)
    print(Y_pred) 
   

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


# Visualization

In [18]:
plot_confusion_matrix(y_test,Y_pred, classes=class_name,
title='Confusion matrix',cmap=plt.cm.Blues, fig_num=None)

NameError: name 'itertools' is not defined