In [269]:
# Wrangling
import pandas as pd

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the swiss data set
from pydataset import data

df = data('swiss')

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [270]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [271]:
df.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [272]:
df['is_catholic'] = df['Catholic'] > 70

In [273]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,9.96,22.2,False
Delemont,83.1,45.1,6,9,84.84,22.2,True
Franches-Mnt,92.5,39.7,5,5,93.4,20.2,True
Moutier,85.8,36.5,12,7,33.77,20.3,False
Neuveville,76.9,43.5,17,15,5.16,20.6,False


In [274]:
df['is_catholic'] = df['is_catholic'].astype('category',copy=False)
df = df.drop(columns='Catholic')

In [275]:
df.dtypes

Fertility            float64
Agriculture          float64
Examination            int64
Education              int64
Infant.Mortality     float64
is_catholic         category
dtype: object

In [276]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,22.2,False
Delemont,83.1,45.1,6,9,22.2,True
Franches-Mnt,92.5,39.7,5,5,20.2,True
Moutier,85.8,36.5,12,7,20.3,False
Neuveville,76.9,43.5,17,15,20.6,False


In [296]:
X = df.drop(columns='is_catholic')
y = df[['is_catholic']]
y = y.astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .40, random_state = 13, stratify=y)

X_train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality
Rive Gauche,42.8,27.7,22,29,19.3
Yverdon,65.4,49.5,15,8,22.5
La Chauxdfnd,65.7,7.7,29,11,20.5
Franches-Mnt,92.5,39.7,5,5,20.2
Courtelary,80.2,17.0,15,12,22.2


# Decision Tree Classifier

In [297]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

In [298]:
clf.fit(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']), y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [299]:
y_pred = clf.predict(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']))
y_pred_proba = clf.predict_proba(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']))

In [300]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']), y_train)))

Accuracy of Decision Tree classifier on training set: 0.86


In [301]:
confusion_matrix(y_train, y_pred)

array([[17,  1],
       [ 3,  7]])

In [302]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89        18
           1       0.88      0.70      0.78        10

   micro avg       0.86      0.86      0.86        28
   macro avg       0.86      0.82      0.84        28
weighted avg       0.86      0.86      0.85        28



# Logistic Regression Model

In [303]:
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [304]:
logit.fit(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']), y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [305]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.10994084 -0.44782548]]
Intercept: 
 [0.00750655]


In [306]:
y_pred = logit.predict(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']))

y_pred_proba = logit.predict_proba(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']))

In [307]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']), y_train)))

Accuracy of Logistic Regression classifier on training set: 0.89


In [308]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91        18
           1       0.82      0.90      0.86        10

   micro avg       0.89      0.89      0.89        28
   macro avg       0.88      0.89      0.89        28
weighted avg       0.90      0.89      0.89        28



# K Nearest Neighbors model

In [309]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [310]:
knn.fit(X_train.drop(columns=['Agriculture', 'Examination', 'Education']), y_train)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [311]:
y_pred = knn.predict(X_train.drop(columns=['Agriculture', 'Examination', 'Education']))

y_pred_proba = knn.predict_proba(X_train.drop(columns=['Agriculture', 'Examination', 'Education']))

In [312]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train.drop(columns=['Agriculture', 'Examination', 'Education']), y_train)))

Accuracy of KNN classifier on training set: 0.82


In [313]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86        18
           1       0.78      0.70      0.74        10

   micro avg       0.82      0.82      0.82        28
   macro avg       0.81      0.79      0.80        28
weighted avg       0.82      0.82      0.82        28



# Best Model: Logistic Regression

In [314]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(logit.score(X_test.drop(columns=['Fertility', 'Education', 'Infant.Mortality']), y_test)))

Accuracy of Decision Tree classifier on test set: 0.84
