In [223]:
# Wrangling
import pandas as pd

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the swiss data set
from pydataset import data

df = data('swiss')

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

In [224]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [225]:
df.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [226]:
df['is_catholic'] = df['Catholic'] > 70

In [227]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,9.96,22.2,False
Delemont,83.1,45.1,6,9,84.84,22.2,True
Franches-Mnt,92.5,39.7,5,5,93.4,20.2,True
Moutier,85.8,36.5,12,7,33.77,20.3,False
Neuveville,76.9,43.5,17,15,5.16,20.6,False


In [228]:
df['is_catholic'] = df['is_catholic'].astype('category',copy=False)
df = df.drop(columns='Catholic')

In [229]:
df.dtypes

Fertility            float64
Agriculture          float64
Examination            int64
Education              int64
Infant.Mortality     float64
is_catholic         category
dtype: object

In [230]:
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality,is_catholic
Courtelary,80.2,17.0,15,12,22.2,False
Delemont,83.1,45.1,6,9,22.2,True
Franches-Mnt,92.5,39.7,5,5,20.2,True
Moutier,85.8,36.5,12,7,20.3,False
Neuveville,76.9,43.5,17,15,20.6,False


In [231]:
X = df.drop(columns='is_catholic')
y = df[['is_catholic']]
y = y.astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .40, random_state = 123)

X_train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Infant.Mortality
La Chauxdfnd,65.7,7.7,29,11,20.5
Glane,92.4,67.8,14,8,24.9
Morges,65.5,59.8,22,10,18.0
Rive Gauche,42.8,27.7,22,29,19.3
Moutier,85.8,36.5,12,7,20.3


# Decision Tree Classifier

In [232]:
clf = DecisionTreeClassifier(max_depth=2, random_state=123)

In [233]:
clf.fit(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']), y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [234]:
y_pred = clf.predict(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']))
y_pred_proba = clf.predict_proba(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']))

In [235]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train.drop(columns=['Agriculture', 'Examination', 'Infant.Mortality']), y_train)))

Accuracy of Decision Tree classifier on training set: 0.82


In [236]:
confusion_matrix(y_train, y_pred)

array([[19,  0],
       [ 5,  4]])

In [237]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88        19
           1       1.00      0.44      0.62         9

   micro avg       0.82      0.82      0.82        28
   macro avg       0.90      0.72      0.75        28
weighted avg       0.86      0.82      0.80        28



# Logistic Regression Model

In [238]:
logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [239]:
logit.fit(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']), y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [240]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.09827056 -0.37194342]]
Intercept: 
 [-0.00138121]


In [241]:
y_pred = logit.predict(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']))

y_pred_proba = logit.predict_proba(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']))

In [242]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train.drop(columns=['Fertility', 'Education', 'Infant.Mortality']), y_train)))

Accuracy of Logistic Regression classifier on training set: 0.89


In [243]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.89      0.92        19
           1       0.80      0.89      0.84         9

   micro avg       0.89      0.89      0.89        28
   macro avg       0.87      0.89      0.88        28
weighted avg       0.90      0.89      0.89        28



# K Nearest Neighbors model

In [244]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [245]:
knn.fit(X_train.drop(columns=['Agriculture', 'Examination', 'Education']), y_train)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [246]:
y_pred = knn.predict(X_train.drop(columns=['Agriculture', 'Examination', 'Education']))

y_pred_proba = knn.predict_proba(X_train.drop(columns=['Agriculture', 'Examination', 'Education']))

In [247]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train.drop(columns=['Agriculture', 'Examination', 'Education']), y_train)))

Accuracy of KNN classifier on training set: 0.79


In [248]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84        19
           1       0.67      0.67      0.67         9

   micro avg       0.79      0.79      0.79        28
   macro avg       0.75      0.75      0.75        28
weighted avg       0.79      0.79      0.79        28



# Best Model: Decision Tree Classifier

In [249]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(logit.score(X_test.drop(columns=['Fertility', 'Education', 'Infant.Mortality']), y_test)))

Accuracy of Decision Tree classifier on test set: 0.68
