In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

In [11]:
clean_data = pd.read_csv('data/clean_data.tsv', sep='\t', encoding='utf-8')
clean_data = clean_data.astype(str)
print('Total records:', len(clean_data))

Total records: 72080


In [12]:
clean_data.head()

Unnamed: 0,URBAN-Rural status,Household wealth index in quintiles,Highest educational level,Final say on woman's health care,Unmet need for FP,Whether and when this child's pregnancy wanted,Sex of child,Current age of child in years,Age of child in months,Weight for height percentile (CDC standards),...,Child given nothing as treatment for diarrhea,Child received oral polio 0 vaccination,Child received oral polio 1 vaccination,Child received oral polio 2 vaccination,Year child received oral polio 3 vaccination,"Child received DPT (diphtheria, pertussis, tetanus) 1 vaccination","Child received DPT (diphtheria, pertussis, tetanus) 2 vaccination","Child received DPT (diphtheria, pertussis, tetanus) 3 vaccination",Child received measles (or measles containing) 1 vaccination,Child received BCG (TB) vaccination
0,Urban,Richest,Richest,Husband/partner,Husband/partner,Husband/partner,Male,NIU,NIU,NIU,...,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU
1,Urban,Richest,Richest,Husband/partner,Husband/partner,Husband/partner,Male,NIU,NIU,NIU,...,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU
2,Urban,Richest,Primary,NIU,NIU,NIU,Male,NIU,NIU,NIU,...,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU
3,Urban,Richest,Secondary,Secondary,Secondary,Secondary,Male,NIU,NIU,NIU,...,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU
4,Urban,Richest,Richest,Husband/partner,Husband/partner,Husband/partner,Female,NIU,NIU,NIU,...,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU,NIU


In [65]:
y = np.load('data/labels.npy')

In [14]:
X = np.array(clean_data)
print('Total Records: ', X.shape[0])
print('Total Variables/Attributes: ', X.shape[1])

Total Records:  72080
Total Variables/Attributes:  54


In [38]:
lr_classifier = LogisticRegression(random_state = 0, solver = 'lbfgs', multi_class='multinomial')

In [16]:
from sklearn.preprocessing import LabelEncoder
for i in range (X.shape[1]):
    le = LabelEncoder()
    X[:,i] = le.fit_transform(X[:,i])



In [61]:
le_y = LabelEncoder()
y = le.fit_transform(y)  

In [35]:
from sklearn.model_selection import train_test_split

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [67]:
lr_classifier.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [68]:
print('Testing Accuracy: ',lr_classifier.score(X_test, y_test))

Testing Accuracy:  0.5077136514983351


In [69]:
predicted_y = lr_classifier.predict(X_test)

In [73]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
print('F1-score Macro Average: ', f1_score(y_test, predicted_y, average='macro'))
print('F1-score Micro Average: ', f1_score(y_test, predicted_y, average='micro'))
print('F1-score Weighted Average: ', f1_score(y_test, predicted_y, average='weighted'))




F1-score Macro Average:  0.027777361562862073
F1-score Micro Average:  0.5077136514983351
F1-score Weighted Average:  0.38828464978477867


  'precision', 'predicted', average, warn_for)


In [71]:
print(classification_report(y_test, predicted_y))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.61      0.99      0.75      8392
           1       0.21      0.42      0.28      1017
          10       0.00      0.00      0.00       244
          11       0.00      0.00      0.00       240
          12       0.16      0.29      0.20       947
          13       0.00      0.00      0.00       194
          14       0.00      0.00      0.00       155
          15       0.00      0.00      0.00       115
          16       0.00      0.00      0.00        92
          17       0.00      0.00      0.00        71
          18       0.00      0.00      0.00       250
          19       0.00      0.00      0.00        50
           2       0.00      0.00      0.00       767
          20       0.00      0.00      0.00        66
          21       0.00      0.00      0.00        25
          22       0.00      0.00      0.00        36
          23       0.00      0.00      0.00        35
          24       0.22    

# ROC Curve and AUC cannot be calculated on multi-class or regression problems. So it is necessary to binarize the output

In [75]:
cm = confusion_matrix(y_test, predicted_y)

In [84]:
fp = cm.sum(axis=0) - np.diag(cm)
fn = cm.sum(axis=1) - np.diag(cm)
tp = np.diag(cm)
tn = cm.sum() - (fp + fn + tp)

In [88]:
# Recall, Sensitivity, Hit Rate, TPR

tpr = tp/(tp+fn)

In [90]:
print('TPR/Sensitivity for each class', tpr)

TPR/Sensitivity for each class [0.99010963 0.41691249 0.         0.         0.28827878 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.13043478
 0.         0.         0.         0.         0.         0.02319236
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.        ]


In [91]:
# Specifitivity, TNR

tnr = tn/(tn+fp)

In [92]:
print('TNR/Specifitivity', tnr)

TNR/Specifitivity [0.44640631 0.90642828 1.         1.         0.91360628 1.
 1.         1.         1.         1.         0.99994373 1.
 0.99965223 1.         1.         1.         1.         0.97378914
 1.         1.         1.         1.         1.         0.99878521
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.        ]
