# Contraceptive_method_used

In [146]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report 
from statistics import mean, stdev
from sklearn.model_selection import KFold, cross_val_score, cross_validate


# Data preprocessing

In [None]:
df=pd.read_csv("BayesianNetworkGenerator_cmc.csv")

In [110]:
df.shape

(55296, 10)

In [75]:
df.dtypes

Wifes_age                       int64
Wifes_education                 int64
Husbands_education              int64
Number_of_children_ever_born    int64
Wifes_religion                  int64
Wifes_now_working%3F            int64
Husbands_occupation             int64
Standard-of-living_index        int64
Media_exposure                  int64
Contraceptive_method_used       int64
dtype: object

In [14]:
df.isnull().sum()

Wifes_age                       0
Wifes_education                 0
Husbands_education              0
Number_of_children_ever_born    0
Wifes_religion                  0
Wifes_now_working%3F            0
Husbands_occupation             0
Standard-of-living_index        0
Media_exposure                  0
Contraceptive_method_used       0
dtype: int64

In [74]:
df.head(5)

Unnamed: 0,Wifes_age,Wifes_education,Husbands_education,Number_of_children_ever_born,Wifes_religion,Wifes_now_working%3F,Husbands_occupation,Standard-of-living_index,Media_exposure,Contraceptive_method_used
0,2,2,3,2,0,1,3,4,0,3
1,2,2,3,1,1,1,2,2,0,1
2,2,1,2,1,1,1,3,3,0,3
3,1,1,1,1,1,1,2,3,0,2
4,2,4,3,1,1,0,3,4,0,2


In [13]:
df["Contraceptive_method_used"].value_counts() # target variable contain 3 values

1    23655
3    19086
2    12555
Name: Contraceptive_method_used, dtype: int64

In [66]:

df['Number_of_children_ever_born'] = df['Number_of_children_ever_born'].replace("\'\\'B1of3\\''", 1)
df['Number_of_children_ever_born'] = df['Number_of_children_ever_born'].replace("\'\\'B2of3\\''", 2)
df['Number_of_children_ever_born'] = df['Number_of_children_ever_born'].replace("\'\\'B3of3\\''", 3)

In [73]:
df['Wifes_age'] = df['Wifes_age'].replace("\'\\'B1of3\\''", 1)
df['Wifes_age'] = df['Wifes_age'].replace("\'\\'B2of3\\''", 2)
df['Wifes_age'] = df['Wifes_age'].replace("\'\\'B3of3\\''", 3)

split data to dependent (target) and independent variables

In [99]:
X= df.drop('Contraceptive_method_used',axis=1)
y = df.Contraceptive_method_used


In [109]:
X.shape

(55296, 9)

In [91]:
#split train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=0)


In [117]:

#import classifier

logreg = LogisticRegression(C=1, solver='lbfgs', multi_class='ovr')


#train classifier
logreg.fit(X_train,y_train)

#classifier performance on test set
logreg.score(X_test,y_test)

print('Accuracy of logistic regression classifier on train set: {:.2f}'
      .format(logreg.score(X_train,y_train)))

print('Accuracy of logistic regression classifier on test set: {:.2f}'
      .format(logreg.score(X_test,y_test)))




Accuracy of logistic regression classifier on train set: 0.49
Accuracy of logistic regression classifier on test set: 0.49


In [None]:
#predictions for test
y_pred = logreg.predict(X_test)

In [119]:
accuracy_score(y_test,y_pred)


0.48833564410151303

In [120]:
confusion =confusion_matrix(y_test,y_pred)

In [132]:

print('Confusion Matrix:\n', confusion)
print("\n Accuracy: {:.2f}".format(accuracy_score(y_test,y_pred)))


Confusion Matrix:
 [[4353  910 1884]
 [1291 1542  904]
 [2568  931 2206]]

 Accuracy: 0.49


In [138]:
#Classification Report including precision and recall scores:

print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           1       0.53      0.61      0.57      7147
           2       0.46      0.41      0.43      3737
           3       0.44      0.39      0.41      5705

   micro avg       0.49      0.49      0.49     16589
   macro avg       0.48      0.47      0.47     16589
weighted avg       0.48      0.49      0.48     16589



In [150]:
#Runnig K-Fold:

#from statistics import mean, stdev
#from sklearn.model_selection import KFold, cross_val_score, cross_validate

kf=KFold(n_splits=5, shuffle=True, random_state=None)

logreg = LogisticRegression(C=1, solver='lbfgs', multi_class='ovr')
kf_scores=[]

for train_index, test_index in kf.split(X):
    X_train, y_train=X.iloc[train_index], y.iloc[train_index]
    logreg.fit(X_train,y_train)
    y_pred = logreg.predict(X_test)
    kf_scores.append(accuracy_score(y_test,y_pred))
print(kf_scores)

print('Mean of testing accuracy over 5 folds = %.2f'% mean(kf_scores), 'with std = %.2f'%
      stdev(kf_scores))

[0.433178612333474, 0.43293748869732956, 0.4337814214238351, 0.4332388932425101, 0.43565012960395444]
Mean of testing accuracy over 5 folds = 0.43 with std = 0.00


Conclusion: For Logistic regression, the model is not the best to use with this data set, the accuracy is weak even if I change C value.