# General

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
#loading dataset
data = pd.read_excel("PCOS_data.xlsx")

In [3]:
data.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.304017,15,78,22,...,0,1,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.921163,15,74,20,...,0,0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.0,25.270891,11,72,18,...,1,1,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.0,29.674945,13,72,20,...,0,0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.060954,11,72,18,...,0,0,0,120,80,3,4,16.0,14.0,7.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 0 to 537
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  538 non-null    int64  
 1   Patient File No.        538 non-null    int64  
 2   PCOS (Y/N)              538 non-null    int64  
 3    Age (yrs)              538 non-null    int64  
 4   Weight (Kg)             538 non-null    float64
 5   Height(Cm)              538 non-null    float64
 6   BMI                     538 non-null    float64
 7   Blood Group             538 non-null    int64  
 8   Pulse rate(bpm)         538 non-null    int64  
 9   RR (breaths/min)        538 non-null    int64  
 10  Hb(g/dl)                538 non-null    float64
 11  Cycle(R/I)              538 non-null    int64  
 12  Cycle length(days)      538 non-null    int64  
 13  Marraige Status (Yrs)   538 non-null    float64
 14  Pregnant(Y/N)           538 non-null    in

In [5]:
#checking to see if there's any null variables
data.isnull().sum()

Sl. No                    0
Patient File No.          0
PCOS (Y/N)                0
 Age (yrs)                0
Weight (Kg)               0
Height(Cm)                0
BMI                       0
Blood Group               0
Pulse rate(bpm)           0
RR (breaths/min)          0
Hb(g/dl)                  0
Cycle(R/I)                0
Cycle length(days)        0
Marraige Status (Yrs)     0
Pregnant(Y/N)             0
No. of aborptions         0
  I   beta-HCG(mIU/mL)    0
II    beta-HCG(mIU/mL)    0
FSH(mIU/mL)               0
LH(mIU/mL)                0
FSH/LH                    0
Hip(inch)                 0
Waist(inch)               0
Waist:Hip Ratio           0
TSH (mIU/L)               0
AMH(ng/mL)                0
PRL(ng/mL)                0
Vit D3 (ng/mL)            0
PRG(ng/mL)                0
RBS(mg/dl)                0
Weight gain(Y/N)          0
hair growth(Y/N)          0
Skin darkening (Y/N)      0
Hair loss(Y/N)            0
Pimples(Y/N)              0
Fast food (Y/N)     

In [6]:
print ("Number of rows in the dataset  : " ,data.shape[0])
print ("Number of Columns in the dataset : " ,data.shape[1])
print ("Number of Features : \n" ,data.columns.tolist())
print ("Missing values :  ", data.isnull().sum().values.sum())
print ("Unique values :  \n",data.nunique())

Number of rows in the dataset  :  538
Number of Columns in the dataset :  44
Number of Features : 
 ['Sl. No', 'Patient File No.', 'PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of aborptions', '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)']
Missing values :   0
Unique values :  
 Sl. No                    538
Patient File No.          538
PCOS (Y/N)        

In [7]:
# Yes = 1, No = 0 
data['PCOS (Y/N)'].value_counts()

0    362
1    176
Name: PCOS (Y/N), dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
#spliting the testing and training data 

df = data
df_test = df[430:]
df_train = df[:430]

#Now seperate the dataset as response variable and feature variabes
X_test = data.drop('PCOS (Y/N)', axis = 1)
Y_test = data['PCOS (Y/N)']

In [10]:
# Separate majority and minority classes
df_majority = df_train[df_train['PCOS (Y/N)'] == 0]
df_minority = df_train[df_train['PCOS (Y/N)'] == 1]

In [11]:
df_majority.shape, df_minority.shape

((292, 44), (138, 44))

# Upsample

In [12]:
from sklearn.utils import resample

In [13]:
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=362,    # to match majority class #length of df_majority
                                 random_state=123) 

**Upsampling on the dataset is done for increasing the size of the minority class in the dataset.**

In [14]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [15]:
df_upsampled

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.304017,15,78,22,...,0,1,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.921163,15,74,20,...,0,0,0,120,70,3,5,15.0,14.0,3.7
3,4,4,0,37,65.0,148.0,29.674945,13,72,20,...,0,0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.060954,11,72,18,...,0,0,0,120,80,3,4,16.0,14.0,7.0
5,6,6,0,36,74.1,165.0,27.217631,15,78,28,...,0,0,0,110,70,9,6,16.0,20.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212,214,214,1,26,70.0,150.0,31.111111,15,72,20,...,1,1,0,110,80,8,7,17.0,14.0,6.0
229,231,231,1,40,71.0,152.0,30.730609,15,72,18,...,0,1,0,120,80,4,2,17.0,10.0,12.0
124,125,125,1,26,78.0,159.0,30.853210,13,78,20,...,1,1,1,120,80,8,10,16.0,18.0,8.5
113,114,114,1,32,63.0,164.0,23.423557,17,72,18,...,1,1,1,110,80,5,6,18.0,21.0,9.0


In [16]:
df_up=df_upsampled.sample(frac=1)

In [17]:
df_up['PCOS (Y/N)'].value_counts()

1    362
0    292
Name: PCOS (Y/N), dtype: int64

In [18]:
# Dividing the dataset into two part one having only the target value and other having all other columns
X_up = df_up.drop('PCOS (Y/N)', axis=1)
Y_up = df_up['PCOS (Y/N)']

X_up.shape, X_test.shape,Y_up.shape, Y_test.shape

((654, 43), (538, 43), (654,), (538,))

In [19]:
X_up=X_up.drop(['Sl. No', 'Patient File No.'],axis=1)
X_test=X_test.drop(['Sl. No', 'Patient File No.'],axis=1)
#X_test=X_test.drop('Patient File No.', axis=1)

from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_up,Y_up)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
cf_matrix=confusion_matrix(y_pred,Y_test)
cf_matrix

accuracy_score :  0.912639405204461
precision_score :  0.8177339901477833
recall_score :  0.9431818181818182
f1_score :  0.8759894459102903
              precision    recall  f1-score   support

     class 0       0.97      0.90      0.93       362
     class 1       0.82      0.94      0.88       176

    accuracy                           0.91       538
   macro avg       0.89      0.92      0.90       538
weighted avg       0.92      0.91      0.91       538



array([[325,  10],
       [ 37, 166]], dtype=int64)

# Downsample

**Downsampling on the dataset is done for decreasing the size of the majority class in the dataset.**

In [20]:
df_majority_downsampled = resample(df_majority, 
                                 replace=True,    # sample without replacement
                                 n_samples=138,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [21]:
df_down=df_downsampled.sample(frac=1)

In [22]:
df_down['PCOS (Y/N)'].value_counts()

0    138
1    138
Name: PCOS (Y/N), dtype: int64

In [23]:
# Dividing the dataset into two part one having onlty the target value and other having all other columns
X_down = df_down.drop('PCOS (Y/N)', axis=1)
Y_down= df_down['PCOS (Y/N)']

X_down.shape, X_test.shape,Y_down.shape, Y_test.shape

((276, 43), (538, 41), (276,), (538,))

In [24]:
#X_down=X_down.drop('Patient File No.', axis=1)
#X_test=X_test.drop('Patient File No.', axis=1)
X_down=X_down.drop(['Sl. No', 'Patient File No.'], axis=1)

from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_down, Y_down)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
cf_matrix=confusion_matrix(y_pred, Y_test)
cf_matrix

accuracy_score :  0.8847583643122676
precision_score :  0.7688679245283019
recall_score :  0.9261363636363636
f1_score :  0.8402061855670104
              precision    recall  f1-score   support

     class 0       0.96      0.86      0.91       362
     class 1       0.77      0.93      0.84       176

    accuracy                           0.88       538
   macro avg       0.86      0.90      0.88       538
weighted avg       0.90      0.88      0.89       538



array([[313,  13],
       [ 49, 163]], dtype=int64)

# SMOTE

SMOTE stands for Synthetic Minority Oversampling Technique. This is a statistical technique for increasing the number of cases in your dataset in a balanced way. The module works by generating new instances from existing minority cases that you supply as input.

In [25]:
df_train['PCOS (Y/N)'].value_counts()

0    292
1    138
Name: PCOS (Y/N), dtype: int64

In [26]:
X_train_smote=df_train.drop(['Sl. No', 'Patient File No.','PCOS (Y/N)'],axis=1)
Y_train_smote=df_train['PCOS (Y/N)']
X_train_smote.shape,Y_train_smote.shape

((430, 41), (430,))

In [27]:
pip install --upgrade imbalanced-learn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [28]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train_smote, Y_train_smote.ravel())

In [29]:
X_train_res.shape, y_train_res.shape

((584, 41), (584,))

In [30]:
len(y_train_res[y_train_res==0]),len(y_train_res[y_train_res==1])

(292, 292)

In [31]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_train_res, y_train_res)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
cf_matrix=confusion_matrix(y_pred, Y_test)
cf_matrix

accuracy_score :  0.8828996282527881
precision_score :  0.775609756097561
recall_score :  0.9034090909090909
f1_score :  0.8346456692913385
              precision    recall  f1-score   support

     class 0       0.95      0.87      0.91       362
     class 1       0.78      0.90      0.83       176

    accuracy                           0.88       538
   macro avg       0.86      0.89      0.87       538
weighted avg       0.89      0.88      0.88       538



array([[316,  17],
       [ 46, 159]], dtype=int64)

# ADASYN

ADASYN is based on the idea of adaptively generating minority data samples according to their distributions: more synthetic data is generated for minority class samples that are harder to learn compared to those minority samples that are easier to learn.

In [32]:
X_train_adas=df_train.drop(['Sl. No', 'Patient File No.','PCOS (Y/N)'],axis=1)
Y_train_adas=df_train['PCOS (Y/N)']
X_train_adas.shape,Y_train_adas.shape

((430, 41), (430,))

In [33]:
from imblearn.over_sampling import ADASYN
sm = ADASYN(random_state = 2)
X_train_ada, y_train_ada = sm.fit_resample(X_train_adas, Y_train_adas.ravel())

In [34]:
X_train_ada.shape, y_train_ada.shape


((576, 41), (576,))

In [35]:
len(y_train_ada[y_train_ada==0]),len(y_train_ada[y_train_ada==1])


(292, 284)

In [36]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_train_ada, y_train_ada)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
cf_matrix=confusion_matrix(y_pred, Y_test)
cf_matrix

accuracy_score :  0.8494423791821561
precision_score :  0.7130044843049327
recall_score :  0.9034090909090909
f1_score :  0.7969924812030075
              precision    recall  f1-score   support

     class 0       0.95      0.82      0.88       362
     class 1       0.71      0.90      0.80       176

    accuracy                           0.85       538
   macro avg       0.83      0.86      0.84       538
weighted avg       0.87      0.85      0.85       538





array([[298,  17],
       [ 64, 159]], dtype=int64)

# SMOTE + ENN

SMOTE + ENN is another hybrid technique where more no. of observations are removed from the sample space. Here, ENN is yet another undersampling technique where the nearest neighbors of each of the majority class is estimated. If the nearest neighbors misclassify that particular instance of the majority class, then that instance gets deleted. Integrating this technique with oversampled data done by SMOTE helps in doing extensive data cleaning. Here on misclassification by NN’s samples from both the classes are removed. This results in a more clear and concise class separation.

In [37]:
X_train_se=df_train.drop(['Sl. No', 'Patient File No.','PCOS (Y/N)'],axis=1)
Y_train_se=df_train['PCOS (Y/N)']
X_train_se.shape,Y_train_se.shape

((430, 41), (430,))

In [38]:
from imblearn.combine import SMOTEENN
sm = SMOTEENN(random_state = 2)
X_train_sen, y_train_sen = sm.fit_resample(X_train_se, Y_train_se.ravel())

In [39]:
X_train_sen.shape,y_train_sen.shape

((264, 41), (264,))

In [40]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear').fit(X_train_sen, y_train_sen)
y_pred = logistic_model.predict(X_test)

acc = accuracy_score(Y_test, y_pred)
prec = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("f1_score : ", f1)

target_names = ['class 0', 'class 1']
print(classification_report(Y_test, y_pred, target_names=target_names))
cf_matrix=confusion_matrix(y_pred, Y_test)
cf_matrix

accuracy_score :  0.8197026022304833
precision_score :  0.6549019607843137
recall_score :  0.9488636363636364
f1_score :  0.7749419953596288
              precision    recall  f1-score   support

     class 0       0.97      0.76      0.85       362
     class 1       0.65      0.95      0.77       176

    accuracy                           0.82       538
   macro avg       0.81      0.85      0.81       538
weighted avg       0.87      0.82      0.83       538



array([[274,   9],
       [ 88, 167]], dtype=int64)