# General

In [1]:
#Importing required packages.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
#from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
#loading dataset
data = pd.read_excel("PCOS_data.xlsx")

In [14]:
data.head()

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.304017,15,78,22,...,0,1,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.921163,15,74,20,...,0,0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.0,25.270891,11,72,18,...,1,1,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.0,29.674945,13,72,20,...,0,0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.060954,11,72,18,...,0,0,0,120,80,3,4,16.0,14.0,7.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538 entries, 0 to 537
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  538 non-null    int64  
 1   Patient File No.        538 non-null    int64  
 2   PCOS (Y/N)              538 non-null    int64  
 3    Age (yrs)              538 non-null    int64  
 4   Weight (Kg)             538 non-null    float64
 5   Height(Cm)              538 non-null    float64
 6   BMI                     538 non-null    float64
 7   Blood Group             538 non-null    int64  
 8   Pulse rate(bpm)         538 non-null    int64  
 9   RR (breaths/min)        538 non-null    int64  
 10  Hb(g/dl)                538 non-null    float64
 11  Cycle(R/I)              538 non-null    int64  
 12  Cycle length(days)      538 non-null    int64  
 13  Marraige Status (Yrs)   538 non-null    float64
 14  Pregnant(Y/N)           538 non-null    in

In [5]:
#checking to see if there's any null variables
data.isnull().sum()

Sl. No                    0
Patient File No.          0
PCOS (Y/N)                0
 Age (yrs)                0
Weight (Kg)               0
Height(Cm)                0
BMI                       0
Blood Group               0
Pulse rate(bpm)           0
RR (breaths/min)          0
Hb(g/dl)                  0
Cycle(R/I)                0
Cycle length(days)        0
Marraige Status (Yrs)     0
Pregnant(Y/N)             0
No. of aborptions         0
  I   beta-HCG(mIU/mL)    0
II    beta-HCG(mIU/mL)    0
FSH(mIU/mL)               0
LH(mIU/mL)                0
FSH/LH                    0
Hip(inch)                 0
Waist(inch)               0
Waist:Hip Ratio           0
TSH (mIU/L)               0
AMH(ng/mL)                0
PRL(ng/mL)                0
Vit D3 (ng/mL)            0
PRG(ng/mL)                0
RBS(mg/dl)                0
Weight gain(Y/N)          0
hair growth(Y/N)          0
Skin darkening (Y/N)      0
Hair loss(Y/N)            0
Pimples(Y/N)              0
Fast food (Y/N)     

In [6]:
# Yes = 1, No = 0 
data['PCOS (Y/N)'].value_counts()

0    362
1    176
Name: PCOS (Y/N), dtype: int64

In [7]:
#Now seperate the dataset as response variable and feature variabes
X = data.drop('PCOS (Y/N)', axis = 1)
y = data['PCOS (Y/N)']

In [8]:
#Train and Test splitting of data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
problematic_rows = data[data.isna().any(axis=1)]
print(problematic_rows)

Empty DataFrame
Columns: [Sl. No, Patient File No., PCOS (Y/N),  Age (yrs), Weight (Kg), Height(Cm) , BMI, Blood Group, Pulse rate(bpm) , RR (breaths/min), Hb(g/dl), Cycle(R/I), Cycle length(days), Marraige Status (Yrs), Pregnant(Y/N), No. of aborptions,   I   beta-HCG(mIU/mL), II    beta-HCG(mIU/mL), FSH(mIU/mL), LH(mIU/mL), FSH/LH, Hip(inch), Waist(inch), Waist:Hip Ratio, TSH (mIU/L), AMH(ng/mL), PRL(ng/mL), Vit D3 (ng/mL), PRG(ng/mL), RBS(mg/dl), Weight gain(Y/N), hair growth(Y/N), Skin darkening (Y/N), Hair loss(Y/N), Pimples(Y/N), Fast food (Y/N), Reg.Exercise(Y/N), BP _Systolic (mmHg), BP _Diastolic (mmHg), Follicle No. (L), Follicle No. (R), Avg. F size (L) (mm), Avg. F size (R) (mm), Endometrium (mm)]
Index: []

[0 rows x 44 columns]


In [15]:
#Applying Standard scaling to get optimized result
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [11]:
X_train[:10]

array([[ 1.38907388,  1.38907388,  0.10147064, -1.50519515,  0.87572725,
        -1.90800602, -0.4269015 , -0.30157042,  1.55410681,  1.06833578,
        -0.61323442,  0.0268121 , -0.15565209,  1.22746351, -0.42610575,
        -0.21525762, -0.15799755, -0.05119673,  0.63238599, -0.09553344,
        -1.00444241, -1.06003168, -0.18224038, -0.1997793 , -0.28701485,
         1.12257488, -0.06388979, -0.05972601,  0.55208666, -0.80119997,
        -0.61834694, -0.66794749, -0.93308454,  0.95831485,  0.96857851,
         1.74442276, -0.59216739,  0.52348561, -1.23368027, -0.84049889,
         1.12690905,  1.05477412,  0.05147428],
       [-1.39213705, -1.39213705,  0.84591865, -0.13987291, -0.74920128,
         0.19364431,  0.12469631, -0.30157042,  0.40720954, -1.38180199,
        -0.61323442,  0.0268121 ,  1.07129104,  1.22746351,  0.99173286,
        -0.14794646, -0.15273528, -0.05746685, -0.93522329, -0.02644314,
         1.04950917,  0.3614088 , -1.22578411, -0.26808185, -0.53208402,
   

# Random Forest Classifier

In [16]:
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)

In [17]:
X_test[:20]

array([[ 1.32836094e+00,  1.32836094e+00,  8.48938062e-01,
         1.52364861e+00,  1.30327227e+00,  9.86788214e-01,
        -1.50372088e+00, -2.53376661e-01, -7.21687836e-01,
        -3.43340780e-01, -6.29723530e-01,  7.41249317e-02,
        -1.09329488e-01, -7.26843784e-01, -3.99296614e-01,
        -1.39013718e-01, -2.40613825e-01,  2.62617988e-01,
        -8.45177611e-02, -1.72634141e-01,  9.18294908e-01,
         1.82979949e+00,  1.85239225e+00,  2.20381869e+00,
        -6.35159268e-01, -5.44304269e-01, -5.77545154e-01,
        -1.61175213e-01,  6.27279930e-01,  1.35724179e+00,
        -6.10640120e-01, -6.58504608e-01, -8.61356769e-01,
        -8.50531749e-01,  9.63624112e-01, -5.63092506e-01,
        -7.52417401e-01,  6.39293088e-01, -2.01292421e-01,
         5.25608968e-01,  1.37131185e+00,  1.11923679e+00,
        -6.94299501e-01],
       [-1.19975530e+00, -1.19975530e+00, -9.72624402e-01,
         9.05473439e-01,  9.90548083e-01,  5.04763677e-01,
         5.91786925e-01, -2.53

In [13]:
pred_rfc[:20]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
      dtype=int64)

In [18]:
#Let's see how our model performed
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90       110
           1       0.77      0.85      0.81        52

    accuracy                           0.87       162
   macro avg       0.85      0.86      0.85       162
weighted avg       0.88      0.87      0.87       162



In [19]:
#Confusion matrix for the random forest classification
print(confusion_matrix(y_test, pred_rfc))

[[97 13]
 [ 8 44]]


### Accuracy Score

In [20]:
from sklearn.metrics import accuracy_score
cm = accuracy_score(y_test, pred_rfc)

cm

0.8703703703703703

# SVM Classifier

In [24]:
from sklearn import svm
clf=svm.SVC()
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)

In [25]:
#Let's see how our model performed
print(classification_report(y_test, pred_clf))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       110
           1       0.85      0.67      0.75        52

    accuracy                           0.86       162
   macro avg       0.86      0.81      0.83       162
weighted avg       0.86      0.86      0.85       162



In [26]:
#Confusion matrix for the random forest classification
print(confusion_matrix(y_test, pred_clf))

[[104   6]
 [ 17  35]]


In [27]:
from sklearn.metrics import accuracy_score
cm = accuracy_score(y_test, pred_clf)

cm

0.8580246913580247

# Neural Network

In [29]:
from sklearn.neural_network import MLPClassifier

mlpc = MLPClassifier(hidden_layer_sizes=(11,11,11),max_iter=500)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)

In [30]:
#Let's see how our model performed
print(classification_report(y_test, pred_mlpc))

              precision    recall  f1-score   support

           0       0.87      0.88      0.88       110
           1       0.75      0.73      0.74        52

    accuracy                           0.83       162
   macro avg       0.81      0.81      0.81       162
weighted avg       0.83      0.83      0.83       162



In [31]:
#Confusion matrix for the random forest classification
print(confusion_matrix(y_test, pred_mlpc))

[[97 13]
 [14 38]]


In [32]:
data.head(10)

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.304017,15,78,22,...,0,1,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.921163,15,74,20,...,0,0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.0,25.270891,11,72,18,...,1,1,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.0,29.674945,13,72,20,...,0,0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.060954,11,72,18,...,0,0,0,120,80,3,4,16.0,14.0,7.0
5,6,6,0,36,74.1,165.0,27.217631,15,78,28,...,0,0,0,110,70,9,6,16.0,20.0,8.0
6,7,7,0,34,64.0,156.0,26.298488,11,72,18,...,0,0,0,120,80,6,6,15.0,16.0,6.8
7,8,8,0,33,58.5,159.0,23.139907,13,72,20,...,0,0,0,120,80,7,6,15.0,18.0,7.1
8,9,9,0,32,40.0,158.0,16.023073,11,72,18,...,0,0,0,120,80,5,7,17.0,17.0,4.2
9,10,10,0,36,52.0,150.0,23.111111,15,80,20,...,0,0,0,110,80,1,1,14.0,17.0,2.5


In [47]:
Xnew = [[460, 460, 36, 55, 154, 23.2, 13, 74, 18, 10, 2, 5, 8, 0, 0, 8, 1.99, 3.22, 0.1, 32.20, 38, 37, 0.97, 2.91, 5, 22.84, 53.21, 0.26, 92, 0, 0, 0, 1, 0, 1, 0, 110, 80, 4, 1, 22, 14, 10.5]]
Xnew = sc.transform(Xnew)
ynew = rfc.predict(Xnew)
ynew

array([1], dtype=int64)