In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the CSV (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('framingham.csv')

In [3]:
# Preview the first five rows (five is also the default for head()).
df.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Shape of the DataFrame as a (number of rows, number of columns) tuple
df.shape

(4238, 16)

In [5]:
# Remove the 'education' column from the frame.
# Reassigning the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: executing it a second time
# is a no-op rather than a KeyError for the already-removed column.
df = df.drop(columns='education', errors='ignore')

In [6]:
# Print each column's name, non-null count and dtype; a non-null count lower
# than the number of rows means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   currentSmoker    4238 non-null   int64  
 3   cigsPerDay       4209 non-null   float64
 4   BPMeds           4185 non-null   float64
 5   prevalentStroke  4238 non-null   int64  
 6   prevalentHyp     4238 non-null   int64  
 7   diabetes         4238 non-null   int64  
 8   totChol          4188 non-null   float64
 9   sysBP            4238 non-null   float64
 10  diaBP            4238 non-null   float64
 11  BMI              4219 non-null   float64
 12  heartRate        4237 non-null   float64
 13  glucose          3850 non-null   float64
 14  TenYearCHD       4238 non-null   int64  
dtypes: float64(8), int64(7)
memory usage: 496.8 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4238.0,4238.0,4238.0,4209.0,4185.0,4238.0,4238.0,4238.0,4188.0,4238.0,4238.0,4219.0,4237.0,3850.0,4238.0
mean,0.429212,49.584946,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,0.500024,11.920094,0.169584,0.076587,0.462763,0.158316,44.590334,22.038097,11.91085,4.080111,12.026596,23.959998,0.359023
min,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,89.875,28.04,83.0,87.0,0.0
max,1.0,70.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [8]:
# Number of missing entries in each column.
df.isna().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

# Filling missing values

In [9]:
# Fill missing values in 'cigsPerDay' with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['cigsPerDay']: the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
# NOTE(review): 0 may understate consumption if any of these rows have
# currentSmoker == 1 -- confirm before relying on this choice.
df['cigsPerDay'] = df['cigsPerDay'].fillna(0.0)

In [10]:
# Fill missing values in 'BPMeds' with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['BPMeds']: the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
# NOTE(review): BPMeds appears to be a 0/1 indicator, so the mean produces a
# fractional value -- consider the most frequent value instead.
df['BPMeds'] = df['BPMeds'].fillna(df['BPMeds'].mean())

In [11]:
# Fill missing values in 'totChol' with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['totChol']: the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
df['totChol'] = df['totChol'].fillna(df['totChol'].mean())

In [12]:
# Fill missing values in 'BMI' with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['BMI']: the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
df['BMI'] = df['BMI'].fillna(df['BMI'].mean())

In [13]:
# Fill missing values in 'glucose' with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['glucose']: the in-place form mutates an
# intermediate Series, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows, including rows that end
# up in the test split later in the notebook.
df['glucose'] = df['glucose'].fillna(df['glucose'].mean())

In [14]:
# Discard every row that still contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [15]:
# Re-count missing entries per column after filling and dropping.
df.isna().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [16]:
# Separate the target column from the feature columns.
y = df['TenYearCHD']
x = df.drop(columns='TenYearCHD')

In [41]:
# Hold out 20% of the rows as a test set.
# random_state fixes the shuffle so the split (and every metric computed from
# it) is the same on each run; stratify=y keeps the proportion of TenYearCHD
# classes equal in both splits, which matters because the positive class is
# a small minority of the rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

# Standardization
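Standardization rescales each feature so that it has a mean of 0 and a standard deviation of 1. The scaler is fitted on the training split only and then applied to both splits, so no statistics from the test set are used when scaling.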

In [42]:
# Instantiate the scaler (default settings) under the name SS.
SS = StandardScaler()

In [43]:
# Fit the scaler using the training features only.
SS.fit(X=x_train)

In [44]:
# Apply the already-fitted scaler to the test and training features.
x_test_scaled = SS.transform(X=x_test)
x_train_scaled = SS.transform(X=x_train)

# Logistic Regression

In [45]:
# Instantiate a logistic regression classifier (default settings) under the name LR.
LR = LogisticRegression()

In [46]:
# Fit the classifier on the scaled training features and their labels.
LR.fit(X=x_train_scaled, y=y_train)

In [47]:
# Predict class labels for the scaled test features.
y_pred = LR.predict(X=x_test_scaled)

In [48]:
# Fraction of test rows whose predicted label matches the true label.
# Arguments follow the documented (y_true, y_pred) order; accuracy itself is
# symmetric, but keeping the order avoids silently wrong results if this call
# is later swapped for an asymmetric metric such as precision or recall.
# NOTE(review): only about 15% of rows in the raw data are positive (see the
# describe() output), so always predicting 0 would already score close to
# this value -- consider reporting recall or a confusion matrix as well.
accuracy_score(y_test, y_pred)

0.8667452830188679