## Loading the libraries

In [38]:
import pandas as pd # for loading the data
from sklearn.preprocessing import LabelEncoder,StandardScaler # encoding categories
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, mean_squared_error

## The Data

In [2]:
stroke = pd.read_csv("data/healthcare-dataset-stroke-data.csv")
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


### Checking for missing data

In [6]:
stroke.shape[0]

5110

In [8]:
(stroke.isna().sum()/stroke.shape[0])*100

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64

In [9]:
# we can clean the missing data by 

# dropping them.
# we are going with this option
# i will cover other methods later on
stroke = stroke.dropna()
(stroke.isna().sum()/stroke.shape[0])*100
# imputing missing data.

id                   0.0
gender               0.0
age                  0.0
hypertension         0.0
heart_disease        0.0
ever_married         0.0
work_type            0.0
Residence_type       0.0
avg_glucose_level    0.0
bmi                  0.0
smoking_status       0.0
stroke               0.0
dtype: float64

In [10]:
# checking the data types of the variables
# most of the variables should be categorical
stroke.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [12]:
# convert the variables into factors
# defining the columns
cols = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status','stroke']

# converting the columns to categorical
stroke[cols] = stroke[cols].astype('category')

stroke.dtypes

id                      int64
gender               category
age                   float64
hypertension         category
heart_disease        category
ever_married         category
work_type            category
Residence_type       category
avg_glucose_level     float64
bmi                   float64
smoking_status       category
stroke               category
dtype: object

In [13]:
# dropping the id column
stroke = stroke.drop('id',axis=1)

In [14]:
# encoding the categorical columns
# define the encoder
le = LabelEncoder()

# encoding
stroke[cols] = stroke[cols].apply(lambda col: le.fit_transform(col))

In [15]:
stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,0,13.0,0,0,0,4,0,103.08,18.6,0,0
5106,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,1,51.0,0,0,1,2,0,166.29,25.6,1,0


## Data splitting

In [17]:
stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [22]:
# defining the X na y
X = stroke.loc[:, 'gender':'smoking_status']
y = stroke[['stroke']]

In [24]:
# convert y to numpy array
y = y.to_numpy()

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=420)

In [29]:
## scalling the x data
# defining the scaler
sc = StandardScaler()

# x_train scaled
X_train_scalled = sc.fit_transform(X_train)

In [30]:
X_train_scalled

array([[ 1.21171845,  1.61342178,  3.1098546 , ..., -0.74626918,
        -0.68423133, -0.34217283],
       [-0.8235332 , -0.12229107, -0.32155844, ..., -0.09622398,
         0.42442552,  1.52279969],
       [-0.8235332 ,  0.18924713, -0.32155844, ...,  0.15084695,
        -0.54405633,  1.52279969],
       ...,
       [-0.8235332 , -1.10141114, -0.32155844, ..., -0.83855679,
        -1.21944498,  0.59031343],
       [ 1.21171845, -1.54646572, -0.32155844, ..., -0.99199613,
        -1.25767453, -1.27465908],
       [-0.8235332 ,  0.41177442, -0.32155844, ..., -0.35315088,
        -0.18724722, -1.27465908]])

In [34]:
## the lr model
model = LogisticRegression().fit(X_train_scalled, y_train.ravel())
model

LogisticRegression()

In [36]:
# making predictions

# scaling the test set
X_test_scalled = sc.fit_transform(X_test)

# predictions
pred = model.predict(X_test_scalled)

In [37]:
model.predict_proba(X_test_scalled)

array([[0.90874099, 0.09125901],
       [0.91862779, 0.08137221],
       [0.99552612, 0.00447388],
       ...,
       [0.90887481, 0.09112519],
       [0.97355449, 0.02644551],
       [0.99714061, 0.00285939]])

In [39]:
confusion_matrix(y_test, pred)

array([[939,   0],
       [ 42,   1]], dtype=int64)

In [40]:
mean_squared_error(y_test, pred)

0.04276985743380855

In [None]:
# there is an easier way to collect all these steps into a pipeline.
# i will cover that in later videos
# thanks for watching.