# Linear Regression based model 

In [1]:
# import pandas to read the dataset 
import pandas as pd


In [11]:
df = pd.read_csv('train.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       629997 non-null  float64
 1   Age                      630000 non-null  int64  
 2   Sex                      630000 non-null  int64  
 3   Chest pain type          629997 non-null  float64
 4   BP                       629990 non-null  float64
 5   Cholesterol              629984 non-null  float64
 6   FBS over 120             629994 non-null  float64
 7   EKG results              630000 non-null  int64  
 8   Max HR                   629979 non-null  float64
 9   Exercise angina          629998 non-null  float64
 10  ST depression            629989 non-null  float64
 11  Slope of ST              630000 non-null  int64  
 12  Number of vessels fluro  630000 non-null  int64  
 13  Thallium                 630000 non-null  int64  
 14  Hear

In [12]:
df.isna().sum()

id                          3
Age                         0
Sex                         0
Chest pain type             3
BP                         10
Cholesterol                16
FBS over 120                6
EKG results                 0
Max HR                     21
Exercise angina             2
ST depression              11
Slope of ST                 0
Number of vessels fluro     0
Thallium                    0
Heart Disease              47
dtype: int64

# Handling Missing Values

In [13]:
def handle_miss_values(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].median(), inplace=True)
    return df
handle_miss_values(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0.0,58,1,4.0,152.0,239.0,0.0,0,158.0,1.0,3.6,2,2,7,Presence
1,1.0,52,1,1.0,125.0,325.0,0.0,2,171.0,0.0,0.0,1,0,3,Absence
2,2.0,56,0,2.0,160.0,188.0,0.0,2,151.0,0.0,0.0,1,0,3,Absence
3,3.0,44,0,3.0,134.0,229.0,0.0,2,150.0,0.0,1.0,2,0,3,Absence
4,4.0,58,1,4.0,140.0,234.0,0.0,2,125.0,1.0,3.8,2,3,3,Presence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629995,629995.0,56,0,1.0,110.0,226.0,0.0,0,132.0,0.0,0.0,1,0,7,Absence
629996,629996.0,54,1,4.0,128.0,249.0,1.0,2,150.0,0.0,0.0,2,0,3,Absence
629997,629997.0,67,1,4.0,130.0,275.0,0.0,0,149.0,0.0,0.0,1,2,7,Presence
629998,629998.0,52,1,4.0,140.0,199.0,0.0,2,157.0,0.0,0.0,1,0,6,Presence


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       630000 non-null  float64
 1   Age                      630000 non-null  int64  
 2   Sex                      630000 non-null  int64  
 3   Chest pain type          630000 non-null  float64
 4   BP                       630000 non-null  float64
 5   Cholesterol              630000 non-null  float64
 6   FBS over 120             630000 non-null  float64
 7   EKG results              630000 non-null  int64  
 8   Max HR                   630000 non-null  float64
 9   Exercise angina          630000 non-null  float64
 10  ST depression            630000 non-null  float64
 11  Slope of ST              630000 non-null  int64  
 12  Number of vessels fluro  630000 non-null  int64  
 13  Thallium                 630000 non-null  int64  
 14  Hear

# Encoding process

In [14]:
df['Heart Disease'] = df['Heart Disease'].map({"Absence": 0, "Presence": 1})  # Only one columns is an objecty type so we've mapped it



In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       630000 non-null  float64
 1   Age                      630000 non-null  int64  
 2   Sex                      630000 non-null  int64  
 3   Chest pain type          630000 non-null  float64
 4   BP                       630000 non-null  float64
 5   Cholesterol              630000 non-null  float64
 6   FBS over 120             630000 non-null  float64
 7   EKG results              630000 non-null  int64  
 8   Max HR                   630000 non-null  float64
 9   Exercise angina          630000 non-null  float64
 10  ST depression            630000 non-null  float64
 11  Slope of ST              630000 non-null  int64  
 12  Number of vessels fluro  630000 non-null  int64  
 13  Thallium                 630000 non-null  int64  
 14  Hear

In [16]:
df.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0.0,58,1,4.0,152.0,239.0,0.0,0,158.0,1.0,3.6,2,2,7,1
1,1.0,52,1,1.0,125.0,325.0,0.0,2,171.0,0.0,0.0,1,0,3,0
2,2.0,56,0,2.0,160.0,188.0,0.0,2,151.0,0.0,0.0,1,0,3,0
3,3.0,44,0,3.0,134.0,229.0,0.0,2,150.0,0.0,1.0,2,0,3,0
4,4.0,58,1,4.0,140.0,234.0,0.0,2,125.0,1.0,3.8,2,3,3,1


In [20]:
# Remove non-competetive elements
df.drop(['id', 'Sex'], axis=1, inplace=True)



In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Age                      630000 non-null  int64  
 1   Chest pain type          630000 non-null  float64
 2   BP                       630000 non-null  float64
 3   Cholesterol              630000 non-null  float64
 4   FBS over 120             630000 non-null  float64
 5   EKG results              630000 non-null  int64  
 6   Max HR                   630000 non-null  float64
 7   Exercise angina          630000 non-null  float64
 8   ST depression            630000 non-null  float64
 9   Slope of ST              630000 non-null  int64  
 10  Number of vessels fluro  630000 non-null  int64  
 11  Thallium                 630000 non-null  int64  
 12  Heart Disease            630000 non-null  int64  
dtypes: float64(7), int64(6)
memory usage: 62.5 MB


# Scaling process

In [22]:
from sklearn.preprocessing import MinMaxScaler
def do_scale(df):
    scaler = MinMaxScaler()
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('Heart Disease')
    for col in df.columns:
        df[num_cols] = scaler.fit_transform(df[num_cols])
    return df
df = do_scale(df)
        


In [23]:
df.head()

Unnamed: 0,Age,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0.604167,1.0,0.54717,0.257991,0.0,0.0,0.664122,1.0,0.580645,0.5,0.666667,1.0,1
1,0.479167,0.0,0.292453,0.454338,0.0,1.0,0.763359,0.0,0.0,0.0,0.0,0.0,0
2,0.5625,0.333333,0.622642,0.141553,0.0,1.0,0.610687,0.0,0.0,0.0,0.0,0.0,0
3,0.3125,0.666667,0.377358,0.23516,0.0,1.0,0.603053,0.0,0.16129,0.5,0.0,0.0,0
4,0.604167,1.0,0.433962,0.246575,0.0,1.0,0.412214,1.0,0.612903,0.5,1.0,0.0,1


# Model Selection 

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
x = df.drop('Heart Disease', axis=1)
y = df['Heart Disease']

In [26]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.2, random_state = 42
)

# Linear Regression 

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
lr = LinearRegression()
lr

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [29]:
lr.fit(x_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


# Prediction

In [32]:
y_prediction = lr.predict(x_test)

In [33]:
y_prediction[1:10]

array([ 0.3504888 ,  1.10356494,  0.97725168,  0.09925729,  0.22893662,
       -0.06421276,  0.7041533 ,  1.12424467,  0.99961435])

# Evaluation 

In [34]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [35]:
r2 = r2_score(y_test, y_prediction)
mae = mean_absolute_error(y_test, y_prediction)
mse = mean_squared_error(y_test, y_prediction)
rmse = pow(mse, (1/2))



In [36]:
print('r2: ', r2)
print('mae: ', mae)
print('mse: ', mse)
print('rmse: ', rmse)

r2:  0.5911503336690405
mae:  0.2445974326080288
mse:  0.10110114276777112
rmse:  0.3179640589245444


# So the results are: 

# r2:  0.5911503336690405
# mae:  0.2445974326080288
# mse:  0.10110114276777112
# rmse:  0.3179640589245444