In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import StringIO
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### 1. Load and inspect the Heart data set

In [None]:
# Location of the Heart data set (CSV) to download.
url="https://www.statlearning.com/s/Heart.csv"
# The timeout keeps this cell from hanging indefinitely on a stalled
# connection; raise_for_status() aborts on an HTTP 4xx/5xx reply so that an
# error page is never handed to the CSV parser further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [None]:
# Build the analysis DataFrame from the downloaded CSV text:
#   1. parse the text,
#   2. one-hot encode ChestPain (first level dropped),
#   3. discard every column that is still non-numeric,
#   4. drop rows that contain missing values.
data = StringIO(response.text)
df = pd.read_csv(data)

# NOTE(review): the published Heart.csv appears to start with a blank-header
# row-number column, which pandas labels "Unnamed: 0". It is numeric, so it
# would survive the filter below and end up as a regression feature. Drop any
# such column if present (no-op otherwise) -- confirm against the raw file.
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]

# dtype=int is required: pandas >= 2.0 emits bool indicator columns by default,
# and bool is not a subtype of np.number, so the select_dtypes filter below
# would silently throw the freshly created ChestPain indicators away.
df = pd.get_dummies(df, columns=['ChestPain'], drop_first=True, dtype=int)

# Remaining non-numeric columns are removed BEFORE dropna so that missing
# values in those discarded columns do not cost any rows.
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
df = df.drop(columns=non_numeric_columns)

# Assignment instead of inplace=True keeps the cell free of in-place mutation.
df = df.dropna()
print(df)

In [None]:
# Collect the DataFrame's column labels into a plain Python list.
col_names = df.columns.tolist()
# Report the collected labels.
print(f'The column names for the Heart.csv data set in a list:: {col_names}.')

In [None]:
# iloc is zero-based and positional, so position 24 selects the 25th row of df.
row_obs = df.iloc[24, :]
# Show the selected row.
print(f'The 25th row in the Heart.csv data set in a list:: {row_obs}.')

### 2. Scatter plots of Age against MaxHR and RestBP

In [None]:
# One figure holding two side-by-side axes.
fig, (plt1, plt2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

# Left panel: Age on x, MaxHR on y.
plt1.scatter(df['Age'], df['MaxHR'], color='blue', alpha=0.5)
plt1.set(xlabel='AGE', ylabel='MAX HR', title='AGE vs MAX HR')

# Right panel: Age on x, RestBP on y.
plt2.scatter(df['Age'], df['RestBP'], color='red', alpha=0.5)
plt2.set(xlabel='AGE', ylabel='REST BP', title='AGE vs REST BP')

# Adjust spacing so the two panels' labels do not collide.
fig.tight_layout()

In [None]:
# Single axes that overlays both series against Age.
fig, main_plt = plt.subplots(figsize=(10, 6))

# Same draw order as before: MaxHR first, then RestBP on top.
for column, colour, legend_label in (
    ('MaxHR', 'blue', 'MAX HR'),
    ('RestBP', 'red', 'REST BP'),
):
    main_plt.scatter(df['Age'], df[column], alpha=0.5, color=colour, label=legend_label)

main_plt.set(
    xlabel='Age',
    ylabel='MAX HR / REST BP',
    title='AGE vs MAX HR and AGE vs REST BP',
)

# The legend is built from the label= values passed to scatter above.
main_plt.legend()

plt.show()


### 3. Linear regression on MaxHR with varying test-set sizes

In [None]:
# Feature matrix: every column of df except the MaxHR target.
X = df.drop(columns='MaxHR')
print(X)

In [None]:
# Target vector: the MaxHR column of df.
y = df.loc[:, 'MaxHR']
print(y)

In [None]:
# Candidate test-set fractions, listed from largest to smallest hold-out.
test_ratio = [
    0.5,
    0.25,
    0.1,
]

In [None]:
# For every hold-out fraction: split X/y, fit a linear regression on the
# training part, and print the mean squared error on both parts.
for ratio in test_ratio:
    # Fixed random_state so each fraction yields the same split on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=ratio,
        random_state=42,
    )

    # fit() hands back the estimator, so construction and fitting can be chained.
    reg_model = LinearRegression().fit(X_train, y_train)

    y_train_predict = reg_model.predict(X_train)
    y_test_predict = reg_model.predict(X_test)

    MSE_train = mean_squared_error(y_true=y_train, y_pred=y_train_predict)
    MSE_test = mean_squared_error(y_true=y_test, y_pred=y_test_predict)

    print(f"Training Error for ratio {ratio: .2f} is {MSE_train: .2f}")
    print(f"Testing Error for ratio {ratio: .2f} is {MSE_test: .2f}")