In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

In [5]:
# Load the diabetes dataset from the working directory, then preview the
# first five rows followed by the column labels.
df = pd.read_csv('diabetes_updated.csv')
print(df.head(), df.columns, sep='\n')

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [6]:
# Separate the independent variables (X) from the dependent variable (Y):
# X is every column except 'Outcome', Y is the 'Outcome' column itself.
X = df.drop(columns='Outcome')
Y = df['Outcome']

# Hold out 20% of the rows as a test set. random_state is pinned so the same
# split is produced on every run.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


In [11]:
# Standardize the features. The scaler's statistics are learned from the
# training split only; the same fitted scaler is then applied to both splits
# so the test data never influences the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# Fit an ordinary least-squares linear regression on the scaled training
# features, then report the learned intercept and per-feature coefficients.
# NOTE(review): the previewed 'Outcome' values are only 0 and 1, so the target
# looks binary - confirm that a linear regressor (rather than a classifier) is
# what this analysis intends.
model = LinearRegression().fit(X_train_scaled, Y_train)
print(f"Intercept: {model.intercept_}")
print(f"Coefficients: {model.coef_}")

Intercept: 0.34690553745928343
Coefficients: [ 0.03465559  0.1803234  -0.04219339  0.00820563 -0.03230381  0.11631364
  0.03744793  0.07425473]


In [12]:
# Generate predictions for the held-out test features.
predictions = model.predict(X_test_scaled)
print(f"Predictions: {predictions}")

# Compare: line the predictions up against the actual test targets. np.asarray
# drops Y_test's original row labels so both columns align by position.
# Only the first rows are shown to keep the output short.
comparison = pd.DataFrame({
    "Actual": np.asarray(Y_test),
    "Predicted": predictions,
})
print(comparison.head(10))

Predictions: [ 0.33550028  0.23809869  0.1510522   0.2401365   0.48142376  0.45257375
 -0.17450469  0.60662287  0.52417796  0.70476953  0.32360466  0.85290601
  0.38466612  0.36056948  0.09946712  0.41539557  0.17869123  0.07782301
  0.80730861  0.51299477  0.28090594  0.08303057  0.5099157   0.11381771
  0.51325022  0.82528549  0.17892718 -0.0594202   0.28338572  0.16407949
  0.83851225  0.80737515  0.68154389  0.7649502   0.56140297  0.62123131
  1.06134554  0.30990775  0.51752336  0.63691482  0.07075333  0.57757007
  0.55015462  0.37541745 -0.07644182  0.50119208  0.59600162  0.27464761
  0.42477995  0.9941898   0.00969584  0.61763578  0.73395288  0.31090975
  0.13456812 -0.02536316  0.71219147 -0.30518218  0.41994556  0.67869594
  0.66891428  0.3798452   0.2956646   0.288035    0.06813053  0.55464338
  0.01368504  0.6272007  -0.02033281  0.6372293   0.61928494  0.07019372
  0.26388322  0.14080565  0.12425109  0.50054317  0.24772661  0.21027229
  0.18419241  0.28346361  0.60206367  

In [13]:
# Score the model on the held-out data: R-squared of the test-set predictions
# against the true test targets.
r_squared = r2_score(y_true=Y_test, y_pred=predictions)
print(f"R-squared: {r_squared}")

R-squared: 0.2550028117674177
