In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [25]:
# Step 1: Load the datasets
train_data = pd.read_csv(r"D:\Final project\current health regression model\p1_train.csv")
test_data = pd.read_csv(r"D:\Final project\current health regression model\p1_test.csv")

In [26]:
train_data.isnull().sum()

-7.262173392018990370e+00    0
9.572603824406265005e+00     0
5.358725498169498280e+00     0
dtype: int64

In [27]:
test_data.isnull().sum()

1.589300268390259419e+01     0
1.171282902260990966e+01     0
-3.756792885773750612e+01    0
dtype: int64

In [28]:
# Check the structure and basic statistics of the datasets
print("Training Data Info:")
print(train_data.info())
print("\nTest Data Info:")
print(test_data.info())

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   -7.262173392018990370e+00  9999 non-null   float64
 1   9.572603824406265005e+00   9999 non-null   float64
 2   5.358725498169498280e+00   9999 non-null   float64
dtypes: float64(3)
memory usage: 234.5 KB
None

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   1.589300268390259419e+01   4999 non-null   float64
 1   1.171282902260990966e+01   4999 non-null   float64
 2   -3.756792885773750612e+01  4999 non-null   float64
dtypes: float64(3)
memory usage: 117.3 KB
None


In [29]:
# Step 3: Data Preparation
X_train = train_data.iloc[:, :-1]  # Features are all columns except the last one
y_train = train_data.iloc[:, -1]   # Target variable is the last column

In [30]:
X_train

Unnamed: 0,-7.262173392018990370e+00,9.572603824406265005e+00
0,3.462140,10.684524
1,-12.996801,-3.446163
2,7.083537,-14.074146
3,-15.216890,-18.630651
4,6.841930,-8.080967
...,...,...
9994,16.981033,-7.377274
9995,0.258434,-19.488764
9996,17.588664,11.326009
9997,0.531756,-15.189405


In [31]:
y_train

0      -13.275822
1       31.815190
2        1.995030
3       48.812452
4       -0.481346
          ...    
9994   -21.776089
9995    22.098571
9996   -47.812604
9997    20.791085
9998   -20.735447
Name: 5.358725498169498280e+00, Length: 9999, dtype: float64

In [32]:
X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

In [33]:
X_test

Unnamed: 0,1.589300268390259419e+01,1.171282902260990966e+01
0,-2.722421,-4.540615
1,-14.587111,18.592864
2,0.224331,16.433306
3,-12.215286,11.742682
4,17.213737,-13.958878
...,...,...
4994,-14.424996,7.055352
4995,-12.249485,12.226991
4996,-19.242759,-18.218209
4997,-13.561869,2.773022


In [34]:
y_test

0       11.475211
1       11.468441
2      -15.159309
3       17.392223
4      -18.291343
          ...    
4994    21.402087
4995    14.971488
4996    56.687193
4997    26.627790
4998    11.868025
Name: -3.756792885773750612e+01, Length: 4999, dtype: float64

In [35]:
print("Columns in X_train:", X_train.columns)
print("Columns in X_test:", X_test.columns)

Columns in X_train: Index(['-7.262173392018990370e+00', '9.572603824406265005e+00'], dtype='object')
Columns in X_test: Index(['1.589300268390259419e+01', '1.171282902260990966e+01'], dtype='object')


In [36]:
X_train.shape

(9999, 2)

In [37]:
X_test.shape

(4999, 2)

In [42]:
# Ensure test data has the same columns as training data
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0

X_test = X_test[X_train.columns]  # Reorder test columns to match training columns

In [43]:
# Step 4: Model Selection and Training

# Initialize Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

In [48]:
# Initialize Support Vector Regression (SVR) model
svr_reg = SVR(kernel='linear')  # we can choose different kernels: 'linear', 'poly', 'rbf', etc.
svr_reg.fit(X_train, y_train)

In [49]:
# Step 5: Model Evaluation
# Predict on test data
y_pred_lr = linear_reg.predict(X_test)
y_pred_svr = svr_reg.predict(X_test)

In [50]:
# Step 5: Model Evaluation Metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)

In [52]:
# Step 6: Print Metrics
print("Linear Regression Metrics:")
print(f"Mean Squared Error (MSE): {mse_lr}")
print(f"Mean Absolute Error (MAE): {mae_lr}")

print("\nSVR Metrics:")
print(f"Mean Squared Error (MSE): {mse_svr}")
print(f"Mean Absolute Error (MAE): {mae_svr}")

Linear Regression Metrics:
Mean Squared Error (MSE): 688.4626474348057
Mean Absolute Error (MAE): 22.061223967278977

SVR Metrics:
Mean Squared Error (MSE): 688.4673879734054
Mean Absolute Error (MAE): 22.061228536978295
