In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Linear Regression & SVM Regressor

## Bladder Data

In [2]:
# Load the bladder dataset
bladder_path = "datasets/bladder_no_geo.csv"
bladder_df = pd.read_csv(bladder_path)

# Preprocess: drop the columns that are not used as features, then replace
# missing values with 0. Chained (non-inplace) so the cleaned frame is built
# in a single expression from the freshly loaded data.
bladder_df = bladder_df.drop(columns=["Unnamed: 0.1", "geo_code"]).fillna(0)

# Separate the predictor columns from the target column
X = bladder_df.drop(columns=['county_count'])
y = bladder_df['county_count']

# Hold out 25% of the rows for evaluation (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=11)

# Standardize features for SVR; the scaler is fit on the training rows only and
# then applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create models
linreg_model = LinearRegression()
# random_state pins the solver's internal data shuffling so that repeated runs
# of this cell report the same SVR scores (same seed as the split above)
svr_model = LinearSVR(max_iter=5000, random_state=11)

# Fit: linear regression on the raw features, SVR on the standardized ones
linreg_model.fit(X_train, y_train)
svr_model.fit(X_train_scaled, y_train)

# Predict on the held-out rows, using the matching feature representation
y_pred_linreg = linreg_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test_scaled)

# Evaluate Linear Regression
mse_linreg = mean_squared_error(y_test, y_pred_linreg)
mae_linreg = mean_absolute_error(y_test, y_pred_linreg)
r2_linreg = r2_score(y_test, y_pred_linreg)

# Evaluate SVR
mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# NOTE(review): the output recorded for this cell shows strongly negative R2
# for both models, i.e. worse than always predicting the mean of y_test.
# Possible causes to check (not confirmed from this notebook): a leftover
# index-like column among the features, or the fill-with-0 imputation above.

# Print results
print("Linear Regression Performance\n")
print(f"Mean Squared Error: {mse_linreg}")
print(f"Mean Absolute Error: {mae_linreg}")
print(f"R2 Score: {r2_linreg}")

print("\nLinear SVR Performance\n")
print(f"Mean Squared Error: {mse_svr}")
print(f"Mean Absolute Error: {mae_svr}")
print(f"R2 Score: {r2_svr}")



Linear Regression Performance

Mean Squared Error: 381483960664.73474
Mean Absolute Error: 4693.852534591085
R2 Score: -190445845891.00754

Linear SVR Performance

Mean Squared Error: 33454786.410907477
Mean Absolute Error: 43.649461569467256
R2 Score: -16701422.268385118




## Lung Data

In [3]:
# Load the lung dataset
lung_path = "datasets/lung_no_geo.csv"
lung_df = pd.read_csv(lung_path)

# Preprocess: drop the columns that are not used as features, then replace
# missing values with 0. Chained (non-inplace) so the cleaned frame is built
# in a single expression from the freshly loaded data.
lung_df = lung_df.drop(columns=["Unnamed: 0.1", "geo_code"]).fillna(0)

# Separate the predictor columns from the target column
X = lung_df.drop(columns=['county_count'])
y = lung_df['county_count']

# Hold out 25% of the rows for evaluation (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=11)

# Standardize features for SVR; the scaler is fit on the training rows only and
# then applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create models
linreg_model = LinearRegression()
# random_state pins the solver's internal data shuffling so that repeated runs
# of this cell report the same SVR scores (same seed as the split above)
svr_model = LinearSVR(max_iter=5000, random_state=11)

# Fit: linear regression on the raw features, SVR on the standardized ones
linreg_model.fit(X_train, y_train)
svr_model.fit(X_train_scaled, y_train)

# Predict on the held-out rows, using the matching feature representation
y_pred_linreg = linreg_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test_scaled)

# Evaluate Linear Regression
mse_linreg = mean_squared_error(y_test, y_pred_linreg)
mae_linreg = mean_absolute_error(y_test, y_pred_linreg)
r2_linreg = r2_score(y_test, y_pred_linreg)

# Evaluate SVR
mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

# NOTE(review): the output recorded for this cell shows strongly negative R2
# for both models, i.e. worse than always predicting the mean of y_test.
# Possible causes to check (not confirmed from this notebook): a leftover
# index-like column among the features, or the fill-with-0 imputation above.

# Print results
print("Linear Regression Performance\n")
print(f"Mean Squared Error: {mse_linreg}")
print(f"Mean Absolute Error: {mae_linreg}")
print(f"R2 Score: {r2_linreg}")

print("\nLinear SVR Performance\n")
print(f"Mean Squared Error: {mse_svr}")
print(f"Mean Absolute Error: {mae_svr}")
print(f"R2 Score: {r2_svr}")



Linear Regression Performance

Mean Squared Error: 4542530511369.597
Mean Absolute Error: 15917.479037323328
R2 Score: -278897502059.74005

Linear SVR Performance

Mean Squared Error: 852140232.9073566
Mean Absolute Error: 218.983184813448
R2 Score: -52318807.15516269


