# 2) Regression
## 1. Prepare Dataset
### 1.1 Import libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

### 1.2 Import Dataset

In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI repository dataset with id 183, then unpack the
# feature table and the target table in a single step.
communities_and_crime = fetch_ucirepo(id=183)

X, y = (
    communities_and_crime.data.features,
    communities_and_crime.data.targets,
)

### 1.3 Dataset Cleaning
#### 1.3.1 Drop the non-numeric 'communityname' column


In [None]:
# 'communityname' is a text label, not a numeric predictor, so remove it.
X = X.drop('communityname', axis='columns')

#### 1.3.2 Replace non-numeric placeholders (e.g., '?') with NaN

In [None]:
# Turn every '?' placeholder into NaN so the imputers can see it as missing.
X = X.replace(to_replace='?', value=np.nan)


#### 1.3.3 Convert all columns to numeric where possible

In [None]:
# Convert column by column; anything that cannot be parsed becomes NaN.
X = X.apply(lambda column: pd.to_numeric(column, errors='coerce'))


### 1.4 Imputation of missing values
#### 1.4.1 Mean Imputation
##### Missing values in the dataset are filled using the mean of each column. 

In [None]:
# Learn each column's mean, then fill the NaNs with it.
# NOTE(review): the imputer is fitted on the full dataset, before any
# train/test split is made further down.
imputer_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer_mean.fit(X)
X_imputed_mean = pd.DataFrame(data=imputer_mean.transform(X), columns=X.columns)


#### 1.4.2 KNN Imputation
##### KNN imputation is used to fill missing values by considering the nearest neighbors. This method can provide a more accurate estimate than mean imputation by considering the relationships between variables.

In [None]:
# Fill each NaN from the 5 nearest rows.
# NOTE(review): the imputer is fitted on the full dataset, before any
# train/test split is made further down.
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputer.fit(X)
X_imputed_knn = pd.DataFrame(data=knn_imputer.transform(X), columns=X.columns)


### 1.5 Save imputed datasets

In [None]:
# Write both imputed feature tables to disk without the row index.
for imputed_frame, csv_path in (
    (X_imputed_mean, 'CandC-imputed-mean.csv'),
    (X_imputed_knn, 'CandC-imputed-knn.csv'),
):
    imputed_frame.to_csv(csv_path, index=False)
print("Imputed datasets saved.")

## 2. Regression Models and Cross-Validation
### 2.1 Function to Evaluate Model Performance
##### This function evaluates the performance of a regression model using 5 different 80-20 train-test splits. It calculates the Mean Squared Error (MSE) for each split, saves the train-test data and target splits, and returns the average MSE and model parameters.

In [None]:
# Function to perform 5 different 80-20 splits, calculate MSE, and save parameters

def evaluate_model(model, X, y, model_name, imputation_method,
                   n_splits=5, test_size=0.2, save_splits=True):
    """Evaluate a regression model over repeated random train/test splits.

    For each split the model is refitted on the training part and scored
    with mean squared error on the held-out part. This is repeated random
    hold-out validation, not k-fold cross-validation: the test sets of
    different splits may overlap.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place, so after the call it holds the fit from the last split.
        X: Feature table (a DataFrame; ``to_csv`` is called on its splits).
        y: Targets aligned row-for-row with ``X``.
        model_name: Label used in the printed report.
        imputation_method: Tag used in the report and in the CSV file names.
        n_splits: Number of random splits; split ``k`` uses
            ``random_state=k`` so every model sees the same partitions.
        test_size: Passed through to ``train_test_split``.
        save_splits: When true, write each split's train/test features and
            targets to ``CandC-*-{imputation_method}-{k}.csv``.

    Returns:
        Tuple ``(avg_mse, parameters)`` where ``parameters`` holds one copy
        of ``model.coef_`` per split (``None`` if the model has no
        ``coef_`` attribute).

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    mse_scores = []
    parameters = []

    for split_idx in range(n_splits):
        # The seed equals the split index, so splits are reproducible and
        # identical across models and imputation methods.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_idx
        )

        model.fit(X_train, y_train)
        mse_scores.append(mean_squared_error(y_test, model.predict(X_test)))

        # Store a copy so a later refit can never alias an earlier entry;
        # tolerate estimators that do not expose coefficients.
        coef = getattr(model, "coef_", None)
        parameters.append(None if coef is None else np.array(coef, copy=True))

        if save_splits:
            tag = f'{imputation_method}-{split_idx + 1}'
            X_train.to_csv(f'CandC-train-{tag}.csv', index=False)
            X_test.to_csv(f'CandC-test-{tag}.csv', index=False)
            pd.DataFrame(y_train).to_csv(f'CandC-train-targets-{tag}.csv', index=False)
            pd.DataFrame(y_test).to_csv(f'CandC-test-targets-{tag}.csv', index=False)

    avg_mse = np.mean(mse_scores)
    print(f"{model_name} with {imputation_method} test MSE over {n_splits} random train-test splits: {mse_scores}")
    print(f"Average MSE for {model_name} with {imputation_method}: {avg_mse}")
    return avg_mse, parameters


### 2.2 Linear Regression with Mean and KNN Imputed Data
##### Linear regression is applied to both mean-imputed and KNN-imputed datasets. The function calculates the average MSE and saves the learned parameters.

In [None]:
# Ordinary least squares, scored once per imputation strategy.
lin_reg = LinearRegression()
avg_mse_mean, params_mean = evaluate_model(
    model=lin_reg,
    X=X_imputed_mean,
    y=y,
    model_name="Linear Regression",
    imputation_method="mean",
)
avg_mse_knn, params_knn = evaluate_model(
    model=lin_reg,
    X=X_imputed_knn,
    y=y,
    model_name="Linear Regression",
    imputation_method="knn",
)


### 2.3 Ridge Regression with Multiple Lambda Values
##### Ridge regression is evaluated for a range of lambda (λ) values. The average MSE is calculated for each λ and plotted to identify the best value of λ.

In [None]:
# Sweep Ridge over ten log-spaced lambda values from 1e-4 to 1e4,
# evaluating each on the KNN-imputed features.
lambdas = np.logspace(-4, 4, num=10)

ridge_results = [
    evaluate_model(Ridge(alpha=lam), X_imputed_knn, y, f"Ridge Regression (λ={lam})", "knn")
    for lam in lambdas
]

# Separate the (average MSE, parameters) pairs into two parallel lists.
ridge_mse_scores = [mse for mse, _ in ridge_results]
ridge_parameters = [coefs for _, coefs in ridge_results]


### 2.4 Plotting Ridge Regression Results
##### A plot is created to visualize the relationship between the regularization parameter λ and the MSE, aiding in the selection of the best λ for Ridge Regression.

In [None]:

# Average test MSE against lambda, drawn on a logarithmic x-axis.
fig, ax = plt.subplots()
ax.plot(lambdas, ridge_mse_scores, marker='o')
ax.set_xscale('log')
ax.set_xlabel('Lambda (α)')
ax.set_ylabel('Average Test MSE')
ax.set_title('Ridge Regression: MSE vs Lambda')
plt.show()


### 2.5 Selecting the Best Lambda for Ridge Regression
##### The best λ value is selected based on the minimum MSE observed in the Ridge Regression results.

In [None]:

# The winning lambda is the one at the position of the smallest average MSE.
best_index = int(np.argmin(ridge_mse_scores))
best_lambda_ridge = lambdas[best_index]
print("Best lambda for Ridge Regression:", best_lambda_ridge)


### 2.6 Lasso Regression for Feature Selection
#####  Lasso regression is used to perform feature selection, reducing the number of features based on the non-zero coefficients. The MSE of the Lasso model is also calculated and reported.

In [None]:
# Lasso Regression with its own lambda.
# Ridge's best lambda is not reused here: the same value can push every
# Lasso coefficient to zero. Each candidate is scored with the same
# repeated-split protocol used for Ridge, so the reported number is a
# held-out test MSE rather than an error on the training data.
lasso_mse_scores = [
    # max_iter is raised above the default to help the small-lambda fits converge.
    evaluate_model(
        Lasso(alpha=lam, max_iter=10000),
        X_imputed_knn,
        y,
        f"Lasso Regression (λ={lam})",
        "knn",
    )[0]
    for lam in lambdas
]

best_lasso_index = int(np.argmin(lasso_mse_scores))
best_lambda_lasso = lambdas[best_lasso_index]
lasso_mse = lasso_mse_scores[best_lasso_index]
print("Best lambda for Lasso Regression:", best_lambda_lasso)
print("Lasso Regression Test MSE:", lasso_mse)

# Refit on all rows with the chosen lambda; this fit is used only to decide
# which features survive. y_pred_lasso therefore holds in-sample predictions.
lasso = Lasso(alpha=best_lambda_lasso, max_iter=10000)
lasso.fit(X_imputed_knn, y)
y_pred_lasso = lasso.predict(X_imputed_knn)

# Reporting number of selected features by Lasso (non-zero coefficients)
selected_features = np.sum(lasso.coef_ != 0)
print(f"Number of selected features by Lasso: {selected_features}")


### 2.7 Linear Regression with Reduced Feature Set
##### After feature selection via Lasso, a linear regression model is fitted using only the selected features. The performance is evaluated and compared to the full model.

In [None]:
# Refit Linear Regression using only the features Lasso kept, i.e. those
# with a non-zero coefficient. np.ravel flattens the coefficients so the
# mask is 1-D even if coef_ comes back as (1, n_features); this assumes a
# single target column.
selected_mask = np.ravel(lasso.coef_) != 0
if not selected_mask.any():
    # Fail with a clear message instead of an obscure error from fitting
    # a model on a table with zero columns.
    raise ValueError(
        "Lasso selected no features; use a smaller lambda before refitting."
    )

selected_columns = X_imputed_knn.columns[selected_mask]
X_reduced = X_imputed_knn[selected_columns]
lin_reg_reduced = LinearRegression()
# Keep the result so it can be compared with the full-feature model.
avg_mse_reduced, params_reduced = evaluate_model(
    lin_reg_reduced, X_reduced, y, "Linear Regression with Reduced Features", "reduced"
)


### 2.8 Saving Parameters for Each Model
##### The learned parameters for each model (Linear Regression with mean and KNN imputation, and Ridge Regression) are printed for reference.

In [None]:

# Print the per-split coefficients collected for each model family.
for report_label, learned_params in (
    ("Learned parameters for Linear Regression (Mean Imputed):", params_mean),
    ("Learned parameters for Linear Regression (KNN Imputed):", params_knn),
    ("Learned parameters for Ridge Regression:", ridge_parameters),
):
    print(report_label, learned_params)
