<a href="https://colab.research.google.com/github/comparativechrono/GMO7-Jupyter/blob/main/MRes/M3/Python_regression_normalised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Read the toy clinical/genomic table from the Colab working directory
file_path = '/content/clinical_genomic_toy_data.csv'
data = pd.read_csv(file_path)

# Split the table: the response column becomes y, every other column stays in X
y = data.loc[:, 'Disease_Progression']
X = data.drop('Disease_Progression', axis=1)

# Keep only a handful of predictors so the plots and the coefficient table
# below stay small enough to read
sample_features = ['Gene_1', 'Family_History', 'Age', 'BMI']
X_sample = X.loc[:, sample_features]

# Draw a histogram with a KDE overlay for each selected feature on its raw
# scale, arranged as panels of a 2x2 grid
plt.figure(figsize=(12, 6))
for panel, feature in enumerate(sample_features, start=1):
    axis = plt.subplot(2, 2, panel)
    sns.histplot(data=X_sample, x=feature, kde=True, ax=axis)
    axis.set_title(f"Original Distribution of {feature}")
plt.tight_layout()
plt.show()

# Apply 0-1 (min-max) normalisation: every feature is rescaled independently
# using its own observed minimum and maximum.
scaler = MinMaxScaler()
# fit_transform returns a bare NumPy array, so restore both the column names
# and the original row index; otherwise the scaled frame would get a fresh
# 0..n-1 index and could silently misalign with X_sample / y in any
# index-based operation.
X_sample_scaled = pd.DataFrame(
    scaler.fit_transform(X_sample),
    columns=sample_features,
    index=X_sample.index,
)

# Plot the feature distributions after min-max scaling.
# X_sample_scaled was produced with MinMaxScaler (0-1 normalisation), not
# z-score standardisation, so the panels are labelled accordingly.
plt.figure(figsize=(12, 6))
for i, feature in enumerate(sample_features, 1):
    plt.subplot(2, 2, i)
    sns.histplot(X_sample_scaled[feature], kde=True)
    plt.title(f"Min-Max Scaled Distribution of {feature}")
plt.tight_layout()
plt.show()

# Train one linear regression per representation of the same features:
# first on the raw values, then on the min-max scaled values
lin_reg_original = LinearRegression().fit(X_sample, y)
lin_reg_scaled = LinearRegression().fit(X_sample_scaled, y)

# Put both coefficient vectors side by side, one row per feature, so the
# effect of rescaling on coefficient magnitude can be read directly
coefficient_columns = {
    'Feature': sample_features,
    'Original Coefficients': lin_reg_original.coef_,
    'Scaled Coefficients': lin_reg_scaled.coef_,
}
coefficients_df = pd.DataFrame(coefficient_columns)

print(coefficients_df)
