In [None]:
# Import necessary libraries
import pandas as pd                     # For data manipulation
import seaborn as sns                   # For pairwise visualization
import matplotlib.pyplot as plt         # For plotting
from sklearn.linear_model import LinearRegression  # Linear regression model
from sklearn.preprocessing import StandardScaler   # For feature scaling
from sklearn.model_selection import train_test_split  # Train-test split
from sklearn.metrics import r2_score, mean_squared_error

# Step 1: Read the dataset into a DataFrame
# The relative path means 'dataset.csv' is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='dataset.csv')


In [None]:
# Step 2: Visual exploration
# Draw a grid of pairwise plots for the columns of `data`.
sns.pairplot(data)

# Title and layout are applied to the current figure, which is the same
# figure the pyplot-level suptitle/tight_layout helpers would act on.
pair_fig = plt.gcf()
pair_fig.suptitle('Pairwise Feature Relationships', y=1.02)  # y > 1 places the title above the grid
pair_fig.tight_layout()
plt.show()

In [None]:
# Step 3: Split the table into predictors and response
X = (
    data
    .drop(columns=['CCS'])  # every column other than 'CCS' is a predictor
    .values
)
Y = data['CCS'].values      # the 'CCS' column is the response

# Step 4: Keep 80% of the rows for training and hold out the rest for testing.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.8, random_state=42
)

# Step 5: Standardize the predictors
# The scaler is fitted on the training rows only and the same transformation
# is then applied to both partitions, so no test-set statistics enter the fit.
# Note: standardizing mainly puts the coefficients on a comparable scale; it
# becomes important for the fit itself if a regularized model is used later.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 6: Fit an ordinary least squares model on the scaled training data
reg_model = LinearRegression().fit(X_train_scaled, y_train)

In [None]:
# Step 7: Run the fitted model on the scaled training and test features
y_train_pred = reg_model.predict(X_train_scaled)
y_test_pred = reg_model.predict(X_test_scaled)

# Step 8: Score the predictions against the observed targets
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

# One metric per line, each rounded to three decimals
print(
    f"Train R² Score: {r2_train:.3f}",
    f"Test R² Score: {r2_test:.3f}",
    f"Test MSE: {mse_test:.3f}",
    sep="\n",
)

# Step 9: Parity plot (Actual vs Predicted) -- common value range
# The overall minimum and maximum across the actual targets and both sets of
# predictions give the endpoints of the 1:1 reference line.
_parity_arrays = (Y, y_train_pred, y_test_pred)
min_val = min(arr.min() for arr in _parity_arrays)
max_val = max(arr.max() for arr in _parity_arrays)

In [None]:
def plot_parity(y_true, y_pred, set_name, r2, lo, hi, color=None):
    """Draw and show a parity (actual vs. predicted) scatter plot for one split.

    Parameters
    ----------
    y_true : array-like
        Observed target values, plotted on the x-axis.
    y_pred : array-like
        Predictions for the same samples, plotted on the y-axis.
    set_name : str
        Name of the split (e.g. 'Train'); used as the legend label and in the title.
    r2 : float
        R² score shown in the title, formatted to three decimals.
    lo, hi : float
        Endpoints of the dashed 1:1 reference line.
    color : optional
        Marker color forwarded to ``Axes.scatter``; ``None`` keeps
        matplotlib's default color.
    """
    _, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_true, y_pred, alpha=0.6, color=color, label=set_name)
    # Points on this diagonal are predicted exactly.
    ax.plot([lo, hi], [lo, hi], 'k--', label='1:1 line')
    ax.set_xlabel('Actual CCS')
    ax.set_ylabel('Predicted CCS')
    ax.set_title(f'{set_name} Set: R² = {r2:.3f}')
    ax.legend()
    ax.grid(True)
    plt.show()


# Training plot
plot_parity(y_train, y_train_pred, 'Train', r2_train, min_val, max_val)

# Test plot
plot_parity(y_test, y_test_pred, 'Test', r2_test, min_val, max_val, color='orange')