# Model Training and Evaluation
This notebook includes the training and evaluation of both linear and polynomial regression models.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import joblib

## Data Wrangling

In [2]:
# Load the dataset
data = pd.read_csv('2.Model_Development/2.1.Data_Wrangling/2.1.2.Diabeties_Sample_Data.csv')

# Handle missing values
data = data.dropna(subset=[ 'SEX', 'Target'])

# Remove all females
data = data[data['SEX'] == 'Male']

# Convert DoB and DoT to datetime
data['DoB'] = pd.to_datetime(data['DoB'], format='%d/%m/%Y')
data['DoT'] = pd.to_datetime(data['DoT'], format='%d/%m/%Y')

# Calculate AGE
data['AGE'] = (data['DoT'] - data['DoB']).dt.days / 365.25

# Drop original DoB and DoT columns
data = data.drop(columns=['DoB', 'DoT'])

# Handle missing values
data = data.dropna()

# Convert categorical variables to numerical values
data['SEX'] = data['SEX'].map({'Female': 0, 'Male': 1})

# Remove outliers using IQR
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
data = data[~((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=1)]

# Save the new dataset
new_data_path = '/workspaces/2025SE_ElliottP_MLOPS/2.Model_Development/2.1.Data_Wrangling/2.1.2.Diabeties_Sample_Data_with_Age.csv'
data.to_csv(new_data_path, index=False)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,SEX,BMI,BP,TC,BGU,FDR,Target,AGE
2,1,18.5,87.0,2.67,80,2,90.0,42.992471
3,1,18.6,97.0,2.0,83,2,101.0,21.976728
4,1,18.8,78.0,2.0,86,2,85.0,23.080082
5,1,18.8,83.0,3.0,69,2,51.0,26.110883
9,1,19.2,87.0,2.0,90,0,137.0,19.353867


In [3]:
# Split the data into features and target
X = data[['BMI', 'FDR']]
y = data['Target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression Model

In [4]:
# Train the linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the test set
y_pred_linear = linear_model.predict(X_test)

# Calculate error metrics
mse_linear = mean_squared_error(y_test, y_pred_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print(f'Linear Regression MSE: {mse_linear}')
print(f'Linear Regression MAE: {mae_linear}')
print(f'Linear Regression R2: {r2_linear}')

Linear Regression MSE: 4062.354996581749
Linear Regression MAE: 53.29926075318151
Linear Regression R2: 0.31507375150346373


# Student Performance Prediction Model
## Data Wrangling and Model Development

### 1. Import Required Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

### 2. Load and Examine Data

In [6]:
# Load the dataset
data = pd.read_csv('data/dataset.csv', comment='%')

# Display basic information about the dataset
print("Dataset Info:")
print(data.info())

# Display first few rows
print("\nFirst few rows:")
print(data.head())

ParserError: Error tokenizing data. C error: Expected 1 fields in line 76, saw 31


### 3. Data Preprocessing

In [None]:
def preprocess_data(df):
    # Create copy of dataframe
    df_processed = df.copy()
    
    # Convert binary variables to numeric
    binary_features = ['schoolsup', 'famsup', 'paid', 'activities', 
                      'nursery', 'higher', 'internet', 'romantic']
    
    label_encoder = LabelEncoder()
    for feature in binary_features:
        df_processed[feature] = label_encoder.fit_transform(df_processed[feature])
    
    # Handle categorical variables
    df_processed = pd.get_dummies(df_processed, columns=['school', 'sex', 'address',
                                                       'famsize', 'Pstatus', 'Mjob',
                                                       'Fjob', 'reason', 'guardian'])
    
    return df_processed

# Apply preprocessing
processed_data = preprocess_data(data)

### 4. Feature Selection and Engineering

In [None]:
# Select relevant features for prediction
features = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
           'absences', 'G3'] # Add more relevant features

# Split features and target
X = processed_data.drop('G3', axis=1)  # Features
y = processed_data['G3']  # Target variable

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

### 5. Exploratory Data Analysis

In [None]:
# Correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(processed_data[features].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Features')
plt.show()

# Distribution of target variable
plt.figure(figsize=(10, 6))
sns.histplot(data=processed_data, x='G3', bins=20)
plt.title('Distribution of Final Grades')
plt.show()

### 6. Train-Test Split

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

### 7. Model Development (Placeholder)

In [None]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

### 8. Model Evaluation and Visualization

In [None]:
# Plot actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Grades')
plt.ylabel('Predicted Grades')
plt.title('Actual vs Predicted Student Grades')
plt.show()