In [2]:
import pandas as pd
import os

# --- Configuration ---
INPUT_CSV_FILE = 'cgpa_data.csv'
OUTPUT_CSV_FILE = 'preprocessed_cgpa_data.csv'

# --- Functions ---
def preprocess_cgpa_data(input_path, output_path):
    """
    Loads, preprocesses, and saves the CGPA dataset.

    Args:
        input_path (str): Path to the input CSV file.
        output_path (str): Path to save the preprocessed CSV file.
    """
    print(f"--- Starting Preprocessing ---")
    print(f"Loading data from: {input_path}")

    # Check if input file exists
    if not os.path.exists(input_path):
        print(f"Error: Input file not found at {input_path}")
        # Create a dummy file for demonstration if it doesn't exist
        print(f"Creating a dummy {input_path} for demonstration purposes.")
        dummy_data = """StudentID,Age,Gender,HoursOfStudyPerDay,SchoolAttendanceRate,TuitionAccess,CGPA,AveragePreviousScores,HoursOfSleep,BreakfastDaily,ScreenTimeHours,PhysicalActivityHours,PlaysSport,MentalHealthScore,ParentalEducationLevel,HouseholdIncomeLevel,StudyEnvironmentRating,FriendSupportScore,ParticipatesInClubs,PartTimeWork
201,19,Female,5.2,98.0,Yes,9.6,92,7.2,Yes,1.5,1.8,Yes,9,Postgrad,High,5,9,Yes,No
302,20,Female,2.8,80.0,No,7.1,75,7.2,No,4.0,0.5,No,5,Graduate,Low,2,5,No,Yes
353,18,Other,3.6,85.0,Yes,7.5,80,6.8,Yes,3.5,1.0,Yes,7,High school,Medium,4,6,Yes,No
451,19,Male,1.2,45.0,No,5.5,55,4.5,No,8.0,0.0,No,3,High school,Low,1,2,No,Yes
"""
        with open(input_path, 'w') as f:
            f.write(dummy_data)
        # return # Stop execution if the original file wasn't found

    try:
        df = pd.read_csv(input_path)
        print(f"Successfully loaded {len(df)} rows and {len(df.columns)} columns.")
        print("\nOriginal Data Info:")
        df.info()
        print("\nOriginal Data Head:")
        print(df.head())

        # 1. Check for Missing Values
        print("\n--- Checking for Missing Values ---")
        missing_values = df.isnull().sum()
        print(missing_values[missing_values > 0])
        if missing_values.sum() == 0:
            print("No missing values found.")
        else:
            print("Warning: Missing values detected. Consider imputation strategies.")
            # For this script, we won't impute, but you might add logic here.

        # 2. Drop Identifier Column
        print("\n--- Dropping StudentID Column ---")
        if 'StudentID' in df.columns:
            df = df.drop('StudentID', axis=1)
            print("Dropped 'StudentID' column.")
        else:
            print("'StudentID' column not found.")

        # 3. Encode Binary Categorical Features ('Yes'/'No')
        print("\n--- Encoding Binary Features (Yes/No to 1/0) ---")
        binary_cols = ['TuitionAccess', 'BreakfastDaily', 'PlaysSport', 'ParticipatesInClubs', 'PartTimeWork']
        for col in binary_cols:
            if col in df.columns:
                print(f"Encoding '{col}'...")
                df[col] = df[col].map({'Yes': 1, 'No': 0})
                # Check if mapping worked, handle potential non-'Yes'/'No' values if needed
                if df[col].isnull().any():
                     print(f"Warning: Nulls created during mapping '{col}'. Check original values.")
            else:
                print(f"Warning: Binary column '{col}' not found.")

        # 4. Encode Ordinal Categorical Features
        print("\n--- Encoding Ordinal Features ---")
        # Define orderings (adjust if necessary based on unique values)
        parental_edu_order = {'High school': 0, 'Graduate': 1, 'Postgrad': 2}
        income_order = {'Low': 0, 'Medium': 1, 'High': 2}

        if 'ParentalEducationLevel' in df.columns:
            print("Encoding 'ParentalEducationLevel'...")
            df['ParentalEducationLevel_Encoded'] = df['ParentalEducationLevel'].map(parental_edu_order)
            df = df.drop('ParentalEducationLevel', axis=1) # Drop original
            if df['ParentalEducationLevel_Encoded'].isnull().any():
                     print(f"Warning: Nulls created during mapping 'ParentalEducationLevel'. Check original values.")
        else:
            print("Warning: Ordinal column 'ParentalEducationLevel' not found.")

        if 'HouseholdIncomeLevel' in df.columns:
            print("Encoding 'HouseholdIncomeLevel'...")
            df['HouseholdIncomeLevel_Encoded'] = df['HouseholdIncomeLevel'].map(income_order)
            df = df.drop('HouseholdIncomeLevel', axis=1) # Drop original
            if df['HouseholdIncomeLevel_Encoded'].isnull().any():
                     print(f"Warning: Nulls created during mapping 'HouseholdIncomeLevel'. Check original values.")
        else:
            print("Warning: Ordinal column 'HouseholdIncomeLevel' not found.")


        # 5. Encode Nominal Categorical Features (One-Hot Encoding)
        print("\n--- Encoding Nominal Features (One-Hot) ---")
        if 'Gender' in df.columns:
            print("Encoding 'Gender'...")
            df = pd.get_dummies(df, columns=['Gender'], prefix='Gender', drop_first=False) # drop_first=True avoids multicollinearity if needed
            print("Created columns:", [col for col in df.columns if col.startswith('Gender_')])
        else:
             print("Warning: Nominal column 'Gender' not found.")

        # --- Verification ---
        print("\n--- Verifying Preprocessed Data ---")
        print("\nPreprocessed Data Info:")
        df.info()
        print("\nPreprocessed Data Head:")
        print(df.head())
        print("\nPreprocessed Data Description (Numerical Columns):")
        print(df.describe())

        # --- Save Preprocessed Data ---
        print(f"\n--- Saving Preprocessed Data ---")
        df.to_csv(output_path, index=False)
        print(f"Preprocessed data saved successfully to: {output_path}")

    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")
    except KeyError as e:
        print(f"Error: Column {e} not found during processing. Check CSV headers.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    print(f"--- Preprocessing Finished ---")

# --- Run the Script ---
if __name__ == "__main__":
    # If cgpa_data.csv doesn't exist in the same directory,
    # this script will create a small dummy version and process it.
    # Replace 'cgpa_data.csv' if your file has a different name or path.
    preprocess_cgpa_data(INPUT_CSV_FILE, OUTPUT_CSV_FILE)

    print("\n--- Next Steps ---")
    print("1. Split the preprocessed data (e.g., using train_test_split from scikit-learn).")
    print("2. Apply Feature Scaling (e.g., StandardScaler) to numerical features *after* splitting (fit on train, transform train & test).")
    print("3. Select and train your machine learning model (e.g., Linear Regression, Random Forest, Gradient Boosting) to predict 'CGPA'.")
    print("4. Evaluate the model performance.")

--- Starting Preprocessing ---
Loading data from: cgpa_data.csv
Successfully loaded 300 rows and 20 columns.

Original Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   StudentID               300 non-null    int64  
 1   Age                     300 non-null    int64  
 2   Gender                  300 non-null    object 
 3   HoursOfStudyPerDay      300 non-null    float64
 4   SchoolAttendanceRate    300 non-null    float64
 5   TuitionAccess           300 non-null    object 
 6   CGPA                    300 non-null    float64
 7   AveragePreviousScores   300 non-null    int64  
 8   HoursOfSleep            300 non-null    float64
 9   BreakfastDaily          300 non-null    object 
 10  ScreenTimeHours         300 non-null    float64
 11  PhysicalActivityHours   300 non-null    float64
 12  PlaysSport        

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import numpy as np # For potential NaN handling if needed

# --- Configuration ---
PREPROCESSED_CSV_FILE = 'preprocessed_cgpa_data.csv'
TARGET_COLUMN = 'CGPA'
TEST_SET_SIZE = 0.2 # Use 20% of the data for testing
RANDOM_SEED = 42    # Ensures reproducible splits and results

# --- Main Execution ---
if __name__ == "__main__":
    print(f"--- Starting Data Splitting and Scaling ---")

    # 1. Load Data
    print(f"\nLoading data from: {PREPROCESSED_CSV_FILE}")
    if not os.path.exists(PREPROCESSED_CSV_FILE):
        print(f"Error: File not found at {PREPROCESSED_CSV_FILE}")
        # If the file doesn't exist, you might want to stop or handle it.
        # For demonstration, we'll exit if the file isn't there.
        exit()

    try:
        df = pd.read_csv(PREPROCESSED_CSV_FILE)
        print(f"Successfully loaded {len(df)} rows and {len(df.columns)} columns.")
        print("Original dtypes:\n", df.dtypes.head())


        # 2. Data Cleaning / Type Correction
        print("\n--- Correcting Data Types and Handling Potential Missing Values ---")

        # Convert boolean Gender columns to int (1/0) if they exist and are boolean
        gender_cols = [col for col in df.columns if col.startswith('Gender_')]
        corrected_gender = False
        for col in gender_cols:
            if df[col].dtype == 'bool':
                print(f"Converting boolean column '{col}' to integer (1/0)...")
                df[col] = df[col].astype(int)
                corrected_gender = True
        if corrected_gender:
             print("Gender columns converted.")
        elif gender_cols:
             print("Gender columns are already numeric.")
        else:
             print("No columns starting with 'Gender_' found.")

        # Handle potential missing values in 'PartTimeWork' (replace NaN with 0)
        if 'PartTimeWork' in df.columns:
            if df['PartTimeWork'].isnull().any():
                print("Found missing values in 'PartTimeWork'. Filling with 0 (assuming 'No').")
                df['PartTimeWork'] = df['PartTimeWork'].fillna(0)
                # Ensure the column is integer type after filling
                df['PartTimeWork'] = df['PartTimeWork'].astype(int)
            else:
                 print("'PartTimeWork' has no missing values.")
        else:
            print("Column 'PartTimeWork' not found.")

        # Verify all columns (except target) are numeric, coerce if necessary
        feature_columns = df.columns.difference([TARGET_COLUMN])
        for col in feature_columns:
             # Attempt conversion, log if errors occur
             try:
                  if not pd.api.types.is_numeric_dtype(df[col]):
                        print(f"Column '{col}' is not numeric. Attempting conversion...")
                        df[col] = pd.to_numeric(df[col], errors='coerce')
             except Exception as e:
                  print(f"Could not process column '{col}': {e}")

        # Check for NaNs that might have been introduced by coercion
        if df.isnull().any().any():
             print("\nWarning: Found NaN values after data cleaning/conversion.")
             print(df.isnull().sum()[df.isnull().sum() > 0])
             print("Dropping rows with NaN values for simplicity...")
             df = df.dropna() # Simple strategy: drop rows with any NaN
             print(f"Remaining rows after dropping NaNs: {len(df)}")
             if df.empty:
                  print("Error: DataFrame is empty after dropping NaN values. Check data quality.")
                  exit()


        # 3. Separate Features (X) and Target (y)
        print(f"\nSeparating features (X) and target (y = '{TARGET_COLUMN}')...")
        if TARGET_COLUMN not in df.columns:
            raise ValueError(f"Target column '{TARGET_COLUMN}' not found in the dataset.")

        y = df[TARGET_COLUMN]
        X = df.drop(TARGET_COLUMN, axis=1)
        feature_names = X.columns.tolist() # Store column names for later

        print(f"Features (X) shape: {X.shape}")
        print(f"Target (y) shape: {y.shape}")
        print("Features columns:", feature_names)

        # 4. Split Data into Training and Testing Sets
        print(f"\nSplitting data into Training ({1-TEST_SET_SIZE:.0%}) and Testing ({TEST_SET_SIZE:.0%}) sets...")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=TEST_SET_SIZE,
            random_state=RANDOM_SEED  # for reproducibility
        )
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        # 5. Apply Feature Scaling (StandardScaler)
        # StandardScaler removes the mean and scales to unit variance.
        # It should be fitted ONLY on the training data.
        print("\nApplying StandardScaler to features (X)...")
        scaler = StandardScaler()

        # Fit the scaler on the training data features and transform it
        X_train_scaled = scaler.fit_transform(X_train)

        # Transform the test data features using the SAME fitted scaler
        X_test_scaled = scaler.transform(X_test)

        # For easier inspection, convert scaled arrays back to DataFrames
        X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names, index=X_train.index)
        X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names, index=X_test.index)

        print("\n--- Results ---")
        print("Splitting and Scaling complete.")
        print("Variables created: X_train_scaled_df, X_test_scaled_df, y_train, y_test")

        print("\nScaled Training Features (X_train_scaled_df) Head:")
        print(X_train_scaled_df.head())

        print("\nScaled Training Features Description:")
        print(X_train_scaled_df.describe()) # Mean should be ~0, Std Dev ~1

        print("\nTraining Target (y_train) Head:")
        print(y_train.head())

    except FileNotFoundError:
         print(f"Error: Input file not found at {PREPROCESSED_CSV_FILE}")
    except ValueError as ve:
         print(f"ValueError during processing: {ve}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()


    print(f"\n--- Data Splitting and Scaling Finished ---")
    print("\nNext step: Choose a machine learning model and train it using X_train_scaled_df and y_train.")
    print("Evaluate the trained model using X_test_scaled_df and y_test.")

--- Starting Data Splitting and Scaling ---

Loading data from: preprocessed_cgpa_data.csv
Successfully loaded 300 rows and 21 columns.
Original dtypes:
 Age                       int64
HoursOfStudyPerDay      float64
SchoolAttendanceRate    float64
TuitionAccess             int64
CGPA                    float64
dtype: object

--- Correcting Data Types and Handling Potential Missing Values ---
Converting boolean column 'Gender_Female' to integer (1/0)...
Converting boolean column 'Gender_Male' to integer (1/0)...
Converting boolean column 'Gender_Other' to integer (1/0)...
Gender columns converted.
Found missing values in 'PartTimeWork'. Filling with 0 (assuming 'No').

Separating features (X) and target (y = 'CGPA')...
Features (X) shape: (300, 20)
Target (y) shape: (300,)
Features columns: ['Age', 'HoursOfStudyPerDay', 'SchoolAttendanceRate', 'TuitionAccess', 'AveragePreviousScores', 'HoursOfSleep', 'BreakfastDaily', 'ScreenTimeHours', 'PhysicalActivityHours', 'PlaysSport', 'MentalHe

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import os

# --- Configuration ---
PREPROCESSED_CSV_FILE = 'preprocessed_cgpa_data.csv'
TARGET_COLUMN = 'CGPA'
TEST_SET_SIZE = 0.2 # Should match the split in the previous step
RANDOM_SEED = 42    # Should match the seed in the previous step

# --- Functions ---

def load_and_prepare_data(file_path, target_col, test_size, random_seed):
    """Loads, cleans, splits, and scales the data."""
    print("--- Loading and Preparing Data ---")
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return None, None, None, None, None, None

    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows.")

    # Clean data (as done in the previous script)
    gender_cols = [col for col in df.columns if col.startswith('Gender_')]
    for col in gender_cols:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)

    if 'PartTimeWork' in df.columns:
         if df['PartTimeWork'].isnull().any():
            df['PartTimeWork'] = df['PartTimeWork'].fillna(0).astype(int)

    # Ensure all features are numeric, handle potential NaNs
    feature_columns_list = df.columns.difference([target_col])
    for col in feature_columns_list:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.dropna()
    print(f"Rows after cleaning/dropping NaNs: {len(df)}")
    if df.empty:
        print("Error: DataFrame empty after cleaning.")
        return None, None, None, None, None, None

    # Separate features and target
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    feature_names = X.columns.tolist()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_seed
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert scaled arrays back to DataFrames
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names, index=X_train.index)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names, index=X_test.index)

    print("Data preparation complete.")
    return X_train_scaled_df, X_test_scaled_df, y_train, y_test, feature_names, scaler


def train_evaluate_model(X_train, y_train, X_test, y_test):
    """Trains a RandomForestRegressor and evaluates it."""
    print("\n--- Training RandomForestRegressor Model ---")
    # Initialize the model (you can tune hyperparameters later)
    # n_estimators: number of trees in the forest
    # random_state: for reproducibility of the model itself
    model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1) # Use all available CPU cores

    # Train the model
    model.fit(X_train, y_train)
    print("Model training complete.")

    # Make predictions on the test set
    print("\n--- Evaluating Model Performance ---")
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse) # Or: mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    print(f"Evaluation Metrics on Test Set:")
    print(f"  Mean Absolute Error (MAE):  {mae:.4f}")
    print(f"  Mean Squared Error (MSE):   {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"  R-squared (R²) Score:       {r2:.4f}") # Closer to 1 is better

    # Display comparison for a few test samples
    print("\nSample Predictions vs Actual Values:")
    comparison_df = pd.DataFrame({'Actual CGPA': y_test, 'Predicted CGPA': y_pred})
    print(comparison_df.head(10)) # Show first 10 comparisons

    return model, y_pred

def display_feature_importances(model, feature_names):
    """Displays the feature importances of the trained RandomForest model."""
    if hasattr(model, 'feature_importances_'):
        print("\n--- Feature Importances (from RandomForest) ---")
        importances = model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        }).sort_values(by='Importance', ascending=False)

        print(feature_importance_df)
    else:
        print("\nThe selected model does not provide feature importances.")


# --- Main Execution ---
if __name__ == "__main__":
    # 1. Load, Prepare, Split, Scale Data
    X_train_scaled_df, X_test_scaled_df, y_train, y_test, features, scaler = load_and_prepare_data(
        PREPROCESSED_CSV_FILE, TARGET_COLUMN, TEST_SET_SIZE, RANDOM_SEED
    )

    if X_train_scaled_df is not None: # Check if data loading was successful
        # 2. Train and Evaluate Model
        trained_model, y_predictions = train_evaluate_model(
            X_train_scaled_df, y_train, X_test_scaled_df, y_test
        )

        # 3. Display Feature Importances (Specific to tree-based models like RandomForest)
        display_feature_importances(trained_model, features)

        print("\n--- Process Finished ---")

--- Loading and Preparing Data ---
Loaded 300 rows.
Rows after cleaning/dropping NaNs: 300
Data preparation complete.

--- Training RandomForestRegressor Model ---
Model training complete.

--- Evaluating Model Performance ---
Evaluation Metrics on Test Set:
  Mean Absolute Error (MAE):  0.0955
  Mean Squared Error (MSE):   0.0330
  Root Mean Squared Error (RMSE): 0.1817
  R-squared (R²) Score:       0.9886

Sample Predictions vs Actual Values:
     Actual CGPA  Predicted CGPA
203          7.5           7.528
266          2.5           2.189
152          7.5           7.610
9            9.2           9.200
233          7.8           7.890
226          6.6           6.529
196          8.0           9.042
109          7.2           7.076
5            8.9           8.701
175          6.7           6.704

--- Feature Importances (from RandomForest) ---
                           Feature    Importance
4            AveragePreviousScores  3.439500e-01
1               HoursOfStudyPerDay  2.687

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score # Keep metrics for confirmation
import numpy as np
import os
import pickle # Import the pickle library

# --- Configuration ---
PREPROCESSED_CSV_FILE = 'preprocessed_cgpa_data.csv'
TARGET_COLUMN = 'CGPA'
TEST_SET_SIZE = 0.2
RANDOM_SEED = 42
MODEL_PKL_FILE = 'rf_cgpa_model.pkl' # Filename for the saved model
SCALER_PKL_FILE = 'scaler.pkl'       # Filename for the saved scaler

# --- Functions ---

def load_and_prepare_data(file_path, target_col, test_size, random_seed):
    """Loads, cleans, splits, and scales the data. Returns necessary components."""
    print("--- Loading and Preparing Data ---")
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return None, None, None, None, None, None

    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows.")

    # Clean data (as done before)
    gender_cols = [col for col in df.columns if col.startswith('Gender_')]
    for col in gender_cols:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
    if 'PartTimeWork' in df.columns:
         if df['PartTimeWork'].isnull().any():
            df['PartTimeWork'] = df['PartTimeWork'].fillna(0).astype(int)

    # Ensure all features are numeric, handle potential NaNs
    feature_columns_list = df.columns.difference([target_col])
    for col in feature_columns_list:
        if not pd.api.types.is_numeric_dtype(df[col]):
             try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
             except Exception as e:
                  print(f"Warning: Could not convert column {col} to numeric: {e}")
    df = df.dropna()
    print(f"Rows after cleaning/dropping NaNs: {len(df)}")
    if df.empty:
        print("Error: DataFrame empty after cleaning.")
        return None, None, None, None, None, None

    # Separate features and target
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    feature_names = X.columns.tolist() # Get feature names

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_seed
    )

    # Scale features - Fit scaler ONLY on training data
    scaler = StandardScaler()
    # Important: Fit scaler on the DataFrame to retain feature names
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test) # Use same fitted scaler for test

    # Optional: Convert scaled arrays back to DataFrames for inspection if needed
    # X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names, index=X_train.index)
    # X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names, index=X_test.index)

    print("Data preparation complete.")
    # Return scaled data AND the fitted scaler
    return X_train_scaled, X_test_scaled, y_train, y_test, feature_names, scaler


# --- Main Execution ---
if __name__ == "__main__":
    # 1. Load, Prepare, Split, Scale Data
    X_train_scaled, X_test_scaled, y_train, y_test, features, scaler = load_and_prepare_data(
        PREPROCESSED_CSV_FILE, TARGET_COLUMN, TEST_SET_SIZE, RANDOM_SEED
    )

    if X_train_scaled is not None: # Check if data loading was successful
        # 2. Train the Model
        print("\n--- Training RandomForestRegressor Model ---")
        model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)
        model.fit(X_train_scaled, y_train) # Train on scaled data
        print("Model training complete.")

        # (Optional) Evaluate model on test set to confirm performance before saving
        print("\n--- Evaluating Model Performance (Confirmation) ---")
        y_pred = model.predict(X_test_scaled)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"  MAE on Test Set:  {mae:.4f}")
        print(f"  R² on Test Set:   {r2:.4f}")

        # 3. Save the Trained Model
        print(f"\n--- Saving Model to {MODEL_PKL_FILE} ---")
        try:
            with open(MODEL_PKL_FILE, 'wb') as f: # 'wb' means Write Binary
                pickle.dump(model, f)
            print("Model saved successfully.")
        except Exception as e:
            print(f"Error saving model: {e}")

        # 4. Save the Fitted Scaler
        print(f"\n--- Saving Scaler to {SCALER_PKL_FILE} ---")
        try:
            with open(SCALER_PKL_FILE, 'wb') as f:
                pickle.dump(scaler, f)
            print("Scaler saved successfully.")
        except Exception as e:
            print(f"Error saving scaler: {e}")

        print("\n--- Process Finished: Model and Scaler Saved ---")
    else:
        print("\n--- Process Aborted due to data loading/preparation errors ---")

--- Loading and Preparing Data ---
Loaded 300 rows.
Rows after cleaning/dropping NaNs: 300
Data preparation complete.

--- Training RandomForestRegressor Model ---
Model training complete.

--- Evaluating Model Performance (Confirmation) ---
  MAE on Test Set:  0.0955
  R² on Test Set:   0.9886

--- Saving Model to rf_cgpa_model.pkl ---
Model saved successfully.

--- Saving Scaler to scaler.pkl ---
Scaler saved successfully.

--- Process Finished: Model and Scaler Saved ---


In [6]:
import pandas as pd
import numpy as np
import pickle
import os

# --- Configuration ---
MODEL_PKL_FILE = 'rf_cgpa_model.pkl'  # Filename of the saved model
SCALER_PKL_FILE = 'scaler.pkl'        # Filename of the saved scaler

# --- Functions ---

def load_model_and_scaler(model_path, scaler_path):
    """Loads the trained model and scaler from pickle files."""
    print("--- Loading Model and Scaler ---")
    loaded_model, loaded_scaler = None, None

    # Load Model
    if os.path.exists(model_path):
        try:
            with open(model_path, 'rb') as f:
                loaded_model = pickle.load(f)
            print(f"Model loaded successfully from {model_path}")
        except Exception as e:
            print(f"Error loading model: {e}")
    else:
        print(f"Error: Model file not found at {model_path}")

    # Load Scaler
    if os.path.exists(scaler_path):
        try:
            with open(scaler_path, 'rb') as f:
                loaded_scaler = pickle.load(f)
            print(f"Scaler loaded successfully from {scaler_path}")
        except Exception as e:
            print(f"Error loading scaler: {e}")
    else:
        print(f"Error: Scaler file not found at {scaler_path}")

    return loaded_model, loaded_scaler

def predict_cgpa(model, scaler, new_data_df):
    """
    Takes new student data (as a DataFrame), scales it using the loaded scaler,
    and predicts CGPA using the loaded model.
    """
    if model is None or scaler is None:
        print("Error: Model or Scaler not loaded. Cannot predict.")
        return None

    print("\n--- Preparing New Data for Prediction ---")
    if not hasattr(scaler, 'feature_names_in_'):
        print("Error: Loaded scaler missing feature names.")
        return None

    expected = scaler.feature_names_in_
    missing = set(expected) - set(new_data_df.columns)
    extra   = set(new_data_df.columns) - set(expected)
    if missing:
        print(f"Error: Missing columns: {missing}")
        return None
    if extra:
        print(f"Warning: Extra columns will be dropped: {extra}")
    new_data_df = new_data_df[expected]

    try:
        X_scaled = scaler.transform(new_data_df)
        print("New data scaled successfully.")
    except Exception as e:
        print(f"Error scaling data: {e}")
        return None

    print("\n--- Making Predictions ---")
    try:
        preds = model.predict(X_scaled)
        print("Predictions made successfully.")
        return preds
    except Exception as e:
        print(f"Error during prediction: {e}")
        return None

# --- Main Execution ---
if __name__ == "__main__":
    model, scaler = load_model_and_scaler(MODEL_PKL_FILE, SCALER_PKL_FILE)

    if model and scaler:
        # 2. Manually defined “realistic” data for 10 students
        new_students_data = {
            'Age':                         [18, 19, 20, 21, 22, 18, 19, 20, 21, 22],
            'Gender_Female':              [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
            'Gender_Male':                [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
            'Gender_Other':               [0]*10,
            'HoursOfStudyPerDay':         [2.5, 4.0, 5.0, 3.5, 6.0, 1.5, 4.5, 2.0, 5.5, 3.0],
            'SchoolAttendanceRate':       [88.0, 95.0, 78.0, 99.0, 92.0, 85.0, 97.0, 80.0, 90.0, 94.0],
            'TuitionAccess':              [1, 1, 0, 1, 1, 0, 1, 0, 1, 1],
            'AveragePreviousScores':      [75.0, 85.0, 82.0, 90.0, 88.0, 70.0, 92.0, 78.0, 87.0, 91.0],
            'ParticipatesInClubs':        [0, 1, 1, 0, 1, 0, 1, 0, 1, 1],
            'HoursOfSleep':               [7.0, 6.5, 8.0, 7.5, 6.0, 7.0, 8.5, 6.8, 7.2, 6.9],
            'BreakfastDaily':             [1, 1, 0, 1, 1, 0, 1, 0, 1, 1],
            'ScreenTimeHours':            [3.5, 2.0, 4.0, 1.5, 3.0, 5.0, 2.5, 4.5, 1.0, 3.8],
            'PhysicalActivityHours':      [1.0, 0.5, 2.0, 1.5, 0.8, 1.0, 2.5, 0.7, 1.8, 1.2],
            'PlaysSport':                 [0, 1, 1, 0, 1, 0, 1, 0, 1, 1],
            'MentalHealthScore':          [7, 8, 6, 9, 8, 5, 9, 6, 7, 8],
            'StudyEnvironmentRating':     [4, 5, 3, 5, 4, 2, 5, 3, 4, 5],
            'FriendSupportScore':         [8,10, 7,10, 9, 6,10, 7, 8,10],
            'ParentalEducationLevel_Encoded': [1,2,1,2,0,1,2,0,1,2],
            'HouseholdIncomeLevel_Encoded':   [1,2,1,2,1,0,2,0,1,2],
            'PartTimeWork':               [0, 0, 1, 0, 1, 1, 0, 1, 0, 0],
        }
        new_students_df = pd.DataFrame(new_students_data)

        # Pretty-print inputs
        sep = "="*80
        print(f"\n{sep}")
        print("  Sample Student Data (10 Records)  ".center(80, "─"))
        print(sep)
        print(new_students_df.to_string(index=False))
        print(sep)

        # 3. Predict
        preds = predict_cgpa(model, scaler, new_students_df)
        if preds is not None:
            new_students_df['Predicted_CGPA'] = np.round(preds, 2)
            print(f"\n{sep}")
            print("  Predicted CGPAs  ".center(80, "─"))
            print(sep)
            print(new_students_df[
                ['Age','HoursOfStudyPerDay','AveragePreviousScores','Predicted_CGPA']
            ].to_string(index=False))
            print(sep + "\n")

    else:
        print("\n--- Prediction Process Aborted due to loading errors ---")


--- Loading Model and Scaler ---
Model loaded successfully from rf_cgpa_model.pkl
Scaler loaded successfully from scaler.pkl

──────────────────────  Sample Student Data (10 Records)  ──────────────────────
 Age  Gender_Female  Gender_Male  Gender_Other  HoursOfStudyPerDay  SchoolAttendanceRate  TuitionAccess  AveragePreviousScores  ParticipatesInClubs  HoursOfSleep  BreakfastDaily  ScreenTimeHours  PhysicalActivityHours  PlaysSport  MentalHealthScore  StudyEnvironmentRating  FriendSupportScore  ParentalEducationLevel_Encoded  HouseholdIncomeLevel_Encoded  PartTimeWork
  18              1            0             0                 2.5                  88.0              1                   75.0                    0           7.0               1              3.5                    1.0           0                  7                       4                   8                               1                             1             0
  19              0            1             0         