In [2]:
# ----------------------------------------------------------------------
# 5G RESOURCE ALLOCATION XGBOOST REGRESSOR TRAINING SCRIPT (COLAB)
# ----------------------------------------------------------------------
# This script is designed to run in a Google Colab environment.
# It mounts Google Drive, performs final data preparation (cleaning
# and encoding is done on the fly), trains the XGBoost model, and
# saves the results back to Drive for use in MATLAB.
# ----------------------------------------------------------------------

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy.io import savemat
from google.colab import drive
from google.colab import userdata, _message as message
import os

# --- CONFIGURATION ---
# Set this variable to control which file the script attempts to load:
# Options: 'CLEANED' (uses '5G_Resource_Allocation_Cleaned.csv')
#          'ORIGINAL' (uses '5G_Service (1).csv' and cleans it live)
INPUT_FILE_SOURCE = 'CLEANED'

if INPUT_FILE_SOURCE == 'ORIGINAL':
    INPUT_CSV_FILENAME = '5G_Service (1).csv'
elif INPUT_FILE_SOURCE == 'CLEANED':
    INPUT_CSV_FILENAME = '5G_Resource_Allocation_Cleaned.csv'
else:
    raise ValueError("INPUT_FILE_SOURCE must be 'CLEANED' or 'ORIGINAL'.")
# ----------------------------------------


# ----------------------------------------------------------------------
# STEP 1: MOUNT GOOGLE DRIVE
# Run this cell and follow the link to authorize Google Drive access.
# ----------------------------------------------------------------------
print("--- Step 1: Mounting Google Drive ---")
try:
    drive.mount('/content/gdrive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")


# ----------------------------------------------------------------------
# STEP 2: DATA CLEANING AND PREPARATION FUNCTIONS
# ----------------------------------------------------------------------

def convert_bandwidth_to_mbps(bw_str):
    """Converts bandwidth strings (Mbps/Kbps) to float Mbps."""
    if pd.isna(bw_str): return np.nan
    bw_str = str(bw_str).strip()
    if "Mbps" in bw_str:
        return float(bw_str.replace(" Mbps", ""))
    elif "Kbps" in bw_str:
        return float(bw_str.replace(" Kbps", "")) / 1000.0
    try:
        return float(bw_str)
    except ValueError:
        return np.nan

def clean_and_prepare_data(df):
    """Cleans the 5G data and prepares it for XGBoost (used if INPUT_FILE_SOURCE is 'ORIGINAL')."""

    # 1. Clean Numerical Columns
    df['Required_Bandwidth_Mbps'] = df['Required_Bandwidth'].apply(convert_bandwidth_to_mbps)
    df['Allocated_Bandwidth_Mbps'] = df['Allocated_Bandwidth'].apply(convert_bandwidth_to_mbps)
    df['Signal_Strength_dBm'] = df['Signal_Strength'].str.replace(' dBm', '').astype(float)
    df['Latency_ms'] = df['Latency'].str.replace(' ms', '').astype(float)
    df['Resource_Allocation_Pct'] = df['Resource_Allocation'].str.replace('%', '').astype(float) / 100.0

    # 2. Handle Categorical Feature (Application_Type)
    df_encoded = pd.get_dummies(df, columns=['Application_Type'], prefix='App', drop_first=True)

    # 3. Define Features (X) and Target (Y)
    target_col = 'Allocated_Bandwidth_Mbps'

    # Select all relevant columns for X
    feature_cols = [
        'Signal_Strength_dBm', 'Latency_ms', 'Required_Bandwidth_Mbps',
        'Resource_Allocation_Pct'
    ]
    # Add all new Application_Type dummy columns
    feature_cols += [col for col in df_encoded.columns if col.startswith('App_')]

    X = df_encoded[feature_cols]
    y = df_encoded[target_col]

    return X, y, feature_cols

# ----------------------------------------------------------------------
# STEP 3: LOAD DATA, TRAIN MODEL, AND SAVE OUTPUTS
# This cell executes the core ML logic.
# ----------------------------------------------------------------------

def execute_training():

    # PROMPT USER FOR THE CORRECT PATH
    print("\n--- Step 3: Verifying File Path ---")

    # Provide a helpful default path format for the user
    default_prompt = "/content/gdrive/MyDrive/My_5G_Project_Data/"

    raw_path_input = input(f"Please enter the EXACT Google Drive folder path (e.g., {default_prompt}): ").strip()

    # --- Robust Path Cleaning ---
    drive_base_path = raw_path_input
    # 1. Check if the user pasted the filename as well, and strip it.
    if drive_base_path.endswith(INPUT_CSV_FILENAME):
        drive_base_path = drive_base_path.removesuffix(INPUT_CSV_FILENAME)

    # 2. Ensure the path ends with a slash
    if not drive_base_path.endswith('/'):
        drive_base_path += '/'

    full_path = os.path.join(drive_base_path, INPUT_CSV_FILENAME)

    print(f"\nUsing Base Folder Path: {drive_base_path}")
    print(f"Attempting to load data file: {INPUT_CSV_FILENAME}")
    print(f"Full path being used: {full_path}")

    # ðŸš¨ CHECK POINT: List files in the base path to verify directory structure
    try:
        files_in_drive = os.listdir(drive_base_path)
        print(f"Files found in '{drive_base_path}': {files_in_drive}")

        if INPUT_CSV_FILENAME not in files_in_drive:
             print(f"\nFATAL ERROR: '{INPUT_CSV_FILENAME}' was not found in the directory listing.")
             print("Please ensure the CSV file name and the DRIVE_BASE_PATH you entered are correct.")
             return

    except FileNotFoundError:
        print("\nFATAL ERROR: The specified DRIVE_BASE_PATH does not exist. Check for typos in your entry.")
        return
    except Exception as e:
        print(f"\nAn error occurred while listing the directory: {e}")
        return


    try:
        df = pd.read_csv(full_path)
    except Exception as e:
        print(f"\nERROR reading CSV file, even though file was found: {e}")
        print("The CSV might be corrupt or improperly formatted. Check if the file is truly a CSV.")
        return

    # Determine if cleaning is needed based on the INPUT_FILE_SOURCE flag
    if INPUT_FILE_SOURCE == 'ORIGINAL':
        print("Cleaning and preparing data...")
        X, y, feature_names = clean_and_prepare_data(df)
    else:
        # If using the already cleaned CSV
        target_col = 'Allocated_Bandwidth_Mbps'
        X = df.drop(columns=[target_col])
        y = df[target_col]
        feature_names = X.columns.tolist()

    print(f"Data ready. Total features: {len(feature_names)}")

    # Train-test split (optional, but good practice)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Training XGBoost Regressor...")
    # Initialize and train XGBoost Regressor
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=150,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    )
    model.fit(X_train, y_train)
    print("Training complete.")


    # --- Save Outputs to Google Drive ---

    # 1. Save model
    model_output_path = os.path.join(drive_base_path, "xgboost_model.json")
    model.save_model(model_output_path)
    print(f"âœ… Model saved successfully to: {model_output_path}")

    # 2. Save feature names (.mat format required for clean MATLAB import)
    feature_names_dict = {'feature_names': np.array(feature_names, dtype=object)}
    feature_output_path = os.path.join(drive_base_path, "feature_names.mat")
    savemat(feature_output_path, feature_names_dict)
    print(f"âœ… Feature names saved successfully to: {feature_output_path}")

    # 3. Print necessary information for MATLAB user
    print("\n--- NEXT STEPS FOR MATLAB ---")
    print("1. Download 'xgboost_model.json' and 'feature_names.mat' to your Mac.")
    print("2. Ensure your 'predict_in_matlab.m' script has the correct Python path.")
    print("3. Run 'predict_in_matlab' in MATLAB.")

# Execute the main function
execute_training()


--- Step 1: Mounting Google Drive ---
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Google Drive mounted successfully.

--- Step 3: Verifying File Path ---
Please enter the EXACT Google Drive folder path (e.g., /content/gdrive/MyDrive/My_5G_Project_Data/): /content/gdrive/MyDrive/5G XGBoost /

Using Base Folder Path: /content/gdrive/MyDrive/5G XGBoost /
Attempting to load data file: 5G_Resource_Allocation_Cleaned.csv
Full path being used: /content/gdrive/MyDrive/5G XGBoost /5G_Resource_Allocation_Cleaned.csv
Files found in '/content/gdrive/MyDrive/5G XGBoost /': ['5G_Resource_Allocation_Cleaned.csv']
Data ready. Total features: 14
Training XGBoost Regressor...
Training complete.
âœ… Model saved successfully to: /content/gdrive/MyDrive/5G XGBoost /xgboost_model.json
âœ… Feature names saved successfully to: /content/gdrive/MyDrive/5G XGBoost /feature_names.mat

--- NEXT STEPS FOR MATLAB ---
1. Download '