In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the dataset
df = pd.read_csv('/Users/rajatthakur/Desktop/SuperDataScienceML/CollaborationProjects/Edu-spend/International_Education_Costs.csv')


# Display basic information about the dataset
print("Dataset Info:")
print("-" * 50)
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])
print("\nColumns:")
print(df.columns.to_list())


# Display first few rows
display(df.head())

Dataset Info:
--------------------------------------------------
Number of rows: 907
Number of columns: 12

Columns:
['Country', 'City', 'University', 'Program', 'Level', 'Duration_Years', 'Tuition_USD', 'Living_Cost_Index', 'Rent_USD', 'Visa_Fee_USD', 'Insurance_USD', 'Exchange_Rate']


Unnamed: 0,Country,City,University,Program,Level,Duration_Years,Tuition_USD,Living_Cost_Index,Rent_USD,Visa_Fee_USD,Insurance_USD,Exchange_Rate
0,USA,Cambridge,Harvard University,Computer Science,Master,2.0,55400,83.5,2200,160,1500,1.0
1,UK,London,Imperial College London,Data Science,Master,1.0,41200,75.8,1800,485,800,0.79
2,Canada,Toronto,University of Toronto,Business Analytics,Master,2.0,38500,72.5,1600,235,900,1.35
3,Australia,Melbourne,University of Melbourne,Engineering,Master,2.0,42000,71.2,1400,450,650,1.52
4,Germany,Munich,Technical University of Munich,Mechanical Engineering,Master,2.0,500,70.5,1100,75,550,0.92


# Preprocessing Steps

1. Feature Engineering
First, you’ll want to create a new column for TCA:

In [2]:
df['TCA'] = (
    df['Tuition_USD'] * df['Duration_Years'] +
    df['Rent_USD'] * 12 * df['Duration_Years'] +
    df['Visa_Fee_USD'] +
    df['Insurance_USD'] * df['Duration_Years']
)

In [3]:
df

Unnamed: 0,Country,City,University,Program,Level,Duration_Years,Tuition_USD,Living_Cost_Index,Rent_USD,Visa_Fee_USD,Insurance_USD,Exchange_Rate,TCA
0,USA,Cambridge,Harvard University,Computer Science,Master,2.0,55400,83.5,2200,160,1500,1.00,166760.0
1,UK,London,Imperial College London,Data Science,Master,1.0,41200,75.8,1800,485,800,0.79,64085.0
2,Canada,Toronto,University of Toronto,Business Analytics,Master,2.0,38500,72.5,1600,235,900,1.35,117435.0
3,Australia,Melbourne,University of Melbourne,Engineering,Master,2.0,42000,71.2,1400,450,650,1.52,119350.0
4,Germany,Munich,Technical University of Munich,Mechanical Engineering,Master,2.0,500,70.5,1100,75,550,0.92,28575.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
902,France,Strasbourg,University of Strasbourg,Data Analytics,Master,2.0,4000,70.2,1000,99,850,0.92,33799.0
903,Malaysia,Nilai,USIM,Computer Science,Bachelor,3.0,6800,50.5,400,120,400,4.65,36120.0
904,Saudi Arabia,Al-Ahsa,King Faisal University,Information Systems,Master,2.0,4200,64.2,600,200,800,3.75,24600.0
905,USA,Seattle,University of Washington,Software Development,PhD,5.0,50000,77.8,2000,160,1500,1.00,377660.0


Step 2: Select Features and Target
Choose which columns you want to use as features (X) and set TCA as your target (y):

In [4]:
X = df.drop(['TCA', 'Tuition_USD', 'Visa_Fee_USD', 'Insurance_USD', 'Rent_USD'], axis=1)
y = df['TCA']

In [5]:
X

Unnamed: 0,Country,City,University,Program,Level,Duration_Years,Living_Cost_Index,Exchange_Rate
0,USA,Cambridge,Harvard University,Computer Science,Master,2.0,83.5,1.00
1,UK,London,Imperial College London,Data Science,Master,1.0,75.8,0.79
2,Canada,Toronto,University of Toronto,Business Analytics,Master,2.0,72.5,1.35
3,Australia,Melbourne,University of Melbourne,Engineering,Master,2.0,71.2,1.52
4,Germany,Munich,Technical University of Munich,Mechanical Engineering,Master,2.0,70.5,0.92
...,...,...,...,...,...,...,...,...
902,France,Strasbourg,University of Strasbourg,Data Analytics,Master,2.0,70.2,0.92
903,Malaysia,Nilai,USIM,Computer Science,Bachelor,3.0,50.5,4.65
904,Saudi Arabia,Al-Ahsa,King Faisal University,Information Systems,Master,2.0,64.2,3.75
905,USA,Seattle,University of Washington,Software Development,PhD,5.0,77.8,1.00


Step 3: Encode Categorical Variables: 
Random Forests can’t handle string categories directly, so we need to encode them. The most common way is One-Hot Encoding:

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Select categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, columns=categorical_cols)

In [43]:
# Save the columns to a file
X_encoded.columns.to_list()
pd.Series(X_encoded.columns).to_csv("model_columns.csv", index=False)

In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


Step 4: Train/Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

Step 5: Ready for Modeling!

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(
    n_estimators=100,      # Number of trees in the forest
    random_state=42        # For reproducibility
)

# Fit the model to the training data
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

Evaluation (optional, but recommended):

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2:.2f}")

MAE: 9894.01
RMSE: 16888.06
R^2: 0.96


In [10]:
import pandas as pd

# Create a DataFrame to compare actual and predicted values
results = pd.DataFrame({
    'Actual_TCA': y_test.values,
    'Predicted_TCA': y_pred
})

# Display the first few rows
print(results.head(20))

    Actual_TCA  Predicted_TCA
0      24040.0       23206.85
1      28310.0       24337.05
2     215850.0      190129.40
3      58500.0       47968.83
4      29350.0       55937.00
5      13310.0       17936.90
6      23650.0       21943.05
7     107590.0      105752.50
8      33799.0       33895.13
9      24420.0       27084.30
10     30675.0       24481.64
11    377660.0      386030.00
12     60820.0       59358.10
13     55620.0       50953.00
14     79035.0       76530.25
15     52560.0       50120.20
16    408660.0      405860.00
17     38810.0       36987.43
18     40320.0       48608.15
19     40875.0       30120.74


In [11]:
new_profile = {
    'Country': 'USA',
    'City': 'Boston',
    'University': 'MIT',
    'Program': 'Data Science',
    'Level': 'Master',
    'Duration_Years': 2,
    'Living_Cost_Index': 85.0,
    'Rent_USD': 2500,
    'Visa_Fee_USD': 160,
    'Insurance_USD': 1500,
    'Exchange_Rate': 1.0
}

In [12]:
# Convert to DataFrame
new_df = pd.DataFrame([new_profile])

# One-hot encode (align columns with training data)
new_df_encoded = pd.get_dummies(new_df)
new_df_encoded = new_df_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Predict TCA
predicted_tca = rf.predict(new_df_encoded)
print(f"Predicted TCA: ${predicted_tca[0]:.2f}")

Predicted TCA: $167660.00


1. Feature Importance: which features are most influential in predicting TCA.

In [13]:
# Get feature importances and feature names
importances = rf.feature_importances_
feature_names = X_encoded.columns

# Create a DataFrame for easy sorting and viewing
feat_imp_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort by importance descending and show top 15
top_features = feat_imp_df.sort_values(by='Importance', ascending=False).head(15)
print(top_features)

                       Feature  Importance
69                 Country_USA    0.375562
2                Exchange_Rate    0.133731
0               Duration_Years    0.114852
1345              Level_Master    0.072013
5            Country_Australia    0.070762
68                  Country_UK    0.068172
1            Living_Cost_Index    0.048708
12              Country_Canada    0.030578
1346                 Level_PhD    0.011242
529             City_Singapore    0.010250
1269  Program_Computer Science    0.008524
56           Country_Singapore    0.006739
67                 Country_UAE    0.004977
1344            Level_Bachelor    0.004347
34             Country_Ireland    0.004305


Interpretation
Country_USA (0.375):
Whether the program is in the USA is the most influential factor in predicting TCA. This makes sense, as US education costs are often much higher than in other countries.

Exchange_Rate (0.134):
The exchange rate has a significant impact, affecting the conversion of local costs to USD.

Duration_Years (0.110):
The length of the program is also important, as longer programs naturally cost more.

Level_Master (0.070):
Whether the program is a Master’s degree is also a key factor.

What does this mean?

Your model is heavily influenced by the country (especially the USA), the exchange rate, and the duration of the program.
These results are logical and suggest your data is being used sensibly by the model.

2. Hyperparameter Tuning (RandomizedSearchCV Example): This will help you find the best parameters for your Random Forest.

In [14]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Set up RandomizedSearchCV
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit to training data
rf_random.fit(X_train, y_train)

# Best parameters
print("Best parameters found:", rf_random.best_params_)

# Evaluate the best model
best_rf = rf_random.best_estimator_
y_pred_best = best_rf.predict(X_test)
print("MAE (tuned):", mean_absolute_error(y_test, y_pred_best))
print("RMSE (tuned):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 (tuned):", r2_score(y_test, y_pred_best))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=

3. Try Gradient Boosting (XGBoost)

In [15]:
#import sys
#print(sys.executable)


In [16]:
# /Users/rajatthakur/miniforge3/envs/14Junemlflow-clean-env/bin/python -m pip install xgboost

In [17]:
from xgboost import XGBRegressor

# Train XGBoost regressor
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb.predict(X_test)
print("MAE (XGBoost):", mean_absolute_error(y_test, y_pred_xgb))
print("RMSE (XGBoost):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 (XGBoost):", r2_score(y_test, y_pred_xgb))

MAE (XGBoost): 9225.625520475618
RMSE (XGBoost): 16888.060154482297
R^2 (XGBoost): 0.964075767850752


In [18]:
import pandas as pd

# Create a DataFrame for comparison
results_xgb = pd.DataFrame({
    'Actual_TCA': y_test.values,
    'Predicted_TCA_XGB': y_pred_xgb
})

# Display the first 10 rows
print(results_xgb.head(10))

   Actual_TCA  Predicted_TCA_XGB
0     24040.0       24314.378906
1     28310.0       21836.056641
2    215850.0      196507.984375
3     58500.0       47074.191406
4     29350.0       71652.789062
5     13310.0       13716.552734
6     23650.0       27203.396484
7    107590.0      115703.031250
8     33799.0       32385.943359
9     24420.0       24232.083984


In [19]:
# Create a DataFrame for comparison
results_compare = pd.DataFrame({
    'Actual_TCA': y_test.values,
    'Predicted_TCA_XGB': y_pred_xgb,
    'Predicted_TCA_RF': y_pred
})

# Display the first 10 rows
print(results_compare.head(10))

   Actual_TCA  Predicted_TCA_XGB  Predicted_TCA_RF
0     24040.0       24314.378906          23206.85
1     28310.0       21836.056641          24337.05
2    215850.0      196507.984375         190129.40
3     58500.0       47074.191406          47968.83
4     29350.0       71652.789062          55937.00
5     13310.0       13716.552734          17936.90
6     23650.0       27203.396484          21943.05
7    107590.0      115703.031250         105752.50
8     33799.0       32385.943359          33895.13
9     24420.0       24232.083984          27084.30


In [20]:
# Hyperparameter Tuning for XGBoost

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb = XGBRegressor(random_state=42)

xgb_random = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

xgb_random.fit(X_train, y_train)

print("Best parameters found:", xgb_random.best_params_)

# Evaluate
y_pred_xgb_tuned = xgb_random.predict(X_test)
print("MAE (tuned):", mean_absolute_error(y_test, y_pred_xgb_tuned))
print("RMSE (tuned):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 (tuned):", r2_score(y_test, y_pred_xgb_tuned))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   0.9s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=10, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=10, n_estimators=100, subsample=0.8; total time=   1.2s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.6; total time=   1.3s
[CV] END colsample_bytree=1.0, learning_rate=0.2, max_depth=10, n_estimators=100, subsample=0.8; total time=   1.3s
[CV] END colsample_bytree=0.6, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.6; total time=   1.3s
[CV] END colsa

In [22]:
# Create a DataFrame for comparison
results_compare = pd.DataFrame({
    'Actual_TCA': y_test.values,
    'Predicted_TCA_XGB': y_pred_xgb,
    'Predicted_TCA_RF': y_pred,
    'Predicted_TCA_XGB_Tuned': y_pred_xgb_tuned
})

# Display the first 10 rows
print(results_compare.head(10))

   Actual_TCA  Predicted_TCA_XGB  Predicted_TCA_RF  Predicted_TCA_XGB_Tuned
0     24040.0       24314.378906          23206.85             23226.193359
1     28310.0       21836.056641          24337.05             23790.230469
2    215850.0      196507.984375         190129.40            197560.328125
3     58500.0       47074.191406          47968.83             49277.796875
4     29350.0       71652.789062          55937.00             62570.437500
5     13310.0       13716.552734          17936.90             17793.029297
6     23650.0       27203.396484          21943.05             25673.412109
7    107590.0      115703.031250         105752.50            115260.578125
8     33799.0       32385.943359          33895.13             32885.480469
9     24420.0       24232.083984          27084.30             23321.582031


In [23]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [24]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [25]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [26]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [27]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use your best_rf model from RandomizedSearchCV
with mlflow.start_run(run_name="RandomForest Tuned"):
    # Predict on the test set
    y_pred_rf = best_rf.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_rf)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
    
    r2 = r2_score(y_test, y_pred_rf)

    # Log parameters and metrics
    mlflow.log_params(best_rf.get_params())
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(best_rf, "model")

    print(f"Logged Random Forest run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged Random Forest run with MAE: 21400.71, RMSE: 30838.01, R2: 0.859




In [28]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [29]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [30]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [31]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [32]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [33]:
  import sklearn
  print(sklearn.__version__)

1.6.1


In [34]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964




In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [45]:
df2 = pd.read_csv("/Users/rajatthakur/Desktop/SuperDataScienceML/CollaborationProjects/Edu-spend/model_columns.csv")
print(df2.columns)

Index(['0'], dtype='object')


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964


In [None]:
import mlflow
import mlflow.sklearn
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Start an MLflow run
with mlflow.start_run(run_name="XGBoost Tuned"):
    # Use best parameters from RandomizedSearchCV
    best_params = xgb_random.best_params_
    xgb_final = XGBRegressor(**best_params, random_state=42)
    xgb_final.fit(X_train, y_train)
    y_pred_xgb_tuned = xgb_final.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred_xgb_tuned)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb_tuned))
    
    r2 = r2_score(y_test, y_pred_xgb_tuned)

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Log the model
    mlflow.sklearn.log_model(xgb_final, "model")

    print(f"Logged run with MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")

Logged run with MAE: 9724.36, RMSE: 15589.63, R2: 0.964
