In [7]:
import pandas as pd
from pymongo import MongoClient
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
import matplotlib.pyplot as plt



In [8]:
def load_data_from_mongodb():
    db_url = "mongodb+srv://abisekraut:Avanna94801@cluster0.epbos.mongodb.net/"
    client = MongoClient(db_url)
    db = client['BanderSnatch']
    collection = db['Monster']

    data = list(collection.find())
    df = pd.DataFrame(data)
    
    # Clean up DataFrame: Remove MongoDB's '_id' if present
    if '_id' in df.columns:
        df.drop('_id', axis=1, inplace=True)
    
    return df

df = load_data_from_mongodb()
print(df.head())


           Name       Type  Level  Rarity  Damage  Health  Energy  Sanity  \
0         Ghast     Undead      1  Rank 2   1d6+1    4.88    4.42    8.67   
1   Kobold Mage   Devilkin     10  Rank 0  10d2+2   19.88   20.38   19.46   
2     Pit Fiend    Demonic     11  Rank 2  11d6+4   64.71   64.58   66.03   
3       Efreeti  Elemental      7  Rank 1     7d4   28.60   29.81   27.04   
4  Pseudodragon     Dragon      3  Rank 4  3d10+3   32.36   29.04   28.24   

             Timestamp  
0  2025-01-16 21:30:33  
1  2025-01-16 21:30:33  
2  2025-01-16 21:30:33  
3  2025-01-16 21:30:33  
4  2025-01-16 21:30:33  


In [9]:
# Example for encoding categorical data if the target variable is categorical
from sklearn.preprocessing import LabelEncoder

# Assuming 'target' is your categorical target column
if df['Sanity'].dtype == 'object':
    le = LabelEncoder()
    df['Sanity'] = le.fit_transform(df['Sanity'])

# Optionally, encode other categorical features
categorical_features = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_features)


In [10]:
X = df.drop('Sanity', axis=1)
y = df['Sanity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


In [11]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

models = {
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'SVM': SVR()
}

# Re-run model training
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # Use regression metrics, e.g., mean squared error
    print(f"{name} MSE:", mean_squared_error(y_test, predictions))



Random Forest MSE: 4.608006144449997
Gradient Boosting MSE: 4.592279822334725
SVM MSE: 111.00467658508069


In [12]:
from sklearn.metrics import mean_squared_error

# Dictionary to store models and their MSE for easy comparison
model_performance = {}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    model_performance[name] = mse
    print(f"{name} MSE:", mse)

# Identify the model with the lowest MSE
best_model_name = min(model_performance, key=model_performance.get)
best_model = models[best_model_name]
print(f"Best Model: {best_model_name} with MSE: {model_performance[best_model_name]}")


Random Forest MSE: 4.771134637549983
Gradient Boosting MSE: 4.54668046206719
SVM MSE: 111.00467658508069
Best Model: Gradient Boosting with MSE: 4.54668046206719


In [13]:
from sklearn.model_selection import GridSearchCV

# Example using RandomForestRegressor for Grid Search
if best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = -grid_search.best_score_  # Convert to positive as GridSearchCV returns negative MSE

    print("Best parameters:", best_params)
    print("Best cross-validated MSE:", best_score)

# After tuning, re-evaluate the model on the test data
final_predictions = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
print(f"Final MSE of {best_model_name} after tuning: {final_mse}")


Final MSE of Gradient Boosting after tuning: 4.54668046206719


In [15]:
from math import sqrt

final_predictions = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = sqrt(final_mse)  # Manually compute the square root of MSE
final_mae = mean_absolute_error(y_test, final_predictions)
final_r2 = r2_score(y_test, final_predictions)

print(f"Final MSE: {final_mse}")
print(f"Final RMSE: {final_rmse}")
print(f"Final MAE: {final_mae}")
print(f"Final R^2 Score: {final_r2}")


Final MSE: 4.54668046206719
Final RMSE: 2.132294647103723
Final MAE: 1.599559871141953
Final R^2 Score: 0.9957245113399319


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor  # Assuming a regression task

# Define the model
model = GradientBoostingRegressor()

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
}

# Setup the grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [17]:
# Best parameters found
print("Best parameters found: ", grid_search.best_params_)

# Best score from the grid search
print("Best MSE from grid search: ", -grid_search.best_score_)

# Use the best estimator to make predictions
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Test MSE with best model: ", mse)


Best parameters found:  {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best MSE from grid search:  5.4233602662302856
Test MSE with best model:  4.920872234601521
