# 🤖 Model Training

In [None]:
Machine Learning Models

# In[26]:


# Predictor columns fed to every model in this section; `refor` is the target.
feature_columns = [
    'fire', 'secveg', 'crop', 'urban', 'port', 'river', 'road',
    'mining', 'dgfor', 'defor', 'precp', 'precn', 'tempp',
]
X = df[feature_columns]
y = df['refor']


# ### *Linear Regression*

# In[28]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression

get_ipython().run_line_magic('reload_ext', 'memory_profiler')

# NOTE(review): `%memit` is invoked without a statement, so it presumably only
# reports the kernel's current memory rather than the cost of the fit below — confirm.
get_ipython().run_line_magic('memit', '')

# Hold out 30% of the rows, then halve that hold-out into validation and test
# parts (70/15/15 overall). Both splits share one seed for reproducibility.
split_seed = 45
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

# Build the estimator first so that only the fit call is timed.
model = LinearRegression()
start_time = time.time()
model.fit(X_train, y_train)
end_time = time.time()
time_taken_1 = end_time - start_time

# Score the fitted model on the test split (X_val / y_val are not used here).
y_test_pred = model.predict(X_test)

import pickle
import sys

# Determine memory usage (in bytes) of the data splits and the predictions
X_train_size = sys.getsizeof(X_train)
X_val_size = sys.getsizeof(X_val)
X_test_size = sys.getsizeof(X_test)
y_train_size = sys.getsizeof(y_train)
y_val_size = sys.getsizeof(y_val)
y_test_size = sys.getsizeof(y_test)

y_test_pred_size = sys.getsizeof(y_test_pred)
total_size = sum((
    X_train_size, X_val_size, X_test_size,
    y_train_size, y_val_size, y_test_size,
    y_test_pred_size,
))

# sys.getsizeof() is shallow: applied to the estimator it counts only the
# Python object itself, not the fitted attributes it references, so it reports
# a tiny constant regardless of the model. The length of the pickled estimator
# includes those attributes and is a far better proxy for the model's footprint.
model_size = len(pickle.dumps(model))

# Calculate metrics on the test set
mse_test = mean_squared_error(y_test, y_test_pred)
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`: that argument is deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises a TypeError. For a single target the value is identical.
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print(f"Time taken: {time_taken_1} seconds")
print("Test Mean Squared Error (MSE):", mse_test)
print("Test Root Mean Squared Error (RMSE):", rmse_test)
print("Test R-squared:", r2_test)
print("Test Mean Absolute Error (MAE):", mae_test)
print("Basic Data Storage:", total_size)
print("Model Memory:", model_size)

# Residual = observed target minus prediction on the test split
residuals = y_test - y_test_pred

# Scatter residuals against predictions, with a zero line as visual reference
resid_fig, resid_ax = plt.subplots(figsize=(10, 6))
resid_ax.scatter(y_test_pred, residuals, alpha=0.5)
resid_ax.axhline(y=0, color='red')
resid_ax.set_title('Residuals vs Predicted Values for Linear Regression on Test Set')
resid_ax.set_xlabel('Predicted Values')
resid_ax.set_ylabel('Residuals')
plt.show()


# In[29]:


import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression

# Estimate the mutual information between each feature and the target.
# The estimator adds a small random noise to continuous features, so a fixed
# random_state is required for the ranking to be reproducible between runs.
ig = mutual_info_regression(X, y, random_state=45)

# Map each feature name to its importance score
feature_scores = dict(zip(X.columns, ig))

# Sort the features by importance score in descending order
sorted_features = sorted(feature_scores.items(), key=lambda item: item[1], reverse=True)

# Print the feature importance scores in ranked order
for feature, score in sorted_features:
    print("Feature:", feature, "Score:", score)

# Split the ranked pairs once; both lists are reused by the chart below
ranked_names = [feature for feature, _ in sorted_features]
ranked_scores = [score for _, score in sorted_features]

# Plot a horizontal bar chart of the feature importance scores
fig, ax = plt.subplots()
y_pos = np.arange(len(sorted_features))
ax.barh(y_pos, ranked_scores, align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(ranked_names)
ax.invert_yaxis()  # Labels read top-to-bottom
ax.set_xlabel("Importance Score")
ax.set_title("Feature Importance Scores (Information Gain)")

# Annotate each bar with its rounded score, placed just past the bar's end
for i, v in enumerate(ranked_scores):
    ax.text(v + 0.01, i, str(round(v, 3)), color="black", fontweight="bold")

plt.show()


# In[30]:


from sklearn.model_selection import learning_curve
from sklearn.linear_model import LinearRegression

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 10)):
    """Plot training and cross-validated mean squared error against training-set size.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Features and target handed to ``learning_curve``.
    ylim : tuple, optional
        ``(ymin, ymax)`` unpacked into ``plt.ylim``; axis limits are left
        automatic when ``None``.
    cv, n_jobs, train_sizes :
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the figure drawn but not shown.
    """
    plt.figure(figsize=(12, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    # The plotted quantity is an error (lower is better), so the axis and the
    # legend name it as MSE rather than a generic "score".
    plt.ylabel("Mean Squared Error")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring='neg_mean_squared_error',
    )
    # The scorer yields negated MSE; flip the sign of the per-size means.
    # The standard deviation is unaffected by the sign.
    train_scores_mean = -np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = -np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded bands show one standard deviation across the CV folds
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training MSE")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation MSE")

    plt.legend(loc="best")

    return plt

# NOTE(review): this rebinds `model`, replacing the estimator that was fitted
# in the Linear Regression cell with a fresh, unfitted one.
model = LinearRegression()

# Learning curve for LinearRegression, evaluated with 10-fold cross-validation
title = "Learning Curves (LinearRegression)"
cv = 10
plot_learning_curve(
    estimator=model,
    title=title,
    X=X_train,
    y=y_train,
    cv=cv,
    n_jobs=-1,
)

plt.show()


# In[31]:


from sklearn.linear_model import Ridge, Lasso

# Create and train the Ridge regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Predict on the test set using the Ridge regression model
y_test_pred_ridge = ridge_model.predict(X_test)

# Calculate metrics on the test set for the Ridge regression model.
# RMSE is the square root of the MSE; the `squared=False` argument is
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse_test_ridge = mean_squared_error(y_test, y_test_pred_ridge)
rmse_test_ridge = np.sqrt(mse_test_ridge)
r2_test_ridge = r2_score(y_test, y_test_pred_ridge)

# These numbers come from X_test / y_test, so they are reported as test metrics
print("Ridge Test Mean Squared Error (MSE):", mse_test_ridge)
print("Ridge Test Root Mean Squared Error (RMSE):", rmse_test_ridge)
print("Ridge Test R-squared:", r2_test_ridge)

# Create and train the Lasso regression model
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)

# Predict on the test set using the Lasso regression model
y_test_pred_lasso = lasso_model.predict(X_test)

# Calculate metrics on the test set for the Lasso regression model
mse_test_lasso = mean_squared_error(y_test, y_test_pred_lasso)
rmse_test_lasso = np.sqrt(mse_test_lasso)
r2_test_lasso = r2_score(y_test, y_test_pred_lasso)

print("Lasso Test Mean Squared Error (MSE):", mse_test_lasso)
print("Lasso Test Root Mean Squared Error (RMSE):", rmse_test_lasso)
print("Lasso Test R-squared:", r2_test_lasso)


# ### *Decision Tree*

# In[32]:


from sklearn.tree import DecisionTreeRegressor

get_ipython().run_line_magic('reload_ext', 'memory_profiler')

get_ipython().run_line_magic('memit', '')
# Split the data into training, validation, and test sets (70/15/15).
# This rebinds the split variables using seed 42, so the rows differ from the
# seed-45 split used in the Linear Regression cell.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create and train the model. The tree permutes features at every split, so
# ties between equally good splits can resolve differently from run to run;
# a fixed random_state makes the fitted tree reproducible.
dt = DecisionTreeRegressor(max_depth=10, min_samples_leaf=10, min_samples_split=40, random_state=42)
start_time = time.time()
dt.fit(X_train, y_train)
end_time = time.time()

# Predict on the test set
y_test_pred = dt.predict(X_test)

# Perform 5-fold cross-validation on the training split
# (no `scoring` is passed, so the estimator's default score is used)
cv_scores = cross_val_score(dt, X_train, y_train, cv=5)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Print the mean cross-validation score
print("Mean cross-validation score:", cv_scores.mean())

# Calculate metrics on the test set
mse_test = mean_squared_error(y_test, y_test_pred)
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`, which is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print(f"Time taken: {end_time - start_time} seconds")
print("Test Mean Squared Error (MSE):", mse_test)
print("Test Root Mean Squared Error (RMSE):", rmse_test)
print("Test R-squared:", r2_test)
print("Test Mean Absolute Error (MAE):", mae_test)

# Residual = observed target minus the tree's prediction on the test split
residuals_test = y_test - y_test_pred

# Scatter residuals against predictions, with a zero line as visual reference
tree_fig, tree_ax = plt.subplots(figsize=(10, 6))
tree_ax.scatter(y_test_pred, residuals_test, alpha=0.5)
tree_ax.axhline(y=0, color='red')
tree_ax.set_title('Residuals vs Predicted Values for Decision Tree on Test Set')
tree_ax.set_xlabel('Predicted Values')
tree_ax.set_ylabel('Residuals')
plt.show()


# In[33]:


from sklearn.model_selection import learning_curve

# NOTE(review): this function is also defined in the Linear Regression section
# of this notebook; consider keeping a single definition near the top.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 10)):
    """Plot training and cross-validated mean squared error against training-set size.

    Parameters
    ----------
    estimator : object
        Estimator handed to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Features and target handed to ``learning_curve``.
    ylim : tuple, optional
        ``(ymin, ymax)`` unpacked into ``plt.ylim``; axis limits are left
        automatic when ``None``.
    cv, n_jobs, train_sizes :
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the figure drawn but not shown.
    """
    plt.figure(figsize=(12, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    # The plotted quantity is an error (lower is better), so the axis and the
    # legend name it as MSE rather than a generic "score".
    plt.ylabel("Mean Squared Error")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring='neg_mean_squared_error',
    )
    # The scorer yields negated MSE; flip the sign of the per-size means.
    # The standard deviation is unaffected by the sign.
    train_scores_mean = -np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = -np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()

    # Shaded bands show one standard deviation across the CV folds
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training MSE")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation MSE")

    plt.legend(loc="best")

    return plt

# Defining the model. This rebinds `dt` to a fresh, unfitted tree, and the
# hyperparameter-search cells that follow pass this `dt` as their estimator.
# A fixed random_state keeps tie-breaking between equally good splits
# reproducible, so the curve and the searches give the same result on every run.
dt = DecisionTreeRegressor(max_depth=10, min_samples_leaf=10, min_samples_split=40, random_state=42)

# Plotting learning curve for DecisionTreeRegressor
title = "Learning Curves (DecisionTreeRegressor)"
cv = 10  # Define the number of folds for cross-validation
plot_learning_curve(dt, title, X_train, y_train, cv=cv, n_jobs=-1)

plt.show()


# In[34]:


from sklearn.model_selection import GridSearchCV

# Candidate values per tree hyperparameter (5 x 4 x 4 = 80 combinations)
param_grid = dict(
    max_depth=[5, 10, 15, 20, 25],
    min_samples_leaf=[5, 10, 15, 20],
    min_samples_split=[10, 20, 30, 40],
)

# Exhaustive search over the grid, scored by R^2 with 5-fold cross-validation
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, scoring='r2', cv=5)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Report the winning combination
print(grid_search.best_params_)


# In[35]:


from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameters and the distributions to sample from.
# scipy.stats.randint(low, high) excludes `high`, so each upper bound is one
# past the largest value we want to allow. This makes the sampled ranges
# (5-25, 5-20, 10-40) match the bounds of the grid search.
param_dist = {
    'max_depth': randint(5, 26),
    'min_samples_leaf': randint(5, 21),
    'min_samples_split': randint(10, 41)
}

# Create a RandomizedSearchCV object. random_state fixes which 20 candidates
# are drawn, so the search is reproducible.
random_search = RandomizedSearchCV(dt, param_distributions=param_dist,
                                   n_iter=20, cv=5, scoring='r2',
                                   random_state=42)

# Fit the model to the data and find the best hyperparameters
random_search.fit(X_train, y_train)

# Print the best parameters it found
print(random_search.best_params_)


# ### *Random Forest*

# In[36]:


from sklearn.ensemble import RandomForestRegressor

# Split the data into training, validation, and test sets (70/15/15)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# The forest draws bootstrap samples at random, so without a fixed random_state
# the importances, CV scores and predictions below change on every run.
rf = RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_leaf=1, max_features=1.0, random_state=42)  # Adjust the number of estimators if needed
start_time = time.time()
rf.fit(X_train, y_train)
end_time = time.time()

# Rank the features by importance, largest first, and print them
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = X.columns[indices]
sorted_importances = importances[indices]
for feature_name, importance in zip(feature_names, sorted_importances):
    print(f"{feature_name}: {importance}")

time_taken_2 = end_time - start_time

# Perform 25-fold cross validation (R^2) on the training split.
# NOTE(review): the original comment said "5-fold" while the code uses cv=25;
# confirm which was intended, since 25 folds refit the 200-tree forest 25 times.
scores = cross_val_score(rf, X_train, y_train, cv=25, scoring='r2')

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Make predictions using the Random Forest model
y_pred = rf.predict(X_test)

# Calculate evaluation metrics
# Print