# Basic Exploration

In [1]:
!pip install sklearn==0.0 sklearn-pandas==1.8.0



# Dependencies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import statsmodels.api as sm
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
from statsmodels.formula.api import ols
from scipy.stats import zscore
from statsmodels.stats.stattools import durbin_watson
from sklearn.model_selection import train_test_split,KFold
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFECV
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV,ElasticNetCV

# Load and describe data

In [3]:
!pip install ucimlrepo



In [4]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6.5, 6.5))

import warnings
warnings.filterwarnings('ignore')

<Figure size 650x650 with 0 Axes>

In [5]:
target = 'area'

In [6]:
forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets

df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets

In [7]:
df.shape

(517, 13)

---

# Missing value treatment

In [8]:
df.isna().sum().sum()

0

In [9]:
# Outlier points
y_outliers = df[abs(zscore(df[target])) >= 3 ]
y_outliers

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
237,1,2,sep,tue,91.0,129.5,692.6,7.0,18.8,40,2.2,0.0,212.88
238,6,5,sep,sat,92.5,121.1,674.4,8.6,25.1,27,4.0,0.0,1090.84
415,8,6,aug,thu,94.8,222.4,698.6,13.9,27.5,27,4.9,0.0,746.28
479,7,4,jul,mon,89.2,103.9,431.6,6.4,22.6,57,4.9,0.0,278.53


### Independent columns

In [10]:
dfa = df.drop(columns=target)
cat_columns = dfa.select_dtypes(include='object').columns.tolist()
num_columns = dfa.select_dtypes(exclude='object').columns.tolist()

cat_columns,num_columns

(['month', 'day'],
 ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain'])

---

In [11]:
print(df['area'].describe(),'\n')
print(y_outliers)

count     517.000000
mean       12.847292
std        63.655818
min         0.000000
25%         0.000000
50%         0.520000
75%         6.570000
max      1090.840000
Name: area, dtype: float64 

     X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain     area
237  1  2   sep  tue  91.0  129.5  692.6   7.0  18.8  40   2.2   0.0   212.88
238  6  5   sep  sat  92.5  121.1  674.4   8.6  25.1  27   4.0   0.0  1090.84
415  8  6   aug  thu  94.8  222.4  698.6  13.9  27.5  27   4.9   0.0   746.28
479  7  4   jul  mon  89.2  103.9  431.6   6.4  22.6  57   4.9   0.0   278.53


# Outlier treatment

We had observed outliers in the following columns:
1. area
2. FFMC
2. ISI
3. rain

In [12]:
out_columns = ['area','FFMC','ISI','rain']

# Preparing the data for modelling
Thing which we can cover here
- Encoding the categorical columns

In [13]:
df = pd.get_dummies(df,columns=['day','month'],drop_first=True)

- Data transformations like `log,root,inverse,exponential`,etc

In [14]:
print(df[out_columns].describe())
np.log1p(df[out_columns]).skew(), np.log1p(df[out_columns]).kurtosis()

              area        FFMC         ISI        rain
count   517.000000  517.000000  517.000000  517.000000
mean     12.847292   90.644681    9.021663    0.021663
std      63.655818    5.520111    4.559477    0.295959
min       0.000000   18.700000    0.000000    0.000000
25%       0.000000   90.200000    6.500000    0.000000
50%       0.520000   91.600000    8.400000    0.000000
75%       6.570000   92.900000   10.800000    0.000000
max    1090.840000   96.200000   56.100000    6.400000


(area     1.217838
 FFMC   -11.675394
 ISI     -0.937218
 rain    14.173028
 dtype: float64,
 area      0.945668
 FFMC    185.482383
 ISI       2.584588
 rain    234.240025
 dtype: float64)

In [15]:
# FFMC and rain are still having high skew and kurtosis values,
# since we will be using Linear regression model we cannot operate with such high values
# so for FFMC we can remove the outliers in them using z-score method
mask = df.loc[:,['FFMC']].apply(zscore).abs() < 3

# Since most of the values in rain are 0.0, we can convert it as a categorical column
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))

df = df[mask.values]
df.shape

(510, 28)

In [16]:
out_columns.remove('rain')
df[out_columns] = np.log1p(df[out_columns])

In [17]:
df[out_columns].skew()

area    1.208492
FFMC   -1.803993
ISI    -0.434372
dtype: float64

---

## Linear Regression

In [19]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load data
forest_fires = fetch_ucirepo(id=162)  # Fetch forest fire dataset from UCI repository
features = forest_fires.data.features  # Extract features from the dataset
targets = forest_fires.data.targets  # Extract target variable from the dataset
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)  # Create DataFrame for features
df['area'] = targets  # Add target variable 'area' to the DataFrame

# Data preprocessing
df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)  # One-hot encode categorical columns
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))  # Convert 'rain' column to binary variable
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]  # Remove outliers
df['area'] = np.log1p(df['area'])  # Log-transform the target variable 'area'

# Feature selection
X = df.drop(columns=['area'])  # Features
y = df['area']  # Target variable
selector = RFECV(LinearRegression(), cv=5)  # Initialize RFECV with Linear Regression estimator and 5-fold CV
selector.fit(X, y)  # Fit RFECV on data
selected_features = X.columns[selector.support_]  # Get selected features

# Model building
X_selected = X[selected_features]  # Selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)  # Train-test split
model = LinearRegression()  # Initialize Linear Regression model
model.fit(X_train, y_train)  # Train the model on the training data

# Model evaluation
y_train_pred = model.predict(X_train)  # Predictions on training data
y_test_pred = model.predict(X_test)  # Predictions on test data

train_mse = mean_squared_error(y_train, y_train_pred)  # Mean Squared Error on training data
test_mse = mean_squared_error(y_test, y_test_pred)  # Mean Squared Error on test data

train_rmse = np.sqrt(train_mse)  # Root Mean Squared Error on training data
test_rmse = np.sqrt(test_mse)  # Root Mean Squared Error on test data

train_mae = mean_absolute_error(y_train, y_train_pred)  # Mean Absolute Error on training data
test_mae = mean_absolute_error(y_test, y_test_pred)  # Mean Absolute Error on test data

train_r2 = r2_score(y_train, y_train_pred)  # R-squared score on training data
test_r2 = r2_score(y_test, y_test_pred)  # R-squared score on test data

# Inference
print(f"Selected Features: {selected_features}")  # Print selected features
print(f"Test MSE: {test_mse}")  # Print test Mean Squared Error
print(f"Test RMSE: {test_rmse}")  # Print test Root Mean Squared Error
print(f"Test MAE: {test_mae}")  # Print test Mean Absolute Error
print(f"Test R2 Score: {test_r2}")  # Print test R-squared score


Selected Features: Index(['month_dec', 'month_nov'], dtype='object')
Test MSE: 1.8870239012814765
Test RMSE: 1.3736898854113604
Test MAE: 1.1482768510031323
Test R2 Score: 0.006522653557889835


## XGBoost

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from ucimlrepo import fetch_ucirepo

forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets


df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]
df['area'] = np.log1p(df['area'])

X = df.drop(columns=['area'])
y = df['area']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBRegressor()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("XGBoost Scores:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


XGBoost Scores:
Test MSE: 2.7169130226681566
Test RMSE: 1.648306107089383
Test MAE: 1.2850299997290686
Test R2 Score: -0.43039605298128647


Hyperparameter Tuning

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from ucimlrepo import fetch_ucirepo

# Load data
forest_fires = fetch_ucirepo(id=162)  # Fetch forest fire dataset from UCI repository
features = forest_fires.data.features  # Extract features from the dataset
targets = forest_fires.data.targets  # Extract target variable from the dataset
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)  # Create DataFrame for features
df['area'] = targets  # Add target variable 'area' to the DataFrame

# Data preprocessing
df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)  # One-hot encode categorical columns
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))  # Convert 'rain' column to binary variable
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]  # Remove outliers
df['area'] = np.log1p(df['area'])  # Log-transform the target variable 'area'

# Feature selection (optional for XGBoost)
X = df.drop(columns=['area'])  # Features
y = df['area']  # Target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(estimator=XGBRegressor(), param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters found by grid search
print("Best Parameters:", grid_search.best_params_)

# Best XGBoost model
best_model = grid_search.best_estimator_

# Predictions on test data
y_test_pred = best_model.predict(X_test)

# Calculate evaluation metrics
test_mse = mean_squared_error(y_test, y_test_pred)  # Mean Squared Error
test_rmse = np.sqrt(test_mse)  # Root Mean Squared Error
test_mae = mean_absolute_error(y_test, y_test_pred)  # Mean Absolute Error
test_r2 = r2_score(y_test, y_test_pred)  # R-squared score

# Print evaluation scores
print("XGBoost Scores after Hyperparameter Tuning:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


KeyboardInterrupt: 

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
XGBoost Scores after Hyperparameter Tuning:
Test MSE: 2.0374115924589407
Test RMSE: 1.4273792742151403
Test MAE: 1.1399235992666652
Test R2 Score: -0.07265321923687473

## Catboost

In [None]:
!pip install -q catboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from ucimlrepo import fetch_ucirepo

forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets

# Data preprocessing
df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]
df['area'] = np.log1p(df['area'])

X = df.drop(columns=['area'])
y = df['area']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = CatBoostRegressor(verbose=0)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("CatBoost Scores:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


Hyperparameter Tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from ucimlrepo import fetch_ucirepo

forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets

df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]
df['area'] = np.log1p(df['area'])

X = df.drop(columns=['area'])
y = df['area']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300]
}

model = CatBoostRegressor()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(best_model.get_params())

y_test_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("CatBoost Scores after Hyperparameter Tuning:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


{'iterations': 100, 'learning_rate': 0.01, 'depth': 8, 'loss_function': 'RMSE'}
CatBoost Scores after Hyperparameter Tuning:
Test MSE: 1.8895315127838481
Test RMSE: 1.3746023107735008
Test MAE: 1.1493961327051425
Test R2 Score: 0.005202450236886813

## LightGBM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor
from ucimlrepo import fetch_ucirepo


forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets

df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]
df['area'] = np.log1p(df['area'])


X = df.drop(columns=['area'])
y = df['area']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LGBMRegressor()
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

 print("LGBM Scores:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


Hyperparameter Tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from lightgbm import LGBMRegressor
from ucimlrepo import fetch_ucirepo

forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets

df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]
df['area'] = np.log1p(df['area'])

X = df.drop(columns=['area'])
y = df['area']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    'num_leaves': [20, 30, 40],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [100, 200, 300]
}

model = LGBMRegressor()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

print("Finalized Parameters of the Best Model:")
print(best_model.get_params())

y_test_pred = best_model.predict(X_test)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("LGBM Scores after Hyperparameter Tuning:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


Finalized Parameters of the Best Model:
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': 4, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 20, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
LGBM Scores after Hyperparameter Tuning:
Test MSE: 1.976040596049711
Test RMSE: 1.4057171109614164
Test MAE: 1.1345768461113845
Test R2 Score: -0.04034271452109217

# Random forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_rf, y_train_rf)
best_params = grid_search.best_params_
best_rf_model = RandomForestRegressor(random_state=42, **best_params)
best_rf_model.fit(X_train_rf, y_train_rf)
best_rf_y_test_pred = best_rf_model.predict(X_test_rf)

best_rf_test_mse = mean_squared_error(y_test_rf, best_rf_y_test_pred)
best_rf_test_rmse = np.sqrt(best_rf_test_mse)
best_rf_test_mae = mean_absolute_error(y_test_rf, best_rf_y_test_pred)
best_rf_test_r2 = r2_score(y_test_rf, best_rf_y_test_pred)

print("Random Forest Scores with Best Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

print(f"Test MSE: {best_rf_test_mse}")
print(f"Test RMSE: {best_rf_test_rmse}")
print(f"Test MAE: {best_rf_test_mae}")
print(f"Test R2 Score: {best_rf_test_r2}")


1)test size 0.1
max_depth: 10
min_samples_leaf: 4
min_samples_split: 10
n_estimators: 300
Test MSE: 2.597105961883943
Test RMSE: 1.6115538966736245
Test MAE: 1.2488122114504228
Test R2 Score: -0.06848152437123023

2) test size 0.2 Random Forest Scores with Best Parameters:
max_depth: None
min_samples_leaf: 4
min_samples_split: 10
n_estimators: 100
Test MSE: 1.9761864740270336
Test RMSE: 1.4057689973914753
Test MAE: 1.14262010726586
Test R2 Score: -0.0404195161268992

# Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Train-test split with test_size=0.1
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_model = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X_train_dt, y_train_dt)

best_params = grid_search.best_params_
best_dt_model = DecisionTreeRegressor(random_state=42, **best_params)
best_dt_model.fit(X_train_dt, y_train_dt)
best_dt_y_test_pred = best_dt_model.predict(X_test_dt)
best_dt_test_mse = mean_squared_error(y_test_dt, best_dt_y_test_pred)
best_dt_test_rmse = np.sqrt(best_dt_test_mse)
best_dt_test_mae = mean_absolute_error(y_test_dt, best_dt_y_test_pred)
best_dt_test_r2 = r2_score(y_test_dt, best_dt_y_test_pred)

print("Decision Tree Scores with Best Parameters:")
print(f"Test MSE: {best_dt_test_mse}")
print(f"Test RMSE: {best_dt_test_rmse}")
print(f"Test MAE: {best_dt_test_mae}")
print(f"Test R2 Score: {best_dt_test_r2}")

# Print the best parameters
print("\nBest Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


1) test size 0.1
Decision Tree Scores with Best Parameters:
Test MSE: 3.3606311430470437
Test RMSE: 1.8332024282787331
Test MAE: 1.3650773078149845
Test R2 Score: -0.38260523031089

Best Parameters:
max_depth: 10
min_samples_leaf: 4
min_samples_split: 10

2) tet size 0.2 Decision Tree Scores with Best Parameters:
Test MSE: 2.9191310194894022
Test RMSE: 1.7085464639539079
Test MAE: 1.2412350054056487
Test R2 Score: -0.5368594627708394

Best Parameters:
max_depth: 10
min_samples_leaf: 4
min_samples_split: 10

# Extratree

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor

X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

et_model = ExtraTreesRegressor(random_state=42)

grid_search = GridSearchCV(estimator=et_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(X_train_rf, y_train_rf)

best_params = grid_search.best_params_

best_et_model = ExtraTreesRegressor(random_state=42, **best_params)

best_et_model.fit(X_train_rf, y_train_rf)

best_et_y_test_pred = best_et_model.predict(X_test_rf)

best_et_test_mse = mean_squared_error(y_test_rf, best_et_y_test_pred)
best_et_test_rmse = np.sqrt(best_et_test_mse)
best_et_test_mae = mean_absolute_error(y_test_rf, best_et_y_test_pred)
best_et_test_r2 = r2_score(y_test_rf, best_et_y_test_pred)


print("Extra Trees Scores with Best Parameters:")
print(f"Test MSE: {best_et_test_mse}")
print(f"Test RMSE: {best_et_test_rmse}")
print(f"Test MAE: {best_et_test_mae}")
print(f"Test R2 Score: {best_et_test_r2}")


print("\nBest Parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")


1)test size 0.1
Extra Trees Scores with Best Parameters:
Test MSE: 2.5737690417140033
Test RMSE: 1.604297055321739
Test MAE: 1.2050085350536315
Test R2 Score: -0.058880426686629184

Best Parameters:
max_depth: 10
min_samples_leaf: 4
min_samples_split: 10
n_estimators: 200

2) test size 0.2 Extra Trees Scores with Best Parameters:
Test MSE: 1.885591855003105
Test RMSE: 1.3731685457375962
Test MAE: 1.0755141497221092
Test R2 Score: 0.00727659500804978

Best Parameters:
max_depth: 10
min_samples_leaf: 4
min_samples_split: 10
n_estimators: 200

# CLASSIFICATION

In [22]:
!pip install catboost



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from ucimlrepo import fetch_ucirepo

# Load data
forest_fires = fetch_ucirepo(id=162)
features = forest_fires.data.features
targets = forest_fires.data.targets
df = pd.DataFrame(data=features, columns=forest_fires.feature_names)
df['area'] = targets

# Data preprocessing
df = pd.get_dummies(df, columns=['day', 'month'], drop_first=True)
df['rain'] = df['rain'].apply(lambda x: int(x > 0.0))
df = df[df['FFMC'].apply(lambda x: abs((x - df['FFMC'].mean()) / df['FFMC'].std()) < 3)]
df['log_area'] = np.log1p(df['area'])  # Log transform the 'area' column

# Feature selection (optional for CatBoost)
X = df.drop(columns=['area', 'log_area'])  # Excluding both 'area' and 'log_area'
y = df['log_area']  # Using log transformed 'area' as target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning
param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300]
}

model = CatBoostRegressor()
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_
print(best_model.get_params())

# Predictions on test data
y_test_pred = best_model.predict(X_test)

# Calculate evaluation metrics
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Print evaluation scores
print("CatBoost Scores after Hyperparameter Tuning:")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")
print(f"Test MAE: {test_mae}")
print(f"Test R2 Score: {test_r2}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2:	learn: 1.3697432	total: 23.4ms	remaining: 756ms
3:	learn: 1.3629796	total: 25.1ms	remaining: 602ms
4:	learn: 1.3543994	total: 26.6ms	remaining: 505ms
5:	learn: 1.3495812	total: 33ms	remaining: 517ms
6:	learn: 1.3440651	total: 34.7ms	remaining: 462ms
7:	learn: 1.3337766	total: 36.4ms	remaining: 418ms
8:	learn: 1.3222831	total: 37.9ms	remaining: 383ms
9:	learn: 1.3125323	total: 43.9ms	remaining: 395ms
10:	learn: 1.2941430	total: 45.6ms	remaining: 369ms
11:	learn: 1.2922066	total: 47.1ms	remaining: 346ms
12:	learn: 1.2842755	total: 48.7ms	remaining: 326ms
13:	learn: 1.2759832	total: 52.9ms	remaining: 325ms
14:	learn: 1.2600791	total: 54.4ms	remaining: 308ms
15:	learn: 1.2549502	total: 60.5ms	remaining: 318ms
16:	learn: 1.2516051	total: 62.3ms	remaining: 304ms
17:	learn: 1.2395290	total: 64.1ms	remaining: 292ms
18:	learn: 1.2293298	total: 65.8ms	remaining: 280ms
19:	learn: 1.2220745	total: 72ms	remaining: 288ms
20:	learn: 

In [None]:
# Exclude columns starting with 'month_' and 'day_'
test_pred_df_filtered = test_pred_df.loc[:, ~test_pred_df.columns.str.startswith(('month_', 'day_'))]

# Round 'Predicted Log Area' to one decimal place
test_pred_df_filtered['Predicted Log Area'] = test_pred_df_filtered['Predicted Log Area'].round(1)

# Display the modified DataFrame
test_pred_df_filtered


In [None]:
"""graph to plot volume of predicted log area """
# import matplotlib.pyplot as plt

# bin_edges = np.arange(0.0, 2.1, 0.1)

# plt.figure(figsize=(10, 6))
# plt.hist(test_pred_df_filtered['Predicted Log Area'], bins=bin_edges, color='skyblue', edgecolor='black', alpha=0.7)
# plt.title('Volume of Data in Predicted Log Area Ranges')
# plt.xlabel('Predicted Log Area')
# plt.ylabel('Frequency')
# plt.xticks(bin_edges)
# plt.grid(True)
# plt.show()


In [None]:
def area_cat(log_area):
    if log_area <= 1.1:
        return "mild"
    elif log_area <= 1.25:
        return "moderate"
    else:
        return "severe"

test_pred_df_filtered['Damage Category'] = test_pred_df_filtered['Predicted Log Area'].apply(lambda x: area_cat(x))

test_pred_df_filtered[['Predicted Log Area', 'Damage Category']]