In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 660.6 kB/s eta 0:03:10
   ---------------------------------------- 0.1/124.9 MB 1.3 MB/s eta 0:01:36
   ---------------------------------------- 0.2/124.9 MB 1.5 MB/s eta 0:01:22
   ---------------------------------------- 0.2/124.9 MB 1.7 MB/s eta 0:01:15
   ---------------------------------------- 0.2/124.9 MB 1.7 MB/s eta 0:01:15
   ---------------------------------------- 0.4/124.9 MB 1.5 MB/s eta 0:01:22
   ---------------------------------------- 0.5/124.9 MB 1.6 MB/s eta 0:01:17
   ---------------------------------------- 0.5/124.9 MB 1.6 MB/s eta 0:01:17
   ---------------------------------------- 0.7/124.9 MB 1.6 MB/s eta 0:01:20
   ---------------------------------------- 0.8/124.9 MB 1.9 MB/s eta 0:01:07


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [4]:
data_path = "./Problem3.csv"
data_df = pd.read_csv(data_path)
data_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,4.468204,26.2,94.3,1.808289,8.2,51,6.7,False,0.000000
1,7,4,oct,tue,4.517431,35.4,669.1,2.041220,18.0,33,0.9,False,0.000000
2,7,4,oct,sat,4.517431,43.7,686.9,2.041220,14.6,33,1.3,False,0.000000
3,8,6,mar,fri,4.529368,33.3,77.5,2.302585,8.3,97,4.0,True,0.000000
4,8,6,mar,sun,4.503137,51.3,102.2,2.360854,11.4,99,1.8,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,4,3,aug,sun,4.414010,56.7,665.6,1.064711,27.8,32,2.7,False,2.006871
506,2,4,aug,sun,4.414010,56.7,665.6,1.064711,21.9,71,5.8,False,4.012592
507,7,4,aug,sun,4.414010,56.7,665.6,1.064711,21.2,70,6.7,False,2.498152
508,1,4,aug,sat,4.558079,146.0,614.7,2.509599,25.6,42,4.0,False,0.000000


In [5]:
categorical_cols = data_df.select_dtypes(include=['object', 'bool']).columns.to_list()

for col_name in categorical_cols:
    n_categories = data_df[col_name].unique()
    print(f"Number of categories in {col_name}: {n_categories}")

ordinal_encoder = OrdinalEncoder()
encoded_categories_cols = ordinal_encoder.fit_transform(data_df[categorical_cols])

encoded_categories_df = pd.DataFrame(
    encoded_categories_cols, 
    columns=categorical_cols
)

numerical_df = data_df.drop(categorical_cols, axis=1)
encoded_df = pd.concat([numerical_df, encoded_categories_df], axis=1)

Number of categories in month: ['mar' 'oct' 'aug' 'sep' 'apr' 'jun' 'jul' 'feb' 'jan' 'dec' 'may' 'nov']
Number of categories in day: ['fri' 'tue' 'sat' 'sun' 'mon' 'wed' 'thu']
Number of categories in rain: [False  True]


In [6]:
X = encoded_df.drop(columns=['area'])
y = encoded_df['area']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3,
                                                    random_state=7)

In [8]:
xg_reg = xgb.XGBRFRegressor(seed=7,
                           learning_rate=0.01,
                           n_estimators=102,
                           max_depth=3)
xg_reg.fit(X_train, y_train)

In [9]:
preds = xg_reg.predict(X_test)

In [10]:
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)

print("Evaluation rusults on test set:")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

Evaluation rusults on test set:
Mean Absolute Error: 1.1575552072476503
Mean Squared Error: 1.90197797152326
