# **DATA ANALYSIS AND DATA PROCESSING**

# **1. Importing Libraries**

---

Imports the essential libraries for data manipulation: pandas and NumPy.

In [None]:
import pandas as pd
import numpy as np

# **2. Loading Datasets**

---

Reads a CSV file located at the specified path into a DataFrame named dataset using Pandas.

In [None]:
# Location of the training CSV (absolute path under /content/drive).
training_csv_path = '/content/drive/MyDrive/datasets/training-data.csv'
dataset = pd.read_csv(training_csv_path)

# Print the trailing rows as a quick look at what was loaded.
print(dataset.tail())

      type of house  \
2710              0   
2711              0   
2712              2   
2713              0   
2714             11   

      living conditions (ventilation, screens on windows, closed environment)  \
2710                                                  0                         
2711                                                  0                         
2712                                                  3                         
2713                                                  0                         
2714                                                  5                         

      how many times do you clean the house in a week? inside:  \
2710                                                  0          
2711                                                  0          
2712                                                  2          
2713                                                  0          
2714                                        

# **MODEL TRAINING**

# **Feature Selection**

---

Prepares the data for modeling by selecting specific features and a target variable from the dataset. It then splits the data into training and testing sets, with 80% of the data used for training and 20% for testing, ensuring reproducibility by setting a random state.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns used as model inputs, followed by the column the models predict.
features = [
    't2m_max',
    'wd2m',
    'rh2m',
    'qv2m',
    'population (2020)',
    'caseclassification',
    'gender',
    'agegroup',
    'barangay',
    'outcome',
    'morbiditymonth',
    'muncity',
    'clinclass',
    'annual regular income',
    'average household size',
    'living conditions (ventilation, screens on windows, closed environment)',
    'how many times do you clean the house in a week? inside:',
    'source of water supply',
]
target = 'cases'

# Feature matrix (X) and target vector (y) taken from the loaded dataset.
X, y = dataset[features], dataset[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# **RANDOM FOREST REGRESSION**

# **1. Import Libraries**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# **2. Scale Features**

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same transform to the test split
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled arrays back to DataFrames. Both the column names and the
# original row index are carried over: train_test_split shuffles rows, so
# without index= the frames would get a fresh 0..n-1 index and would no longer
# line up by label with y_train / y_test in any pandas-level operation.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# **3. Define Parameter Grid for Hyperparameter Tuning**

In [None]:
# Candidate hyperparameter values for the Random Forest grid search
# (3 * 4 * 3 * 4 * 3 * 2 = 864 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 10],
    max_features=[None, 'sqrt', 'log2'],
    bootstrap=[True, False],
)

# **4. Initialize and Perform Grid Search**

In [None]:
# Base estimator; the fixed random_state keeps individual fits repeatable.
model = RandomForestRegressor(random_state=42)

# Search over every combination in param_grid with 5-fold cross-validation,
# scored by negative MSE. error_score='raise' makes a failing fit raise
# rather than being recorded with a placeholder score.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
    error_score='raise',
)

# Run the search on the scaled training data.
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


# **5. Evaluate the Best Model**

In [None]:
# Best estimator found by the grid search.
best_model = grid_search.best_estimator_

# Predictions for the held-out test features.
y_pred = best_model.predict(X_test_scaled)

# Error metrics on the test set (RMSE is derived from MSE).
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report the winning parameters first, then the metrics, one per line.
report_lines = [
    f"Best Parameters: {grid_search.best_params_}",
    f"Mean Absolute Error (MAE): {mae:.3f}",
    f"Mean Squared Error (MSE): {mse:.3f}",
    f"Root Mean Squared Error (RMSE): {rmse:.3f}",
    f"R-squared: {r2:.3f}",
]
print("\n".join(report_lines))

Best Parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Absolute Error (MAE): 0.131
Mean Squared Error (MSE): 0.057
Root Mean Squared Error (RMSE): 0.239
R-squared: 0.923


# **6. Print Interpretation of Results**

In [None]:
# Plain-language summary of the Random Forest test metrics and chosen parameters.
rf_best_params = grid_search.best_params_
n_trees = rf_best_params['n_estimators']
tree_depth = rf_best_params['max_depth']
feature_rule = rf_best_params['max_features']

print("\nInterpretation of Evaluation Results:")

# R-squared paragraph.
r2_paragraph = (
    f"The Random Forest Regressor model with the best parameters achieved an R-squared value of {r2:.2f}, "
    f"which means that approximately {r2 * 100:.1f}% of the variance in the target variable (dengue cases) "
    f"is explained by the model. This indicates that the model performs well in capturing the relationship "
    f"between the input features and the target, but there is still some room for improvement."
)
print(r2_paragraph)

# MSE paragraph.
mse_paragraph = (
    f"The Mean Squared Error (MSE) is {mse:.3f}, which represents the average squared difference between the "
    f"predicted and actual dengue case values. A lower MSE indicates better model performance, but since the MSE "
    f"depends on the scale of the target variable, it's important to consider both MSE and R-squared together."
)
print(mse_paragraph)

# Selected-hyperparameters paragraph.
params_paragraph = (
    f"The optimal parameters found through GridSearchCV include {rf_best_params}. The Random Forest "
    f"model is using {n_trees} trees with a maximum depth of {tree_depth} "
    f"and features selected using the '{feature_rule}' method."
)
print(params_paragraph)


Interpretation of Evaluation Results:
The Random Forest Regressor model with the best parameters achieved an R-squared value of 0.92, which means that approximately 92.3% of the variance in the target variable (dengue cases) is explained by the model. This indicates that the model performs well in capturing the relationship between the input features and the target, but there is still some room for improvement.
The Mean Squared Error (MSE) is 0.057, which represents the average squared difference between the predicted and actual dengue case values. A lower MSE indicates better model performance, but since the MSE depends on the scale of the target variable, it's important to consider both MSE and R-squared together.
The optimal parameters found through GridSearchCV include {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}. The Random Forest model is using 100 trees with a maximum depth of 20 and features s

In [None]:
import pickle

# The output path is defined once so the confirmation message always names the
# file that was actually written (the message previously reported a different
# filename, "rfr_model.pkl").
rf_model_path = 'random_forest_model.pkl'

# Save the trained Random Forest model to a .pkl file using pickle.
# NOTE(review): the model was fitted on StandardScaler-transformed features and
# the scaler is not saved here; whoever loads this file needs the same scaling.
with open(rf_model_path, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Model saved to {rf_model_path}")

Model saved to rfr_model.pkl


# **SUPPORT VECTOR MACHINE**

# **1. Import Libraries**

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# **2. Scale Features**

In [None]:
# Fit a fresh scaler on the training split only.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaling to both splits (results are plain arrays here).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **3. Define Parameter Grid for Hyperparameter Tuning**

In [None]:
# Candidate hyperparameter values for the SVR grid search
# (3 * 3 * 3 * 2 = 54 combinations).
param_grid_svm = dict(
    kernel=['linear', 'rbf', 'poly'],
    C=[0.1, 1, 10],
    gamma=['scale', 0.01, 0.1],
    degree=[2, 3],
)

# **4. Initialize and Perform Grid Search**

In [None]:
# Base SVR estimator with library defaults; the grid below overrides the tuned parameters.
model_svm = SVR()

# Search over every combination in param_grid_svm with 5-fold cross-validation,
# scored by negative MSE.
grid_search_svm = GridSearchCV(
    estimator=model_svm,
    param_grid=param_grid_svm,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the scaled training data.
grid_search_svm.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


# **5. Evaluate the Best Model**

In [None]:
# Best SVR estimator found by the grid search.
best_model_svm = grid_search_svm.best_estimator_

# Predictions for the held-out test features.
y_pred = best_model_svm.predict(X_test_scaled)

# Error metrics on the test set (RMSE is derived from MSE).
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report the winning parameters first, then the metrics, one per line.
svr_report_lines = [
    f"Best Parameters: {grid_search_svm.best_params_}",
    f"Mean Absolute Error (MAE): {mae:.3f}",
    f"Mean Squared Error (MSE): {mse:.3f}",
    f"Root Mean Squared Error (RMSE): {rmse:.3f}",
    f"R-squared: {r2:.3f}",
]
print("\n".join(svr_report_lines))

Best Parameters: {'C': 10, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
Mean Absolute Error (MAE): 0.246
Mean Squared Error (MSE): 0.308
Root Mean Squared Error (RMSE): 0.555
R-squared: 0.583


# **6. Print Interpretation of Results**

In [None]:
# Plain-language summary of the SVR test metrics and chosen parameters.
svr_best_params = grid_search_svm.best_params_
svr_kernel = svr_best_params['kernel']
svr_c = svr_best_params['C']
svr_gamma = svr_best_params['gamma']
# 'degree' is looked up with a fallback so a missing key prints as N/A.
svr_degree = svr_best_params.get('degree', 'N/A')

print("\nInterpretation of Evaluation Results:")

# R-squared paragraph.
r2_paragraph = (
    f"The Support Vector Regressor (SVR) model with the best parameters achieved an R-squared value of {r2:.2f}, "
    f"which means that approximately {r2 * 100:.1f}% of the variance in the target variable (dengue cases) "
    f"is explained by the model. This indicates a decent fit, though further improvement may still be possible."
)
print(r2_paragraph)

# MSE paragraph.
mse_paragraph = (
    f"The Mean Squared Error (MSE) is {mse:.3f}, representing the average squared difference between the actual "
    f"and predicted dengue case values. A lower MSE indicates better predictive accuracy. Considering both MSE and R² together, "
    f"the model performs reasonably well."
)
print(mse_paragraph)

# Selected-hyperparameters paragraph.
params_paragraph = (
    f"The best parameters for the SVR model are: Kernel = '{svr_kernel}', "
    f"C = {svr_c}, Gamma = {svr_gamma}. "
    f"If the 'poly' kernel was selected, the degree is {svr_degree}. "
    f"The 'C' parameter controls the trade-off between smooth decision boundaries and accurate regression, while 'gamma' "
    f"controls the influence of individual data points."
)
print(params_paragraph)


Interpretation of Evaluation Results:
The Support Vector Regressor (SVR) model with the best parameters achieved an R-squared value of 0.58, which means that approximately 58.3% of the variance in the target variable (dengue cases) is explained by the model. This indicates a decent fit, though further improvement may still be possible.
The Mean Squared Error (MSE) is 0.308, representing the average squared difference between the actual and predicted dengue case values. A lower MSE indicates better predictive accuracy. Considering both MSE and R² together, the model performs reasonably well.
The best parameters for the SVR model are: Kernel = 'rbf', C = 10, Gamma = 0.1. If the 'poly' kernel was selected, the degree is 2. The 'C' parameter controls the trade-off between smooth decision boundaries and accurate regression, while 'gamma' controls the influence of individual data points.


In [None]:
import pickle

# The output path is defined once so the file written and the message agree.
svr_model_path = 'svr_model.pkl'

# Save the trained SVR model to a .pkl file using pickle.
# The object written is best_model_svm (the tuned SVR); the cell previously
# dumped best_model, which is the Random Forest estimator from the earlier section.
# NOTE(review): the SVR was fitted on StandardScaler-transformed features and
# the scaler is not saved here; whoever loads this file needs the same scaling.
with open(svr_model_path, 'wb') as file:
    pickle.dump(best_model_svm, file)

print(f"SVR model saved to {svr_model_path}")

SVR model saved to svr_model.pkl


# **GRADIENT BOOSTING**

# **1. Import Libraries**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# **2. Scale the Data**

In [None]:
# Fit a fresh scaler on the training split only.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaling to both splits (results are plain arrays here).
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **3. Define the Parameter Grid**

In [None]:
# Candidate hyperparameter values for the Gradient Boosting grid search
# (4 * 3 * 3 * 3 * 3 = 324 combinations). This rebinds the name param_grid.
param_grid = dict(
    loss=['squared_error', 'absolute_error', 'huber', 'quantile'],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.05],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
)

# **4. Initialize and Perform Grid Search**

In [None]:
# Base estimator; the fixed random_state keeps individual fits repeatable.
model_gb = GradientBoostingRegressor(random_state=42)

# Search over every combination in param_grid with 5-fold cross-validation,
# scored by negative MSE.
grid_search_gb = GridSearchCV(
    estimator=model_gb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the scaled training data.
grid_search_gb.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


  _data = np.array(data, dtype=dtype, copy=copy,


# **5. Evaluate the Best Model**

In [None]:
# Best Gradient Boosting estimator found by the grid search.
best_model_gb = grid_search_gb.best_estimator_

# Predictions for the held-out test features.
y_pred_gb = best_model_gb.predict(X_test_scaled)

# Error metrics on the test set (RMSE is derived from MSE).
mae_gb = mean_absolute_error(y_test, y_pred_gb)
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Report the winning parameters first, then the metrics, one per line.
gb_report_lines = [
    f"Best Parameters: {grid_search_gb.best_params_}",
    f"Mean Absolute Error (MAE): {mae_gb:.3f}",
    f"Mean Squared Error (MSE): {mse_gb:.3f}",
    f"Root Mean Squared Error (RMSE): {rmse_gb:.3f}",
    f"R-squared (R²): {r2_gb:.3f}",
]
print("\n".join(gb_report_lines))

Best Parameters: {'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 300}
Mean Absolute Error (MAE): 0.127
Mean Squared Error (MSE): 0.058
Root Mean Squared Error (RMSE): 0.240
R-squared (R²): 0.922


# **6. Interpretation of Results**

In [None]:
# Plain-language summary of the Gradient Boosting test metrics and chosen parameters.
gb_best_params = grid_search_gb.best_params_

gb_summary_lines = [
    "\nInterpretation of Evaluation Results:",
    f"The best Gradient Boosting model was tuned with the following parameters: {gb_best_params}.",
    f"The Mean Squared Error (MSE) is {mse_gb:.3f}, indicating that on average, the predictions are off by {rmse_gb:.2f} units.",
    f"The Mean Absolute Error (MAE) is {mae_gb:.3f}, which shows the average absolute difference between actual and predicted dengue cases.",
    (
        f"The R-squared value (R²) is {r2_gb:.2f}, meaning that {r2_gb * 100:.1f}% of the variance in dengue cases is captured by this model. "
        "This suggests that the model is fairly effective in predicting dengue cases, though there's still room for improvement."
    ),
    "The Gradient Boosting model is relatively robust, and tuning parameters like learning rate, max depth, and n_estimators helps in capturing the nuances of the data.",
]

# Emit each paragraph on its own line, in order.
for summary_line in gb_summary_lines:
    print(summary_line)


Interpretation of Evaluation Results:
The best Gradient Boosting model was tuned with the following parameters: {'learning_rate': 0.1, 'loss': 'squared_error', 'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 300}.
The Mean Squared Error (MSE) is 0.058, indicating that on average, the predictions are off by 0.24 units.
The Mean Absolute Error (MAE) is 0.127, which shows the average absolute difference between actual and predicted dengue cases.
The R-squared value (R²) is 0.92, meaning that 92.2% of the variance in dengue cases is captured by this model. This suggests that the model is fairly effective in predicting dengue cases, though there's still room for improvement.
The Gradient Boosting model is relatively robust, and tuning parameters like learning rate, max depth, and n_estimators helps in capturing the nuances of the data.


In [None]:
import pickle

# The output path is defined once so the file written and the message agree.
gbr_model_path = 'gbr_model.pkl'

# Save the Gradient Boosting model to a .pkl file using pickle.
# The object written is best_model_gb (the tuned Gradient Boosting estimator);
# the cell previously dumped best_model, which is the Random Forest estimator
# from the earlier section.
# NOTE(review): the model was fitted on StandardScaler-transformed features and
# the scaler is not saved here; whoever loads this file needs the same scaling.
with open(gbr_model_path, 'wb') as file:
    pickle.dump(best_model_gb, file)

print(f"Gradient Boosting model saved to {gbr_model_path}")

Gradient Boosting model saved to gbr_model.pkl
