<h1><center> UFU - Federal University of Uberlândia</center></h1>

<h2><center>Undergraduate Program in Civil Engineering</center></h2>

<h3><center>SCIENTIFIC RESEARCH PROJECT</center><br>
TITLE: USING XGBOOST MODELS FOR DAILY RAINFALL PREDICTION 
<br>  
<br>  
STUDENT: Pedro Augusto Toledo Rios</h3>

<p>This notebook is part of a Scientific Research Project in the field of Computer Science/Data Analysis.</p>


# Classification

## Imports and Initial Configurations

In [None]:
# Data Analysis and Preprocessing
import pandas as pd  # Library for data manipulation and analysis
import numpy as np  # Library for numerical computing
import random as rnd  # Library for generating random numbers
import seaborn as sns  # Library for statistical data visualization

# Data Visualization
import matplotlib.pyplot as plt  # Library for creating static, animated, and interactive visualizations
%matplotlib inline  

# Machine Learning Libraries
from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.svm import SVC, LinearSVC  # Support Vector Machine (SVM) classifiers
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors (k-NN) classifier
from sklearn.naive_bayes import GaussianNB  # Gaussian Naïve Bayes classifier
from sklearn.linear_model import Perceptron  # Perceptron classifier (basic neural network)
from sklearn.linear_model import SGDClassifier  # Stochastic Gradient Descent classifier
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier
from sklearn.model_selection import train_test_split  # Function to split data into training and testing sets
from sklearn.linear_model import LinearRegression  # Linear Regression model
from sklearn import metrics  # Evaluation metrics for model performance
from sklearn.metrics import (precision_score, recall_score, f1_score, 
                             accuracy_score, classification_report, confusion_matrix)  # Various classification metrics
import itertools  # Library for advanced iteration functions
from sklearn import svm  # Support Vector Machine models
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost ensemble classifier
from sklearn.metrics import ConfusionMatrixDisplay  # Tool for visualizing confusion matrices
import xgboost as xgb  # Extreme Gradient Boosting (XGBoost) classifier
import lightgbm as lgb  # Light Gradient Boosting Machine (LightGBM) classifier


# Exploratory Data Analysis

In [None]:
# Data Preprocessing and Cleaning

# Location of the raw CSV export.
# NOTE(review): this is an absolute, machine-specific path — point it at a
# project-relative data directory so the notebook can be run elsewhere.
DATA_PATH = 'C:/Users/auped/Desktop/IC CORREÇÕES/python 05-11/pesquisa/dadosclimaatt.csv'

# Accepted range for atmospheric pressure readings (mb); rows outside it are discarded
PRESSURE_MIN_MB = 870
PRESSURE_MAX_MB = 1100

# Tokens found in the raw file that must be read as NaN
# (placeholders, malformed numbers, stray time stamps and spreadsheet errors)
missing_values = [
    'n/a', 'na', '*****', '*', '*******', ' -', '******',
    '5..84', '3..66', '3.3.21', '1..41', '********', '3.7.94',
    '354.59*', '564..79', '5.04.24', '21:36', '**********', '***',
    '*********', '03:18', '00:00', '03:48', '08:42', '03:06',
    '09:06', '01:30', '07:48', '09:12', '10:18', '01:24', '#VALUE!',
    '926,4923,8', '27/07/1902**21:36:00', '-'
]

# Loading the dataset: the file has no header row, uses ';' as delimiter,
# and every token listed above is replaced with NaN while parsing
climate_data = pd.read_csv(DATA_PATH, header=None, sep=';', na_values=missing_values)

# Assigning column names to the dataset for better interpretability
climate_data.columns = [
    'Max Temp (°C)', 'Min Temp (°C)', 'Avg Temp (°C)', 'Wind Speed (km/h)',
    'Solar Radiation (cal/cm²/h)', 'Pressure (mb)', 'Humidity (%)',
    'Precipitation (mm)', 'Day', 'Month', 'Year', 'Date', 'Date2'
]

# Repairing malformed separators and converting the columns to numeric types.
# `regex=False` makes the replacements literal regardless of the pandas version.
climate_data['Humidity (%)'] = (
    climate_data['Humidity (%)'].str.replace(':', '.', regex=False).astype(float)
)
climate_data['Pressure (mb)'] = (
    climate_data['Pressure (mb)'].str.replace(',,', '.', regex=False).astype(float)
)
climate_data['Year'] = (
    climate_data['Year'].str.replace(',,', '', regex=False).astype(int)
)

# Keeping only rows whose pressure lies inside the accepted range (bounds inclusive).
# Rows with a missing pressure fail the comparison and are removed as well.
# `.copy()` makes the result an independent frame, so columns added in later cells
# do not trigger SettingWithCopyWarning.
has_valid_pressure = climate_data['Pressure (mb)'].between(PRESSURE_MIN_MB, PRESSURE_MAX_MB)
climate_data = climate_data[has_valid_pressure].copy()

# Storing the cleaned dataset in a list for possible further transformations
combine = [climate_data]

# Extracting precipitation data for further analysis
precipitation = climate_data['Precipitation (mm)']

# Missing values are deliberately kept at this stage: they are imputed with
# forward fill in a later cell. Printing how many remain in each column.
print(climate_data.isnull().sum())

# Displaying the first five rows of the cleaned dataset
# (last expression of the cell, so the table is actually rendered)
climate_data.head()


In [None]:
# Statistical Summary of the Dataset

# `describe()` summarises the numerical columns of the dataset: count, mean,
# standard deviation, minimum, quartiles and maximum.
# Being the last expression of the cell, the resulting table is rendered as the cell output.
climate_data.describe()


# Creating a Binary Indicator for Rain Occurrence

In [None]:
# Creating a Binary Indicator for Rain Occurrence

# 'Rained?' is 1.0 when a positive amount of precipitation was recorded and 0.0 otherwise.
# Days with a missing precipitation reading stay NaN, so the imputation step handles them
# explicitly instead of silently labelling them as "no rain".
# The indicator is computed in one vectorised expression straight from the
# 'Precipitation (mm)' column; this avoids re-binding `climate_data` as a loop variable
# and guarantees the column only ever holds 0.0, 1.0 or NaN.
daily_precipitation = climate_data['Precipitation (mm)']
climate_data['Rained?'] = (
    (daily_precipitation > 0)
    .astype(float)
    .where(daily_precipitation.notna())
)

# Displaying the first five rows of the updated dataset
climate_data.head()


In [None]:
# Handling Missing Values (Data Imputation)

# Forward fill (ffill) propagates the last valid observation forward, so every gap is
# replaced with the most recent valid value of the same column.
# Gaps located before the first valid observation of a column cannot be filled this way
# and remain NaN.
# NOTE(review): 'Precipitation (mm)' and 'Rained?' (the target) are imputed as well —
# confirm that carrying the previous day's rain status forward is intended.
columns_to_fill = [
    'Min Temp (°C)', 'Avg Temp (°C)', 'Wind Speed (km/h)',
    'Solar Radiation (cal/cm²/h)', 'Humidity (%)', 'Precipitation (mm)',
    'Pressure (mb)', 'Rained?', 'Max Temp (°C)',
]

# Assigning the filled columns back (instead of `fillna(..., inplace=True)` on each
# selected column) works the same way on every pandas version, including with
# Copy-on-Write enabled, and avoids the deprecated `method=` argument.
climate_data[columns_to_fill] = climate_data[columns_to_fill].ffill()

# Checking for missing values after the imputation process
print('\nMissing Values in the dataset after imputation:\n', climate_data.isnull().sum(), sep="")


In [None]:
# Splitting Data into Training and Testing Periods (by year)
# NOTE(review): the ranges below give 11 training years (2009-2019) versus 29 testing
# years (1980-2008), which is not a 70% / 30% train/test split — confirm the intended
# proportions. The year 2020 lies inside start_year..end_year but in neither period.

# Overall span of years considered
start_year = 1980
end_year = 2020
# Training period (first and last year)
training_start_year = 2009
training_end_year = 2019
# Testing period (first and last year)
testing_start_year = 1980
testing_end_year = 2008

# Creating an independent copy of the dataset; despite its name, this frame is the
# starting point for the modelling data in the following cells
climate_test_data = climate_data.copy()


In [None]:
# Removing the 'Precipitation (mm)' column from the modelling dataset

# 'Rained?' is derived directly from the precipitation amount, so keeping that column
# among the features would leak the target into the model.
# The frame is rebuilt from `climate_data` rather than dropped in place: the cell is then
# idempotent (re-running it does not raise KeyError for an already-removed column) and
# `climate_data` itself is left untouched.
climate_test_data = climate_data.drop(columns=['Precipitation (mm)'])


# Machine Learning Models

## Creating Training and Testing DataFrames

In [None]:
# Creating Training and Testing DataFrames

# Feature columns fed to the model. They are selected by name (rather than by position)
# so the feature set cannot silently change if the column layout does.
feature_columns = [
    'Max Temp (°C)', 'Min Temp (°C)', 'Avg Temp (°C)', 'Wind Speed (km/h)',
    'Solar Radiation (cal/cm²/h)', 'Pressure (mb)', 'Humidity (%)',
    'Day', 'Month', 'Year',
]

# Each period is selected with a single mask that applies BOTH bounds (inclusive).
# Filtering in two separate assignments would keep only the second condition, because
# the second assignment overwrites the result of the first.
in_training_period = climate_test_data['Year'].between(training_start_year, training_end_year)
in_testing_period = climate_test_data['Year'].between(testing_start_year, testing_end_year)

# `.copy()` yields independent frames, so adding columns to them later is safe
df_train = climate_test_data[in_training_period].copy()
df_test = climate_test_data[in_testing_period].copy()

# Splitting features (X) and target variable (Y)
x_train = df_train[feature_columns]  # Training features
y_train = df_train[['Rained?']]      # Target variable: Did it rain?

x = df_test[feature_columns]  # Test features
y = df_test[['Rained?']]      # Target variable for testing

# Displaying the first five rows of the test feature set
x.head()


## Model - XGBoost to Determine Whether It Rained or Not


In [None]:
# Creating an XGBoost Classifier instance; no arguments are passed, so the library's
# default hyperparameters are used
xgboost_classifier = xgb.XGBClassifier()

# Training the XGBoost Classifier using the training dataset.
# `xgb_model` holds whatever `fit` returns — presumably the fitted estimator itself; verify.
xgb_model = xgboost_classifier.fit(x_train, y_train)

# Making predictions on the training dataset.
# NOTE(review): these are in-sample predictions (same rows the model was fitted on);
# the held-out frames `x` / `y` are not used in this cell.
y_pred4 = xgb_model.predict(x_train)


In [None]:
# Importing necessary libraries

# Datasets module: Provides sample datasets for testing and experimentation
from sklearn import datasets

# Splitting module: Used to divide data into training and testing subsets
from sklearn.model_selection import train_test_split

# Logistic Regression: A commonly used statistical model for binary classification
from sklearn.linear_model import LogisticRegression

# Precision-Recall Curve: Used to evaluate model performance in imbalanced datasets
from sklearn.metrics import precision_recall_curve

# Visualization library
import matplotlib.pyplot as plt


### Score

In [None]:
# Evaluating the model's performance using classification metrics.
# Both arguments refer to the training data (`y_train` and the predictions made for
# `x_train`), so the figures printed below are training-set scores.
#
# Precision: true positives / (true positives + false positives) — the share of
#            predicted positives that are actually positive.
# Accuracy:  correctly classified instances / all instances.
# Recall:    true positives / (true positives + false negatives) — the share of
#            actual positives that the model identified (sensitivity).
for report_template, metric_function in (
    ('Precision: %.3f', precision_score),
    ('Accuracy: %.3f', accuracy_score),
    ('Recall: %.3f', recall_score),
):
    # Each metric is computed and printed in turn, rounded to three decimal places
    print(report_template % metric_function(y_train, y_pred4))


In [None]:
# Storing the predicted rain indicator of every training row in a plain list
# (one element per prediction, in the same order as the rows of the training set)
predicted_rainfall = list(y_pred4)

# Adding the predictions as a new 'Prediction' column of the training DataFrame.
# `assign` returns a new frame instead of writing into the existing one, which avoids
# SettingWithCopyWarning when `df_train` is a slice of a larger frame and keeps the cell
# safe to re-run.
df_train = df_train.assign(Prediction=predicted_rainfall)

# Displaying the first five rows of the updated DataFrame
df_train.head()


# Confusion Matrices

## Model - XGBoost


In [None]:
# Generating the confusion matrix to evaluate classification performance.
# Both inputs refer to the training set. `labels=[0, 1]` pins the matrix to 2x2 with
# rows/columns ordered as (0, 1), so it always lines up with the two tick labels below,
# even if one of the classes happens to be absent.
cm = confusion_matrix(y_train, y_pred4, labels=[0, 1])

# Defining class labels in the same order as `labels` above:
# first the negative class (0), then the positive class (1)
class_names = ['Negative Class', 'Positive Class']

# Setting the font scale for better readability (applies seaborn's theme globally)
sns.set(font_scale=1.2)

# Creating the figure and an explicit Axes object to draw on
fig, ax = plt.subplots(figsize=(8, 6))

# Creating a heatmap visualization of the confusion matrix:
# rows are the true labels, columns the predicted labels, cells show integer counts
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=class_names, yticklabels=class_names, ax=ax,
)

# Setting axis labels and a title so the figure can be read on its own
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('XGBoost Confusion Matrix (Training Set)')

# Displaying the plot
plt.show()
