<h1><center> UFU - Federal University of Uberlândia</center></h1>

<h2><center>Undergraduate Program in Civil Engineering</center></h2>

<h3><center>SCIENTIFIC RESEARCH PROJECT</center><br>
TITLE: USING XGBOOST MODELS FOR DAILY RAINFALL PREDICTION 
<br>  
<br>  
STUDENT: Pedro Augusto Toledo Rios</h3>

<p>This notebook is part of a Scientific Research Project in the field of Computer Science/Data Analysis.</p>


# Classification


## Imports and Initial Configurations


In [None]:
# Data analysis and preprocessing
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical computations
import random as rnd  # Random number generation
import seaborn as sn  # Data visualization

# Visualization libraries
import seaborn as sns  # Statistical data visualization
import matplotlib.pyplot as plt  # General plotting
%matplotlib inline  

# Machine learning models
from sklearn.linear_model import LogisticRegression  # Logistic regression model
from sklearn.svm import SVC, LinearSVC  # Support Vector Classifier (SVC)
from sklearn.ensemble import RandomForestClassifier  # Random Forest classifier
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors (KNN)
from sklearn.naive_bayes import GaussianNB  # Naive Bayes classifier
from sklearn.linear_model import Perceptron  # Perceptron classifier
from sklearn.linear_model import SGDClassifier  # Stochastic Gradient Descent classifier
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier
from sklearn.model_selection import train_test_split  # Data splitting for training/testing
from sklearn.linear_model import LinearRegression  # Linear regression model
from sklearn import metrics  # Evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score  # Classification metrics
from sklearn.metrics import classification_report, confusion_matrix  # Model evaluation tools
import itertools  # Itertools for handling iterators
from sklearn import svm  # Support Vector Machines
from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes (repeated import)
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost classifier
from sklearn.metrics import ConfusionMatrixDisplay  # Confusion matrix visualization

# Gradient boosting models
import xgboost as xgb  # XGBoost classifier
import lightgbm as lgb  # LightGBM classifier

# Logistic regression (repeated import)
from sklearn.linear_model import LogisticRegression


# Exploratory Data Analysis


In [None]:
import pandas as pd  # Library for data manipulation

# Tokens that appear in the raw CSV and must be read as NaN (missing values):
# placeholder asterisks/dashes, malformed numbers, stray time stamps and
# spreadsheet error markers.
missing_values = [
    'n/a', 'na', '*****', '*', '*******', ' -', '******', '5..84', '3..66', '3.3.21',
    '1..41', '********', '3.7.94', '354.59*', '564..79', '5.04.24', '21:36', '**********',
    '***', '*********', '03:18', '00:00', '03:48', '08:42', '03:06', '09:06', '01:30',
    '07:48', '09:12', '10:18', '01:24', '#VALUE!', '926,4923,8', '27/07/1902**21:36:00',
    '-', '926.4923.8', '185.488.992', '4.535.416.667', '3.495.833.333', '2.015.833.333',
    '2.489.166.667', '4.745.416.667', '3.227.916.667', '3.594.166.667', '3.720.416.667',
    '06:12', '04:36', '06:48'
]

# Accepted atmospheric pressure range in millibars (both bounds inclusive);
# rows outside this range are discarded.
MIN_PRESSURE_MB = 870
MAX_PRESSURE_MB = 1100

# Load the dataset from the CSV file (semicolon-separated, no header row).
# NOTE(review): this is an absolute path on the author's machine; the notebook
# only runs elsewhere if the file is made available at the same location.
weather_data = pd.read_csv(
    'C:/Users/auped/Desktop/IC CORREÇÕES/python 05-11/pesquisa/dadosclimaatt - CORRETO.csv',
    header=None, sep=';', na_values=missing_values
)

# Rename columns for better readability
weather_data.columns = [
    'Max Temperature (°C)', 'Min Temperature (°C)', 'Avg Temperature (°C)', 'Wind Speed (m/s)',
    'Solar Radiation (cal/cm²/h)', 'Pressure (mb)', 'Relative Humidity (%)', 'Daily Rainfall (mm)',
    'Month', 'Year'
]

# Data cleaning and type conversion.
# Pressure and Year can contain a stray ',,' sequence: strip it literally
# (regex=False) and coerce anything still unparsable to NaN.
for column in ('Pressure (mb)', 'Year'):
    stripped = weather_data[column].astype(str).str.replace(',,', '', regex=False)
    weather_data[column] = pd.to_numeric(stripped, errors='coerce')

# Nullable integer dtype so that years are integers even while NaN is present
weather_data['Year'] = weather_data['Year'].astype('Int64')

for column in ('Relative Humidity (%)', 'Solar Radiation (cal/cm²/h)'):
    weather_data[column] = pd.to_numeric(weather_data[column], errors='coerce')

# Convert wind speed from km/h to m/s
weather_data['Wind Speed (m/s)'] = pd.to_numeric(weather_data['Wind Speed (m/s)'], errors='coerce') / 3.6

# Keep only rows whose pressure lies in the valid range (NaN pressure fails the
# test and is dropped too), then remove rows with any remaining NaN.
# The result is an explicit copy rather than an in-place edit of a filtered
# slice, so later column assignments do not raise SettingWithCopyWarning.
pressure_in_range = weather_data['Pressure (mb)'].between(MIN_PRESSURE_MB, MAX_PRESSURE_MB)
weather_data = weather_data[pressure_in_range].dropna().copy()

# Display the number of missing values in each column (all zero after dropna)
print(weather_data.isnull().sum())

# Display the first five rows of the processed dataset
print(weather_data.head())


### Summary statistics for each column in the DataFrame


In [None]:
# Summary statistics of the cleaned dataset, one column per variable
weather_data.describe()

### Creation of a new binary target column indicating whether it rained on a given day


In [None]:
# Binary target 'Rained?': 1 when the recorded daily rainfall is above zero, 0 otherwise
rainfall_mm = weather_data['Daily Rainfall (mm)']
weather_data['Rained?'] = rainfall_mm.gt(0).astype(int)

# Preview the first five rows, now including the new target column
print(weather_data.head())


In [None]:
# Handling Missing Values
# Forward-fill any gap with the previous row's value. The fill is done with
# DataFrame.ffill() and assigned back to the frame: fillna(method='ffill') is
# deprecated, and calling it with inplace=True on a single selected column is
# chained assignment, which does not update the parent frame under pandas
# Copy-on-Write.
columns_to_fill = [
    'Min Temperature (°C)',
    'Avg Temperature (°C)',
    'Wind Speed (m/s)',
    'Solar Radiation (cal/cm²/h)',
    'Relative Humidity (%)',
    'Daily Rainfall (mm)',
    'Pressure (mb)',
    'Rained?',
    'Max Temperature (°C)',
]
weather_data[columns_to_fill] = weather_data[columns_to_fill].ffill()

# Check for missing values after imputation
print('\nMissing Values in the dataframe after processing:\n', weather_data.isnull().sum(), sep="")


In [None]:
# Pearson correlation of every column with the 'Rained?' target, in ascending order
correlation_matrix = weather_data.corr(method='pearson')
columns_corr = correlation_matrix['Rained?'].sort_values()
print(columns_corr)


### Code snippets to filter the DataFrame for a specific period of interest.


In [None]:
# 70% for training / 30% for testing  

# Overall year bounds (not referenced by the other cells visible in this notebook)
start_year, end_year = 1980, 2020

# Year bounds of the training and testing periods
train_start_year, train_end_year = 1980, 2008
test_start_year, test_end_year = 2009, 2019

# Independent copy of the cleaned dataset; the modelling cells work on this
# copy so that weather_data itself is left untouched
test_data = weather_data.copy()


In [None]:
# Remove the raw rainfall amount: the 'Rained?' target is derived directly from
# it, so it must not be available to the model as a feature.
# The result is reassigned (no inplace) and a missing column is ignored, so
# re-running this cell does not raise a KeyError.
test_data = test_data.drop(columns=['Daily Rainfall (mm)'], errors='ignore')


# Machine Learning Models


## Creating Training and Testing DataFrames


In [None]:
# Create Training and Testing DataFrames
# Each partition applies BOTH of its year bounds in a single inclusive mask.
# Filtering test_data twice in separate assignments keeps only the last
# condition, which would put every training year inside the test frame.
train_mask = test_data['Year'].between(train_start_year, train_end_year)
test_mask = test_data['Year'].between(test_start_year, test_end_year)

# Explicit copies so later column assignments do not act on a slice of test_data
df_train = test_data[train_mask].copy()
df_test = test_data[test_mask].copy()

# Splitting features (X) and target variable (y)
# Features are selected by name: every column except the target and the raw
# rainfall amount the target is derived from.
target_column = 'Rained?'
excluded_columns = (target_column, 'Daily Rainfall (mm)')
feature_columns = [col for col in test_data.columns if col not in excluded_columns]

x_train = df_train[feature_columns]
y_train = df_train[[target_column]]

x = df_test[feature_columns]
y = df_test[[target_column]]



## Model - XGBoost to Determine Whether It Rained or Not


In [None]:
# Creating the XGBoost classifier object  
xgboost_model = xgb.XGBClassifier()  

# Training the XGBoost classifier  
trained_xgb_model = xgboost_model.fit(x, y)  

# Testing the model  
y_pred4 = trained_xgb_model.predict(x_train)


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

### Score

In [None]:
print('Precision: %.3f' % precision_score(y_train, y_pred4))
print('Recall: %.3f' % recall_score(y_train, y_pred4))
print('Accuracy: %.3f' % accuracy_score(y_train, y_pred4))


In [None]:
chuvaprevista = []
for z in range(len(y_pred4)):
    #print(y_pred4[z])
    chuvaprevista.append(y_pred4[z])
    
df_train['Previsão']= chuvaprevista
df_train.head()    

In [None]:
from xgboost import plot_importance
from matplotlib import pyplot  # Adicione esta linha de importação

plot_importance(xgboost_model)
pyplot.show()

# Confusion Matrices


## Model - XGBoost


In [None]:
# Compute the confusion matrix  
cm = confusion_matrix(y_train, y_pred4)  

# Define class labels (replace with specific labels if necessary)  
class_names = ['Negative Class', 'Positive Class']  

# Plot the confusion matrix using Seaborn  
plt.figure(figsize=(8, 6))  
sns.set(font_scale=1.2)  # Adjust font size  

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)  

plt.xlabel('Predicted Label')  
plt.ylabel('True Label')  
plt.show()
