<h1><center> UFU - Federal University of Uberlândia</center></h1>

<h2><center>Undergraduate Program in Civil Engineering</center></h2>

<h3><center>SCIENTIFIC RESEARCH PROJECT</center><br>
TITLE: USING XGBOOST MODELS FOR DAILY RAINFALL PREDICTION 
<br>  
<br>  
STUDENT: Pedro Augusto Toledo Rios</h3>

<p>This notebook is part of a Scientific Research Project in the field of Computer Science/Data Analysis.</p>


# Notebook for Daily Rainfall Amount (mm) Determination - Adjusted Global Dataset

## Imports and Initial Configurations

In [None]:
# Data Analysis and Wrangling
import pandas as pd
import numpy as np
import random

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
%matplotlib inline

# Machine Learning
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, LinearRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (precision_score, recall_score, f1_score, 
                           accuracy_score, classification_report, 
                           confusion_matrix, ConfusionMatrixDisplay,
                           mean_absolute_error, mean_squared_error, r2_score)

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (LSTM, Dense, Dropout, Bidirectional,
                                   SimpleRNN, Embedding, Masking)

# Display configuration
# Allow up to 150 characters of cell content before pandas truncates it
pd.set_option("display.max_colwidth", 150)
# Show 20 rows when a long dataframe is truncated for display
pd.set_option("display.min_rows", 20)

## Exploratory Data Analysis

In [None]:
# Data Preprocessing
# Tokens in the raw CSV that must be read as missing values (NaN):
# placeholder asterisks/dashes, malformed numbers, stray time stamps and
# spreadsheet error markers.
missing_values = ['n/a', 'na', '*****', '*', '*******', ' -', '******', '5..84', '3..66',
                  '3.3.21', '1..41', '********', '3.7.94', '354.59*', '564..79', '5.04.24',
                  '21:36', '**********', '***', '*********', '03:18', '00:00', '03:48',
                  '08:42', '03:06', '09:06', '01:30', '07:48', '09:12', '10:18', '01:24',
                  '#VALUE!', '926,4923,8', '27/07/1902**21:36:00', '-', '926.4923.8',
                  '185.488.992', '4.535.416.667', '3.495.833.333', '2.015.833.333',
                  '2.489.166.667', '4.745.416.667', '3.227.916.667', '3.594.166.667',
                  '3.720.416.667']

# Location of the raw data file (semicolon-separated, no header row).
# NOTE(review): absolute, machine-specific path - adjust it when running elsewhere.
DATA_PATH = 'C:/Users/auped/Desktop/IC CORREÇÕES/python 05-11/Titanic/modelo global/dadosclima_ajustado.csv'

# Load the dataset, mapping every token listed above to NaN
weather_raw = pd.read_csv(DATA_PATH, header=None, sep=';', na_values=missing_values)

# Column renaming
weather_raw.columns = ['Max Temp (°C)', 'Min Temp (°C)', 'Avg Temp (°C)',
                       'Wind Speed (km/h)', 'Solar Radiation (cal/cm²/h)',
                       'Pressure (mb)', 'Humidity (%)', 'Rainfall (mm)',
                       'Month', 'Year']

# Data cleaning operations
# ',,' is replaced as a literal token; regex=False makes that explicit and
# independent of the pandas version's default for `regex`.
weather_raw['Pressure (mb)'] = (
    weather_raw['Pressure (mb)'].str.replace(',,', '.', regex=False).astype(float)
)
weather_raw['Humidity (%)'] = weather_raw['Humidity (%)'].astype(float)
# NOTE(review): astype(int) raises if 'Year' still holds NaN at this point - confirm
# that the raw file has no missing years.
weather_raw['Year'] = weather_raw['Year'].str.replace(',,', '', regex=False).astype(int)

# Filter valid ranges (both bounds inclusive; rows with NaN in either column fail the test)
pressure_in_range = weather_raw['Pressure (mb)'].between(870, 1100)
radiation_in_range = weather_raw['Solar Radiation (cal/cm²/h)'].between(0, 1500)
weather_in_range = weather_raw[pressure_in_range & radiation_in_range]

# Rainfall series of the range-filtered data (taken before rows with NaN are dropped)
rainfall = weather_in_range['Rainfall (mm)']

# Drop every row that still has a missing value. The explicit copy makes
# weather_data an independent frame, so later cells can add columns to it
# without pandas' SettingWithCopyWarning.
weather_data = weather_in_range.dropna().copy()

# List wrapper used by later cells to iterate over the cleaned frame
combined_data = [weather_data]

# Show missing values count (all zeros after dropna)
print(weather_data.isnull().sum())

### Binary Rain-Occurrence Indicator for Daily Weather Data

In [None]:
# Add a 'Rainfall_Occurred' column to each frame: 0 where the rainfall amount
# is exactly zero, 1 where it is positive. Rows matching neither condition
# are not assigned.
for frame in combined_data:
    amount = frame['Rainfall (mm)']
    for flag, mask in ((0, amount == 0), (1, amount > 0)):
        frame.loc[mask, 'Rainfall_Occurred'] = flag

# Preview the first five rows
weather_data.head()

In [None]:
# Generate descriptive statistics for the cleaned dataset
weather_data.describe()

In [None]:
# Display missing values count per column (sep='' keeps the header and the table flush)
print('\nMissing Values in DataFrame:\n', weather_data.isnull().sum(), sep='')

In [None]:
# Data cleaning: re-apply the numeric dtypes, column by column and in place
dtype_by_column = {
    'Humidity (%)': float,
    'Pressure (mb)': float,
    'Year': int,
}
for column_name, target_dtype in dtype_by_column.items():
    weather_data[column_name] = weather_data[column_name].astype(target_dtype)

In [None]:
# Pearson correlation of every column with the rainfall amount, in ascending order
correlation_matrix = weather_data.corr(method='pearson')
rain_correlation = correlation_matrix['Rainfall (mm)'].sort_values()
print(rain_correlation)

In [None]:
# Frequency of each value of the rain indicator
rain_count = weather_data['Rainfall_Occurred'].value_counts()

# Rainy days are flagged 1, dry days 0; a flag that never occurs counts as 0
rainy_days, dry_days = (rain_count.get(flag, 0) for flag in (1, 0))

# Report both totals
print(f"Total rainy days: {rainy_days}")
print(f"Total dry days: {dry_days}")

## Time Period Selection for Analysis

In [None]:
# Overall analysis period (years)
start_year = 1980
end_year = 2020
# Training period (years); applied with inclusive bounds when the split is built
train_start_year = 1983
train_end_year = 2008
# Test period (years); applied with inclusive bounds when the split is built
test_start_year = 2009
test_end_year = 2019


In [None]:
# Remove, in place, every row whose rainfall amount is exactly zero
zero_rain_labels = weather_data.index[weather_data['Rainfall (mm)'] == 0]
weather_data.drop(index=zero_rain_labels, inplace=True)

In [None]:
# Work on an independent copy so the steps below do not modify weather_data
dadosparateste = weather_data.copy()

# Author's stated intent: create a new rainfall column placed at the end of the dataframe.
# NOTE(review): 'Rainfall (mm)' already exists, so re-assigning it keeps both its
# values and its position - confirm whether the column was meant to be moved.

qtdchuva = dadosparateste['Rainfall (mm)']


dadosparateste['Rainfall (mm)'] = qtdchuva

# Preview the first five rows of the table
dadosparateste.head()  

In [None]:
#dadosparateste.drop(columns = ['Rainfall (mm)'], axis = 1, inplace=True)


## Shared Utility Functions for All Models

In [None]:
# statsmodels library
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error

# Helper functions shared by all prediction models

# Print the error metrics of a prediction
def imprimir_scores_predicao(strModel, y_target, arPredict):
    """Print the regression error metrics of a prediction, one per line.

    The metrics are printed in this order: MAE, MAPE, RMSE, MSE and R2,
    each line prefixed with the model name.

    Parameters
    ----------
    strModel : str
        Model name shown at the start of every line.
    y_target : array-like
        Observed values.
    arPredict : array-like
        Predicted values, aligned with ``y_target``.
    """
    # (label, metric) pairs in print order; each metric is evaluated right
    # before its own line is printed.
    metrics_in_order = (
        (' - MAE:  ', mean_absolute_error),
        (' - MAPE: ', mean_absolute_percentage_error),
        (' - RMSE: ', lambda observed, predicted: np.sqrt(mean_squared_error(observed, predicted))),
        (' - MSE: ', mean_squared_error),
        (' - R2: ', r2_score),
    )
    for label, metric in metrics_in_order:
        print(strModel, label, metric(y_target, arPredict))
  
  
def imprimir_graficos_predicao(strModel, y_target, arPredict):
    """Plot observed and predicted rainfall as two lines on a shared x-axis.

    Parameters
    ----------
    strModel : str
        Model name. Accepted for signature parity with
        ``imprimir_scores_predicao``; it is not used in the figure.
    y_target : pandas.Series, pandas.DataFrame (single column) or array-like
        Observed rainfall. For a pandas object its index is the x-axis;
        otherwise the sample position (0, 1, 2, ...) is used.
    arPredict : array-like
        Predicted rainfall, one value per observation in ``y_target``.

    Raises
    ------
    ValueError
        If the observed values, the predictions and the x-axis do not all
        have the same length.
    """
    # Flatten both inputs so a single-column dataframe and a 1-D array are
    # handled the same way.
    observed = np.asarray(y_target).ravel()
    predicted = np.asarray(arPredict).ravel()

    # The x-axis comes from the target's own index. The previous
    # y_target.loc['Dia'] looked up a *row* labelled 'Dia' and raised
    # KeyError on targets that have no such row label.
    if isinstance(y_target, (pd.Series, pd.DataFrame)):
        x_axis = y_target.index
    else:
        x_axis = np.arange(observed.size)

    if not (len(x_axis) == observed.size == predicted.size):
        raise ValueError(
            'y_target and arPredict must hold the same number of values '
            f'(x-axis: {len(x_axis)}, observed: {observed.size}, predicted: {predicted.size})'
        )

    # Plot the test data and the predicted data
    plt.figure(figsize=(16, 8))
    plt.plot(x_axis, observed, color='blue', label='Chuva Real')
    plt.plot(x_axis, predicted, color='red', label='Chuva Prevista')
    font = {'family': 'serif',
            'color': 'black',
            'weight': 'normal',
            'size': 13.5,
            }
    plt.title('Comparação entre o real e o previsto pelo modelo', fontdict=font)
    plt.xlabel('Período')
    plt.ylabel('Quantidade de chuva')
    plt.legend()
    plt.grid(True)
    plt.show()

# Machine Learning Models - Regression

In [None]:
# Build the training and test dataframes from the configured year ranges
# (both bounds inclusive). .copy() makes each one an independent frame, so
# columns can be added to them later without pandas' SettingWithCopyWarning.
year_column = dadosparateste['Year']
df_train = dadosparateste[year_column.between(train_start_year, train_end_year)].copy()
df_test = dadosparateste[year_column.between(test_start_year, test_end_year)].copy()

# Separate the independent variables (X) from the target variable (y).
# The predictors are the first seven columns of the dataframe; the target is
# kept as a single-column dataframe.
X_train = df_train.iloc[:, 0:7]
y_train = df_train[['Rainfall (mm)']]

X_test = df_test.iloc[:, 0:7]
y_test = df_test[['Rainfall (mm)']]

# Preview the first rows of the training predictors
X_train.head()

# XGBOOST 

In [None]:
import xgboost as xgb

In [None]:
# XGBoost regressor for the daily rainfall amount.
# - 'reg:squarederror' is the current name of the squared-error objective;
#   the former alias 'reg:linear' is deprecated.
# - reg_alpha is the scikit-learn-wrapper name of the L1 regularisation term
#   (xgboost's `alpha`).
# NOTE(review): max_depth=100 combined with n_estimators=100000 is very large and
# makes training slow - confirm that these values are intentional.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=100, reg_alpha=1, n_estimators=100000)

In [None]:
# Train the regressor on the training predictors and target
xg_reg.fit(X_train,y_train)

# Predict the rainfall amount for the test predictors
preds = xg_reg.predict(X_test)

In [None]:
# Print the error metrics of the XGBoost predictions against the observed test values
imprimir_scores_predicao("XGBoost", y_test, preds)


In [None]:
# Summary statistics of the observed test target, for comparison with the error metrics
y_test.describe()

In [None]:
# Predicted rainfall as a plain list (one entry per test row)
chuvaprevista = list(preds)

# Attach the predictions to the test dataframe. assign() returns a new frame
# instead of writing into a slice of dadosparateste, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_test = df_test.assign(**{'Previsão': chuvaprevista})
df_test.head()

In [None]:
# Export the test dataframe to an Excel workbook (relative path, so it lands in the working directory)
df_test.to_excel("regressaoseco.xlsx")

In [None]:
# Display the test dataframe
df_test

In [None]:

# Keep only the rows inside the overall analysis period (both bounds inclusive).
# The bounds come from the configuration cell rather than repeated literals,
# so the period is defined in a single place.
df_test = df_test[df_test['Year'].between(start_year, end_year)]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Keep only rows whose observed and predicted values are both strictly
# positive, to avoid errors on the log-scaled axes below
positive_rows = (df_test['Rainfall (mm)'] > 0) & (df_test['Previsão'] > 0)
df_test = df_test[positive_rows]

# Plot only when rows remain after the filter; otherwise report the problem
if not df_test.empty:
    true_value = df_test['Rainfall (mm)']
    predicted_value = df_test['Previsão']

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(true_value, predicted_value, c='crimson')
    ax.set_yscale('log')
    ax.set_xscale('log')

    # Diagonal spanning the overall maximum and minimum of both series
    p1 = max(predicted_value.max(), true_value.max())
    p2 = min(predicted_value.min(), true_value.min())
    ax.plot([p1, p2], [p1, p2], 'b-')

    ax.set_xlabel('True Values', fontsize=15)
    ax.set_ylabel('Predictions', fontsize=15)
    ax.axis('equal')
    plt.show()
else:
    print("Erro: df_test está vazio após remover valores não positivos. Verifique os dados.")


In [None]:
from xgboost import plot_importance

# Plot the feature importance of the fitted regressor.
# matplotlib.pyplot is imported in this notebook as `plt`; the bare name
# `pyplot` was never bound, so the previous pyplot.show() raised NameError.
plot_importance(xg_reg)
plt.show()