# 0.0 Preparation

In [2]:
pip install pandas --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install graphviz


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install pymc


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 0.1 Imports

In [6]:
# Import Pandas 
import pandas as pd

## Set view to two decimal points only 
pd.set_option('float_format', '{:2f}' .format) 

# Import NumPy
import numpy as np

# Import from statsmodel
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA

# Import pyMC
import pymc as pm


# 1.0 Setting Roots and Functions

## 1.1 Roots

In [47]:
user_root = "/Users/xanvelaa/Downloads/"

## 1.1 Preprocessing

In [17]:
def preprocess_tourist_dataset (dataset):
    
    preprocessed_dataset = (
        dataset
        .assign(
            month = lambda x: x["date"].dt.month,
            year = lambda x: x["date"].dt.year
            )
            [["month", "year", "tourist_count"]]
            .pivot_table(
                values='tourist_count',  
                index='month',        
                columns='year',      
                )
                .reset_index()
    )

    return preprocessed_dataset
    

In [18]:
def preprocess_weather_dataset (dataset):
    
    preprocessed_dataset = (
        dataset
        [["date", "days_rained", "days_cloudy", "days_sunny"]]
        .assign(
            month = lambda x: x["date"].dt.month,
            year = lambda x: x["date"].dt.year
        )
        [["month", "year", "days_rained", "days_cloudy", "days_sunny"]]
        .sort_values(by=["month", "year"], ignore_index=True)

    )

    return preprocessed_dataset
    

## 1.2 Modeling

In [30]:
def arima_forecast_25 (dataset):
    # Initialize dictionary to store forecasts
    forecasts = {}

    # Loop through each month and fit an ARIMA model
    for i, month in enumerate(dataset["month"]):
        # Get the time series for the current month (2022, 2023, 2024 values)
        ts = dataset.iloc[i, 1:].values  

        # Fit ARIMA model (p, d, q can be tuned)
        model = ARIMA(ts, order=(1, 1, 1))  
        model_fit = model.fit()
        
        # Forecast 1 step ahead (2025 value)
        forecast = model_fit.forecast(steps=1)[0]
        
        # Store the forecasted value
        forecasts[month] = forecast

    # Convert forecasts into a DataFrame
    forecast_df = (
        pd.DataFrame(

            list(forecasts.items()), 
            columns=["month", "Tourists"]
            )
            .assign(tourists = lambda x: x["Tourists"].astype(int))
            .assign(year = 2025)
            [["month", "year", "tourists"]]
            )

    return forecast_df

In [20]:
def dirichlet_multinomial_25 (dataset): 

    # Store predictions
    predictions = []

    # Get unique months in the dataset
    months = dataset["month"].unique()

    # Bayesian model loop for each month
    for month in months:
        # Filter data for the current month
        filtered_dataset = dataset.query(f"month == {month}")
        days_in_this_month = filtered_dataset.iloc[0][["days_rained", "days_cloudy", "days_sunny"]].sum()

    
        # Get observed counts and years
        observed_counts = filtered_dataset[["days_rained", "days_cloudy", "days_sunny"]].values
        years = filtered_dataset["year"].values
        k = observed_counts.shape[1]  # Number of weather categories

        # Define coordinates for PyMC
        coords = {"year": years, "weather": ["rainy", "cloudy", "sunny"]}

        with pm.Model(coords=coords) as model_dm:
            # Dirichlet prior for weather proportions
            frac = pm.Dirichlet("frac", a=np.ones(k), dims="weather")

            # Lognormal prior for concentration parameter
            conc = pm.Lognormal("conc", mu=1, sigma=1)
            
            # Dirichlet-Multinomial probabilities per year
            p = pm.Dirichlet("p", a=frac * conc, dims=("year", "weather"))
            
            # Multinomial likelihood
            counts = pm.Multinomial(
                "counts", n=days_in_this_month, p=p, observed=observed_counts, dims=("year", "weather")
            )

            # Sample from posterior
            trace = pm.sample(1000, tune=500, chains=2, return_inferencedata=True, cores=1)

        # Predict for 2025
        with model_dm:
            future_posterior = pm.sample_posterior_predictive(trace, var_names=["p"])

        # Extract predicted probabilities for 2025
        pred_p = future_posterior.posterior_predictive["p"].mean(dim=["chain", "draw"]).sel(year=2024)

        # Convert probabilities to predicted counts for a 31-day month
        pred_counts = (pred_p * days_in_this_month).round().astype(int)

        # Store results
        predictions.append({
            "month": month,
            "year": 2025,
            "days_rained": pred_counts.sel(weather="rainy").values.item(),
            "days_cloudy": pred_counts.sel(weather="cloudy").values.item(),
            "days_sunny": pred_counts.sel(weather="sunny").values.item()
        })

    # Turn predictions into a dataframe 
    weather_predictions = pd.DataFrame(predictions)
    return weather_predictions

## 1.3 Merging

In [74]:
def merge_data_for_llm (tourist_df, weather_df, loc):
    llm_dataset = (
        pd
        .merge(
            weather_df,
            tourist_df,
            on = ["year", "month"],
            how = "outer")
        .query("year == 2025")
        .assign(location = loc)

    )

    return llm_dataset

# 2.0 Preprocessing + Modeling Datasets

## 2.1 Boracay

### 2.1.1 Tourists

In [None]:
# Read Boracay tourist dataset
boracay_df = pd.read_excel((user_root + "Boracay_Weather_Tourist.xlsx"), sheet_name = "tourist")

# Preprocess Boracay tourist dataset
boracay_tourist_preprocessed = preprocess_tourist_dataset(boracay_df)

# Forecast number of tourists for 2025 
boracay_tourists_forecasted = arima_forecast_25(boracay_tourist_preprocessed)
boracay_tourists_forecasted

In [37]:
boracay_tourists_forecasted

Unnamed: 0,month,year,tourists
0,1,2025,146323
1,2,2025,161727
2,3,2025,192503
3,4,2025,193003
4,5,2025,206013
5,6,2025,190545
6,7,2025,167744
7,8,2025,124315
8,9,2025,156059
9,10,2025,175847


### 2.1.2 Weather

In [None]:
# Read Boracay weather dataset
weather_df = pd.read_excel((user_root + "Boracay_Weather_Tourist.xlsx"), sheet_name = "weather")

# Preprocess Boracay weather dataset
boracay_weather_preprocessed = preprocess_weather_dataset(weather_df)

# Forecast weather for 2025
boracay_weather_forecasted = dirichlet_multinomial_25(boracay_weather_preprocessed)

In [50]:
boracay_weather_forecasted

Unnamed: 0,month,year,days_rained,days_cloudy,days_sunny
0,1,2025,17,8,6
1,2,2025,13,9,6
2,3,2025,7,14,10
3,4,2025,9,12,10
4,5,2025,11,11,9
5,6,2025,18,8,4
6,7,2025,20,8,3
7,8,2025,18,9,4
8,9,2025,19,8,3
9,10,2025,18,10,3


### 2.1.3 Merge DFs 

In [78]:
boracay_predictions_25 = merge_data_for_llm(
    boracay_tourists_forecasted,
    boracay_weather_forecasted,
    "Boracay")

boracay_predictions_25

Unnamed: 0,month,year,days_rained,days_cloudy,days_sunny,tourists,location
0,1,2025,17,8,6,146323,Boracay
1,2,2025,13,9,6,161727,Boracay
2,3,2025,7,14,10,192503,Boracay
3,4,2025,9,12,10,193003,Boracay
4,5,2025,11,11,9,206013,Boracay
5,6,2025,18,8,4,190545,Boracay
6,7,2025,20,8,3,167744,Boracay
7,8,2025,18,9,4,124315,Boracay
8,9,2025,19,8,3,156059,Boracay
9,10,2025,18,10,3,175847,Boracay


## 2.2 Bohol

### 2.2.1 Tourists

In [None]:
# Read Bohol tourist dataset
bohol_df = pd.read_excel((user_root + "Bohol_Weather_Tourist.xlsx"), sheet_name = "tourist")

# Preprocess Bohol tourist dataset
bohol_tourist_preprocessed = preprocess_tourist_dataset(bohol_df)

# Forecast number of tourists for 2025 
bohol_tourists_forecasted = arima_forecast_25(bohol_tourist_preprocessed)

In [59]:
bohol_tourists_forecasted

Unnamed: 0,month,year,tourists
0,1,2025,129658
1,2,2025,140046
2,3,2025,115024
3,4,2025,111723
4,5,2025,137137
5,6,2025,126981
6,7,2025,144690
7,8,2025,140216
8,9,2025,120277
9,10,2025,127919


### 2.2.2 Weather

In [None]:
# Read Bohol weather dataset
bohol_weather_df = pd.read_excel((user_root + "Bohol_Weather_Tourist.xlsx"), sheet_name = "weather")

# Preprocess Bohol weather dataset
bohol_weather_preprocessed = preprocess_weather_dataset(bohol_weather_df)

# Forecast weather for 2025
bohol_weather_forecasted = dirichlet_multinomial_25(bohol_weather_preprocessed)

In [61]:
bohol_weather_forecasted

Unnamed: 0,month,year,days_rained,days_cloudy,days_sunny
0,1,2025,21,8,2
1,2,2025,16,10,3
2,3,2025,20,8,3
3,4,2025,17,9,3
4,5,2025,15,12,4
5,6,2025,15,12,3
6,7,2025,14,14,4
7,8,2025,11,16,4
8,9,2025,12,14,4
9,10,2025,19,10,2


### 2.2.3 Merge DFs

In [77]:
bohol_predictions_25 = merge_data_for_llm(
    bohol_tourists_forecasted,
    bohol_weather_forecasted,
    "Bohol")

bohol_predictions_25

Unnamed: 0,month,year,days_rained,days_cloudy,days_sunny,tourists,location
0,1,2025,21,8,2,129658,Bohol
1,2,2025,16,10,3,140046,Bohol
2,3,2025,20,8,3,115024,Bohol
3,4,2025,17,9,3,111723,Bohol
4,5,2025,15,12,4,137137,Bohol
5,6,2025,15,12,3,126981,Bohol
6,7,2025,14,14,4,144690,Bohol
7,8,2025,11,16,4,140216,Bohol
8,9,2025,12,14,4,120277,Bohol
9,10,2025,19,10,2,127919,Bohol


## 2.3 Cebu

### 2.3.1 Tourists

In [69]:
# Read Cebu tourist dataset
cebu_df = pd.read_excel((user_root + "Cebu_Weather_Tourist.xlsx"), sheet_name = "tourist")

# Preprocess Bohol tourist dataset
cebu_tourist_preprocessed = preprocess_tourist_dataset(cebu_df)

# Forecast number of tourists for 2025 
cebu_tourists_forecasted = arima_forecast_25(cebu_tourist_preprocessed)

  warn('Too few observations to estimate starting parameters%s.'


In [70]:
cebu_tourists_forecasted

Unnamed: 0,month,year,tourists
0,1,2025,490812
1,2,2025,394330
2,3,2025,335634
3,4,2025,302281
4,5,2025,337122
5,6,2025,330585
6,7,2025,378859
7,8,2025,362876
8,9,2025,328300
9,10,2025,362762


### 2.3.2 Weather

In [None]:
# Read Cebu weather dataset
cebu_weather_df = pd.read_excel((user_root + "Cebu_Weather_Tourist.xlsx"), sheet_name = "weather")

# Preprocess Bohol weather dataset
cebu_weather_preprocessed = preprocess_weather_dataset(cebu_weather_df)

# Forecast weather for 2025
cebu_weather_forecasted = dirichlet_multinomial_25(cebu_weather_preprocessed)

In [72]:
cebu_weather_forecasted

Unnamed: 0,month,year,days_rained,days_cloudy,days_sunny
0,1,2025,17,9,5
1,2,2025,13,10,5
2,3,2025,11,11,9
3,4,2025,13,10,6
4,5,2025,14,11,6
5,6,2025,16,11,3
6,7,2025,20,9,2
7,8,2025,18,9,3
8,9,2025,19,9,2
9,10,2025,22,7,2


### 2.2.3 Merge DFs

In [80]:
cebu_predictions_25 = merge_data_for_llm(
    cebu_tourists_forecasted,
    cebu_weather_forecasted,
    "Cebu")

cebu_predictions_25

Unnamed: 0,month,year,days_rained,days_cloudy,days_sunny,tourists,location
0,1,2025,17,9,5,490812,Cebu
1,2,2025,13,10,5,394330,Cebu
2,3,2025,11,11,9,335634,Cebu
3,4,2025,13,10,6,302281,Cebu
4,5,2025,14,11,6,337122,Cebu
5,6,2025,16,11,3,330585,Cebu
6,7,2025,20,9,2,378859,Cebu
7,8,2025,18,9,3,362876,Cebu
8,9,2025,19,9,2,328300,Cebu
9,10,2025,22,7,2,362762,Cebu


# 3.0 Append Datasets for LLM Fine Tuning

In [85]:
datasets = [
    boracay_predictions_25,
    bohol_predictions_25,
    cebu_predictions_25
]

column_list = (
    boracay_predictions_25
    .columns[1:5]
    .insert(0, "location")
)

train_data_updated = (
    pd.concat(
        datasets,
        axis = 0,
        ignore_index = True
    )
    [column_list]

)

In [86]:
train_data_updated.to_csv("train_data_updated.csv")

In [None]:
train_data.to_csv("train_data.csv")