In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

#### TODO: Explain what one-hot encoding and a random forest regressor are, and why they are a good fit for this problem

In [19]:
# ---------------------------------------------------------------------------
# Load the cleaned visitor and climate tables, join them per mountain/week,
# then fit and score a random forest that predicts weekly visitor counts.
# Later cells rely on `encoder`, `model` and `X_processed` defined here.
# ---------------------------------------------------------------------------
weather_cols = ['max_temp', 'min_temp', 'rain']
climate_keys = ['mount_id', 'Year', 'week']

visitor_data = pd.read_csv("cleaned_visit_data.csv")
climate_data = pd.read_csv("cleaned_climate_data.csv")

# Wide -> long: every column other than Year/Week becomes a (mount_id, visitors)
# row. The former column headers are cast to int so they can be matched against
# the climate table's mount_id.
visitor_long = (
    visitor_data
    .melt(id_vars=['Year', 'Week'], var_name='mount_id', value_name='visitors')
    .assign(mount_id=lambda frame: frame['mount_id'].astype(int))
)

# One row per (mount_id, Year, week) holding the mean of each weather column.
climate_agg = climate_data.groupby(climate_keys)[weather_cols].mean().reset_index()

# Inner join keeps only mountain/weeks present in both tables. The visitor table
# spells the week key 'Week' and the climate table 'week', so the right-hand
# copy is dropped after the join.
merged_data = visitor_long.merge(
    climate_agg,
    left_on=['mount_id', 'Year', 'Week'],
    right_on=climate_keys,
    how='inner',
).drop(columns=['week'])

# Report (but do not alter) the number of missing cells in the joined table.
missing_total = merged_data.isna().sum().sum()
print(f"Missing values in merged data: {missing_total}")

# Features: mountain id plus weather. Target: visitor count.
X = merged_data[['mount_id'] + weather_cols]
y = merged_data['visitors']

# One-hot encode the mountain id. handle_unknown='ignore' makes an id that was
# not seen during fitting encode as an all-zero row instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
mount_encoded = encoder.fit_transform(X[['mount_id']])
mount_encoded_df = pd.DataFrame(
    mount_encoded,
    columns=encoder.get_feature_names_out(['mount_id']),
)

# Final design matrix: one-hot mountain columns followed by the weather columns.
X_numerical = X[weather_cols].reset_index(drop=True)
X_processed = pd.concat([mount_encoded_df, X_numerical], axis=1)

# Random 80/20 split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

# Fit the forest (fit() returns the estimator itself).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
nsme = np.sqrt(mse)  # root mean squared error, despite the variable name
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"Root Mean Squared Error: {nsme:.2f}")
# RMSE divided by the mean of the full target column (train and test rows).
print(f"Normal root Mean Squared Error: {nsme/y.mean():.2f}")
print(f"R² Score: {r2:.2f}")


Missing values in merged data: 0
Model Performance:
Root Mean Squared Error: 14199.92
Normal root Mean Squared Error: 0.64
R² Score: 0.44


In [20]:

# Function to predict visitors for new weather conditions
def predict_visitors(mount_id, max_temp, min_temp, rain):
    """Predict the visitor count for one mountain under the given weather.

    Builds a single-row feature frame with the same columns, in the same
    order, as the training matrix ``X_processed`` and passes it to the
    fitted ``model``. Relies on the module-level ``encoder``, ``model`` and
    ``X_processed`` objects.

    Parameters
    ----------
    mount_id
        Mountain identifier, handed to ``encoder.transform`` in a column
        named ``mount_id``.
    max_temp, min_temp, rain
        Weather values placed in the columns of the same names.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the input row.
    """
    weather_cols = ['max_temp', 'min_temp', 'rain']

    # Single-row frame holding the raw inputs.
    row = pd.DataFrame({
        'mount_id': [mount_id],
        'max_temp': [max_temp],
        'min_temp': [min_temp],
        'rain': [rain],
    })

    # Encoded representation of the mountain id, labelled with the encoder's
    # own output column names.
    onehot = pd.DataFrame(
        encoder.transform(row[['mount_id']]),
        columns=encoder.get_feature_names_out(['mount_id']),
    )

    # Encoded columns next to the weather columns, then aligned to the training
    # layout: any training column absent here is filled with 0, and the result
    # follows the column order of X_processed.
    features = pd.concat([onehot, row[weather_cols].reset_index(drop=True)], axis=1)
    features = features.reindex(columns=X_processed.columns, fill_value=0)

    return model.predict(features)[0]

# Demo: one prediction for mountain 71075 with hand-picked weather values.
print("\nExample Prediction:")
predicted_visitors = predict_visitors(
    mount_id=71075,
    max_temp=15.0,
    min_temp=5.0,
    rain=0.5,
)
print(f"Predicted visitors for mountain 71075 with max_temp=15, min_temp=5, rain=0.5: {predicted_visitors:.0f}")


Example Prediction:
Predicted visitors for mountain 71075 with max_temp=15, min_temp=5, rain=0.5: 41458


## Predict the number of visitors for each mount ID per week

In [21]:
# Where the per-mountain, per-week visitor forecast will be written.
RESULT_CSV = "visitor_forecast_2026.csv"

# Forecast weather rows that will be fed to predict_visitors.
weather_forecast = pd.read_csv("weather_forecast_2026.csv")

# Column-wise accumulator for the output table: one empty list per column.
dataframe = {column: [] for column in ("Mount", "Week", "Visitors")}

In [22]:
# Display name for each mount_id that may appear in the forecast file.
stations = {
    71032: 'Thredbo',
    71075: 'Perisher',
    72161: 'Selwyn',
    83024: 'Mt. Buller',
    83084: 'Falls Creek',
    83085: 'Mt. Hotham',
    85291: 'Mt. Baw Baw'
}

# Start from empty columns every time this cell runs. Without this reset,
# re-running the cell would append a second copy of every forecast row to the
# lists created in an earlier cell.
dataframe = {"Mount": [], "Week": [], "Visitors": []}

# Iterate over rows as plain tuples rather than looking up
# weather_forecast[col][i] by index label: this avoids five label lookups per
# row and still works when the index contains duplicate labels.
forecast_inputs = weather_forecast[["mount_id", "week", "max_temp", "min_temp", "rain"]]

for mount_id, week, max_temp, min_temp, rain in forecast_inputs.itertuples(index=False, name=None):
    visitors = predict_visitors(mount_id, max_temp, min_temp, rain)

    # A mount_id missing from `stations` raises KeyError here.
    dataframe["Mount"].append(stations[mount_id])
    dataframe["Week"].append(week)
    # Visitor counts are stored as whole numbers.
    dataframe["Visitors"].append(round(visitors))

In [23]:
# Turn the accumulated columns into a table and write it to RESULT_CSV,
# leaving out the row index.
forecast_table = pd.DataFrame(dataframe)
forecast_table.to_csv(RESULT_CSV, index=False)