In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw input files, resolved relative to the notebook's working directory.
visitation_data, climate_data = "visitation_d.csv", "climate_d.csv"

# Load each CSV into its own DataFrame using pandas' default parsing options.
visit_df, clim_df = (pd.read_csv(csv_path) for csv_path in (visitation_data, climate_data))

## Cleaning visitation data

In [3]:
# Preview the first five rows of the visitation data as loaded from the CSV.
visit_df.head(5)

In [4]:
# Show the dtype pandas inferred for each visitation column.
visit_df.dtypes

In [5]:
# Print the per-column non-null counts, dtypes and memory usage of visit_df.
visit_df.info()

In [6]:
visit_df.count() # non-null count per column; a value below len(visit_df) means that column contains nulls

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric visitation columns.
visit_df.describe()

## Cleaning climate data

In [8]:
# Preview the first five rows of the climate data as loaded from the CSV.
clim_df.head(5)

In [9]:
# List the climate column names exactly as they appear in the CSV header.
clim_df.columns

In [10]:
# Short, code-friendly aliases for the verbose column headers of the climate CSV.
climate_column_aliases = {
    'Bureau of Meteorology station number': 'mount_id',
    "Maximum temperature (Degree C)": "max_temp",
    "Minimum temperature (Degree C)": "min_temp",
    "Rainfall amount (millimetres)": "rain",
}
# Rename in place; headers that are not present in clim_df are silently ignored by rename().
clim_df.rename(columns=climate_column_aliases, inplace=True)

In [11]:
# Confirm that the rename produced the short column names.
clim_df.columns

In [12]:
# Drop every row that has a missing value in at least one column, modifying clim_df in place.
clim_df.dropna(how='any', inplace=True)

In [13]:
# Show the dtype of each climate column after the rename and null removal.
clim_df.dtypes

In [14]:
# Print the per-column non-null counts, dtypes and memory usage of clim_df after dropna.
clim_df.info()

In [15]:
# Summary statistics for the numeric climate columns after null removal.
clim_df.describe()

In [16]:
# Resort name -> station id. The ids replace the resort-name column labels of the
# visitation data further down (presumably they are the Bureau of Meteorology
# station numbers stored in clim_df['mount_id'] -- confirm against the climate data).
stations = dict([
    ('Thredbo', 71032),
    ('Perisher', 71075),
    ('Selwyn', 72161),
    ('Mt. Buller', 83024),
    ('Falls Creek', 83084),
    ('Mt. Hotham', 83085),
    ('Mt. Baw Baw', 85291),
])

In [17]:
# Swap each resort-name column label (third column onwards) for its station id
# from `stations`; labels without an entry in the dictionary are left untouched.
resort_to_id = {
    label: stations[label]
    for label in visit_df.columns[2:]
    if label in stations
}
visit_df.rename(columns=resort_to_id, inplace=True)
visit_df.columns

In [18]:
# Distinct station ids present in the climate data (presumably to compare with the ids in `stations`).
clim_df["mount_id"].unique()

## Testing things out: aligning the visitation and climate data (stations, years, weeks)

In [19]:
# Keep the two leading columns plus only those later columns whose label is an
# integer, i.e. the ones that were relabelled with a station id in the rename
# step; any later column that still carries a string label is left out.
#
# The result is built as an independent copy. Aliasing visit_df and dropping
# columns in place would silently modify visit_df as well (and every later
# in-place edit of new_visit_df would leak back into it).
leading_cols = list(visit_df.columns[:2])
station_cols = [
    col for col in visit_df.columns[2:]
    # Accept numpy integer labels too; bool is excluded because it subclasses int.
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool)
]
new_visit_df = visit_df[leading_cols + station_cols].copy()
new_visit_df.columns

In [20]:
# Keep only the climate rows whose Year lies inside the 2014-2024 window (both ends included).
first_year, last_year = 2014, 2024
outside_window = (clim_df["Year"] < first_year) | (clim_df["Year"] > last_year)
# .copy() gives an independent frame, so columns can be added to it later without warnings.
new_clim_df = clim_df.loc[~outside_window].copy()

In [21]:
new_clim_df.describe() # sanity check: the Year min/max should now lie within 2014-2024

In [22]:
# Convert Month and Day to a datetime, then get week number
new_clim_df['date'] = pd.to_datetime(dict(year=new_clim_df['Year'], month=new_clim_df['Month'], day=new_clim_df['Day']))
new_clim_df['week'] = new_clim_df['date'].dt.isocalendar().week

# Filter for weeks 1 to 15
filtered_clim_df = new_clim_df[(new_clim_df['week'] >= 1) & (new_clim_df['week'] <= 15)]

# Group by Year and week, take average of other columns
weekly_avg_clim_df = filtered_clim_df.groupby(['Year', 'week'], as_index=False).mean(numeric_only=True)

In [23]:
# Remove the helper date columns and keep only rows up to week 15.
# Written without inplace=True and with errors='ignore' so the cell can be re-run:
# a second run no longer fails with KeyError because the columns are already gone.
new_clim_df = new_clim_df.drop(columns=['Month', 'Day', 'date'], errors='ignore')
new_clim_df = new_clim_df.loc[new_clim_df['week'] <= 15].copy()

In [24]:
# List the columns that remain in the filtered climate frame.
new_clim_df.columns

In [25]:
# Fix the column order of the climate frame (station id, time keys, measurements);
# any column not listed here is dropped by the selection.
climate_column_order = ['mount_id', 'Year', 'week', 'max_temp', 'min_temp', 'rain']
new_clim_df = new_clim_df.loc[:, climate_column_order]

In [None]:
# Write the filtered climate rows to disk without the DataFrame index.
# NOTE(review): this exports new_clim_df, not the weekly_avg_clim_df computed earlier -- confirm that is intended.
new_clim_df.to_csv("cleaned_climate_data.csv", index=False)

## Cleaning the visitation data again (outlier removal)

In [27]:
# Summary statistics of the visitation frame before outlier removal.
new_visit_df.describe()

In [28]:
from sklearn.ensemble import IsolationForest

# Feature columns: everything except the time keys and the flag column this cell
# itself writes. Excluding 'iso_outlier' keeps the cell re-runnable -- otherwise a
# second run would feed the previous run's flag into the model as a feature.
non_feature_cols = ['Year', 'Week', 'iso_outlier']
mountain_cols = [c for c in new_visit_df.columns if c not in non_feature_cols]
outlier_indices = set()  # not referenced in the visible cells; candidate for removal
mult = 1.5  # not referenced in the visible cells; candidate for removal

# Force every feature column to numeric; values that cannot be parsed become NaN.
X = new_visit_df[mountain_cols].apply(pd.to_numeric, errors='coerce')

# simple imputation for missing values — median is robust
X_imp = X.fillna(X.median())

# contamination is the proportion of rows the model should treat as outliers (1% here);
# random_state pins the forest so repeated runs flag the same rows.
clf = IsolationForest(contamination=0.01, random_state=0)   # adjust contamination
clf.fit(X_imp)
# predict() labels outliers as -1; store that as a boolean flag per row.
new_visit_df['iso_outlier'] = clf.predict(X_imp) == -1

# Outlier-free copy with a fresh 0..n-1 index; it still carries the (all False) flag column.
clean_df = new_visit_df.loc[~new_visit_df['iso_outlier']].reset_index(drop=True)

In [29]:
# Inspect the visitation frame now that the iso_outlier flag column has been added.
new_visit_df.info()

In [30]:
# Count how many rows were flagged as outliers (True) versus kept (False).
new_visit_df["iso_outlier"].value_counts()

In [31]:
# Remove the rows flagged as outliers, then the flag column itself; both steps
# modify new_visit_df in place and the surviving rows keep their original index labels.
flagged_rows = new_visit_df.index[new_visit_df['iso_outlier'] == True]
new_visit_df.drop(index=flagged_rows, inplace=True)
new_visit_df.drop(columns=['iso_outlier'], inplace=True)

In [32]:
# Re-inspect the frame after outlier removal: the row count should have dropped and iso_outlier should be gone.
new_visit_df.info()

In [33]:
# Summary statistics after outlier removal, for comparison with the earlier describe() output.
new_visit_df.describe()

In [34]:
# Write the outlier-free visitation data to disk without the DataFrame index.
new_visit_df.to_csv("cleaned_visit_data.csv", index=False)

In [None]:
# Preview the first five rows of the cleaned visitation data.
new_visit_df.head(5)

In [36]:
# Preview the first five rows of the cleaned climate data.
new_clim_df.head(5)

Unnamed: 0,mount_id,Year,week,max_temp,min_temp,rain
1461,71075,2014,1,14.3,2.5,0.0
1462,71075,2014,1,18.1,8.8,0.8
1463,71075,2014,1,14.8,9.5,0.0
1464,71075,2014,1,11.6,4.2,0.2
1465,71075,2014,1,14.5,5.0,0.0
