# In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the station records and index the frame by observation date
df = pd.read_csv('STATIONS.csv', parse_dates=['Date'])
df = df.set_index('Date')

# Reference stations used to fill gaps at Thorak Cemetery, paired
# position-by-position with their distance to Thorak Cemetery
# (NOTE(review): distance units are not stated here -- presumably km, confirm)
stations = ['Darwin Airport', 'CSIRO']
distances = [8, 4.8]

# Spearman rank correlation of every reference station with Thorak Cemetery
correlations = df[stations].corrwith(df['Thorak Cemetery'], method='spearman')

# Fraction of rows in which each reference station has a value
completeness = df[stations].notna().mean()

# Raw weight per station: higher correlation and completeness raise it,
# greater distance lowers it
weights = {
    name: (correlations[name] / dist) * completeness[name]
    for name, dist in zip(stations, distances)
}

# Rescale the raw weights so that together they add up to 1
total_weight = sum(weights.values())
normalized_weights = {name: raw / total_weight for name, raw in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row, station_names=None, station_weights=None):
    """Return the 'Thorak Cemetery' value for one row, imputing it if missing.

    An observed (non-missing) value is returned unchanged. A missing value is
    estimated from the reference stations that have data in the same row:

    * no reference station has data -> NaN;
    * exactly one has data          -> that station's value is used as is;
    * several have data             -> weighted average of their values,
      with the weights renormalised over the stations actually used.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding
            'Thorak Cemetery' and one entry per reference station.
        station_names: Reference station columns to draw from. Defaults to
            the module-level ``stations`` list.
        station_weights: Mapping of station name to weight. Defaults to the
            module-level ``normalized_weights`` dict.

    Returns:
        The observed or imputed value, or NaN when nothing usable exists.
    """
    if station_names is None:
        station_names = stations
    if station_weights is None:
        station_weights = normalized_weights

    observed = row['Thorak Cemetery']
    if pd.notna(observed):
        return observed

    available = {
        name: row[name] for name in station_names if pd.notna(row[name])
    }
    if not available:
        return np.nan  # No data available at all for imputation
    if len(available) == 1:
        # Only one station has data, use it as the imputed value
        return next(iter(available.values()))

    # Only stations that have a weight can contribute to the average.
    usable = {
        name: value
        for name, value in available.items()
        if name in station_weights
    }
    if not usable:
        return np.nan

    # Divide by the weight actually in play so the result stays a true
    # average even when only a subset of the weighted stations has data.
    weight_total = sum(station_weights[name] for name in usable)
    if weight_total == 0:
        return np.nan  # weights cancel out; no meaningful average exists

    weighted_sum = sum(
        value * station_weights[name] for name, value in usable.items()
    )
    # A result of exactly 0 is a legitimate estimate and is returned as such.
    return weighted_sum / weight_total

# Evaluate the imputation by hiding a random sample of observed values and
# comparing what the imputation returns for them against the hidden truth.
known_values = df['Thorak Cemetery'].copy()  # ground truth, copied before anything is hidden

# A seeded generator keeps the hold-out sample, and therefore the reported
# errors, identical from run to run.
rng = np.random.default_rng(42)

# Hold out roughly 10% of the rows, restricted to rows that actually carry an
# observation: hiding an already-missing value leaves nothing to score.
mask = (rng.random(len(df)) < 0.1) & known_values.notna().to_numpy()
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Apply the imputation row by row.
# NOTE: the held-out observations stay replaced by their imputed values in df.
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Pair the imputed values with the truth for the held-out rows only
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Score only the rows where the imputation actually produced a value
valid_mask = known_values.notna() & imputed_values.notna()
if valid_mask.any():
    mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
    rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))
else:
    # The metric functions reject empty input; report NaN instead of crashing.
    mae = rmse = np.nan

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation, distances, and completeness:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


# Cell output: FileNotFoundError: [Errno 2] No such file or directory: 'STATIONS.csv'