In [1]:
# Two stations using correlation,compeleteness and distance
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import sklearn
df = pd.read_csv('STATIONS.csv')

# Convert 'Date' to datetime and set as index
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df.set_index('Date', inplace=True)

# Define function for weighted mean imputation
def weighted_mean_imputation(df, target, neighbors, correlations, distances, completeness):
    weights = [correlation / distance * comp for correlation, distance, comp in zip(correlations, distances, completeness)]
    total_weight = sum(weights)
    normalized_weights = [weight / total_weight for weight in weights]

    for index, row in df.iterrows():
        if np.isnan(row[target]):
            weighted_values = [row[neighbor] * weight for neighbor, weight in zip(neighbors, normalized_weights) if not np.isnan(row[neighbor])]
            if weighted_values:
                df.at[index, target] = sum(weighted_values)

# Impute missing values in 'Thorak Cemetery'
weighted_mean_imputation(df, 'Thorak Cemetery', ['CSIRO', 'Darwin Airport'], [0.85, 0.79], [5, 9], [0.9, 1.0])

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values
weighted_mean_imputation(df, 'Thorak Cemetery', ['CSIRO', 'Darwin Airport'], [0.85, 0.79], [5, 9], [0.9, 1.0])

# Calculate MAE and RMSE for the imputed values
mae = mean_absolute_error(known_values[mask], df['Thorak Cemetery'][mask])
rmse = np.sqrt(mean_squared_error(known_values[mask], df['Thorak Cemetery'][mask]))

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Mean Absolute Error (MAE): 2.094594967128594
Root Mean Squared Error (RMSE): 6.76692473797948


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS_DATA.csv', parse_dates=['Date'])
# df = df.dropna(subset=['Thorak Cemetery'])
df.set_index('Date', inplace=True)

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
# known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
# mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
# df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Define distances to stations in order from Thorak Cemetery: 1km, 2km, 3km, 4km, 5km
# distances = [4.8, 8, 5.5, 5.5, 5]
# stations = [col for col in df.columns if col != 'Thorak Cemetery']  # Exclude Thorak Cemetery from the stations list

# Calculate Spearman correlation between Thorak Cemetery and other stations
# correlations = df.corr(method='spearman')['Thorak Cemetery'].drop('Thorak Cemetery')

# Calculate completeness of data for each station
completeness = df.notnull().mean() * 100  # Percentage of non-missing values for each column

# # Calculate weights based on correlation, distance, and data completeness
# weights = {station: (correlations[station] / distances[idx]) * (completeness[station] / 100) for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
# total_weight = sum(weights.values())
# normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# # Function to impute missing values with weighted average
# def weighted_impute(row):
#     if pd.isna(row['Thorak Cemetery']):
#         available_data = {station: row[station] for station in stations if pd.notna(row[station])}
#         if not available_data:
#             return np.nan  # No data available at all for imputation
#         elif len(available_data) == 1:
#             # Only one station has data, use it as the imputed value
#             return list(available_data.values())[0]
#         else:
#             # Calculate weighted sum using available data and normalized weights
#             weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
#             return weighted_sum if weighted_sum != 0 else np.nan
#     else:
#         return row['Thorak Cemetery']

# # Apply the imputation
# df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# # Calculate MAE and RMSE for the imputed values, excluding NaNs
# imputed_values = df['Thorak Cemetery'][mask]
# known_values = known_values[mask]

# # Filter out NaN values from known_values and imputed_values
# valid_mask = known_values.notna() & imputed_values.notna()
# mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
# rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# # Print normalized weights for review
# print("Normalized Weights based on Spearman correlation, distances, and completeness:", normalized_weights)
# print(f"Mean Absolute Error (MAE): {mae}")
# print(f"Root Mean Squared Error (RMSE): {rmse}")
print(completeness)

Thorak Cemetery      88.052345
CSIRO                82.741332
Northlake            55.965330
Karama NT            24.456152
Berrimah Research    58.268185
Darwin Airport       99.991502
dtype: float64


In [1]:
# consider all stations with correlations with no completeness
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS_DATA.csv', parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Define distances to stations in order from Thorak Cemetery: 1km, 2km, 3km, 4km, 5km
distances = [4.8, 8, 5.5, 5.5, 5]
stations = [col for col in df.columns if col != 'Thorak Cemetery']  # Exclude Thorak Cemetery from the stations list

# Calculate Spearman correlation between Thorak Cemetery and other stations
correlations = df.corr(method='spearman')['Thorak Cemetery'].drop('Thorak Cemetery')

# Calculate weights based on correlation and distance
weights = {station: (correlations[station] / distances[idx]) for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row):
    if pd.isna(row['Thorak Cemetery']):
        available_data = {station: row[station] for station in stations if pd.notna(row[station])}
        if not available_data:
            return np.nan  # No data available at all for imputation
        elif len(available_data) == 1:
            # Only one station has data, use it as the imputed value
            return list(available_data.values())[0]
        else:
            # Calculate weighted sum using available data and normalized weights
            weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
            return weighted_sum if weighted_sum != 0 else np.nan
    else:
        return row['Thorak Cemetery']

# Apply the imputation
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Calculate MAE and RMSE for the imputed values, excluding NaNs
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Filter out NaN values from known_values and imputed_values
valid_mask = known_values.notna() & imputed_values.notna()
mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation and distances:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Normalized Weights based on Spearman correlation and distances: {'CSIRO': 0.24968832488343082, 'Northlake': 0.14135490114681093, 'Karama NT': 0.18392302599072977, 'Berrimah Research': 0.20113841758797524, 'Darwin Airport': 0.22389533039105328}
Mean Absolute Error (MAE): 6.684115017119337
Root Mean Squared Error (RMSE): 11.646617079351191


In [1]:
# consider stations with correlations with no completeness
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS_DATA.csv', parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Define the distances for Darwin Airport, CSIRO, and Berrimah
stations = ['Darwin Airport', 'CSIRO', 'Berrimah Research']  # List the specific stations to consider
distances = [8, 4.8, 5.5]  # Corresponding distances to Thorak Cemetery

# Calculate Spearman correlation between Thorak Cemetery and the specified stations
correlations = df[stations].corrwith(df['Thorak Cemetery'], method='spearman')

# Calculate weights based on correlation and distance
weights = {station: (correlations[station] / distances[idx]) for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row):
    if pd.isna(row['Thorak Cemetery']):
        available_data = {station: row[station] for station in stations if pd.notna(row[station])}
        if not available_data:
            return np.nan  # No data available at all for imputation
        elif len(available_data) == 1:
            # Only one station has data, use it as the imputed value
            return list(available_data.values())[0]
        else:
            # Calculate weighted sum using available data and normalized weights
            weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
            return weighted_sum if weighted_sum != 0 else np.nan
    else:
        return row['Thorak Cemetery']

# Apply the imputation
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Calculate MAE and RMSE for the imputed values, excluding NaNs
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Filter out NaN values from known_values and imputed_values
valid_mask = known_values.notna() & imputed_values.notna()
mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation and distances:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Normalized Weights based on Spearman correlation and distances: {'Darwin Airport': 0.23716672357371507, 'CSIRO': 0.4213235999011493, 'Berrimah Research': 0.34150967652513564}
Mean Absolute Error (MAE): 6.54380286177368
Root Mean Squared Error (RMSE): 11.902872587504927


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS_DATA.csv', parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Define the distances for Darwin Airport, CSIRO, and Berrimah
stations = ['Darwin Airport', 'CSIRO', 'Berrimah Research']  # List the specific stations to consider
distances = [8, 4.8, 5.5]  # Corresponding distances to Thorak Cemetery

# Calculate Spearman correlation between Thorak Cemetery and the specified stations
correlations = df[stations].corrwith(df['Thorak Cemetery'], method='spearman')

# Calculate completeness of data for each station
completeness = df[stations].notnull().mean() * 100  # Percentage of non-missing values for each specified station

# Calculate weights based on correlation, distance, and data completeness
weights = {station: (correlations[station] / distances[idx]) * (completeness[station] / 100) for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row):
    if pd.isna(row['Thorak Cemetery']):
        available_data = {station: row[station] for station in stations if pd.notna(row[station])}
        if not available_data:
            return np.nan  # No data available at all for imputation
        elif len(available_data) == 1:
            # Only one station has data, use it as the imputed value
            return list(available_data.values())[0]
        else:
            # Calculate weighted sum using available data and normalized weights
            weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
            return weighted_sum if weighted_sum != 0 else np.nan
    else:
        return row['Thorak Cemetery']

# Apply the imputation
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Calculate MAE and RMSE for the imputed values, excluding NaNs
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Filter out NaN values from known_values and imputed_values
valid_mask = known_values.notna() & imputed_values.notna()
mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation, distances, and completeness:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Normalized Weights based on Spearman correlation, distances, and completeness: {'Darwin Airport': 0.3019199793497001, 'CSIRO': 0.4448852861437701, 'Berrimah Research': 0.25319473450652974}
Mean Absolute Error (MAE): 7.507824975039637
Root Mean Squared Error (RMSE): 14.556227127040454


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS_DATA.csv', parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Define the distances for Darwin Airport and CSIRO
stations = ['Darwin Airport', 'CSIRO']  # List the specific stations to consider
distances = [8, 4.8]  # Corresponding distances to Thorak Cemetery

# Calculate Spearman correlation between Thorak Cemetery and the specified stations
correlations = df[stations].corrwith(df['Thorak Cemetery'], method='spearman')

# Calculate completeness of data for each station
completeness = df[stations].notnull().mean() * 100  # Percentage of non-missing values for each specified station

# Calculate weights based on correlation, distance, and data completeness
weights = {station: (correlations[station] / distances[idx]) * (completeness[station] / 100) for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row):
    if pd.isna(row['Thorak Cemetery']):
        available_data = {station: row[station] for station in stations if pd.notna(row[station])}
        if not available_data:
            return np.nan  # No data available at all for imputation
        elif len(available_data) == 1:
            # Only one station has data, use it as the imputed value
            return list(available_data.values())[0]
        else:
            # Calculate weighted sum using available data and normalized weights
            weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
            return weighted_sum if weighted_sum != 0 else np.nan
    else:
        return row['Thorak Cemetery']

# Apply the imputation
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Calculate MAE and RMSE for the imputed values, excluding NaNs
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Filter out NaN values from known_values and imputed_values
valid_mask = known_values.notna() & imputed_values.notna()
mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation, distances, and completeness:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Normalized Weights based on Spearman correlation, distances, and completeness: {'Darwin Airport': 0.4046753929882326, 'CSIRO': 0.5953246070117674}
Mean Absolute Error (MAE): 6.4001701481632844
Root Mean Squared Error (RMSE): 13.355028299555359


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS.csv', parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Define the distances for Darwin Airport and CSIRO
stations = ['Darwin Airport', 'CSIRO']  # List the specific stations to consider
distances = [8, 4.8]  # Corresponding distances to Thorak Cemetery

# Calculate Spearman correlation between Thorak Cemetery and the specified stations
correlations = df[stations].corrwith(df['Thorak Cemetery'], method='spearman')

# Calculate completeness of data for each station
completeness = df[stations].notnull().mean() * 100  # Percentage of non-missing values for each specified station

# Calculate weights based on correlation, distance, and data completeness
weights = {station: (correlations[station] / distances[idx]) * (completeness[station] / 100) for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row):
    if pd.isna(row['Thorak Cemetery']):
        available_data = {station: row[station] for station in stations if pd.notna(row[station])}
        if not available_data:
            return np.nan  # No data available at all for imputation
        elif len(available_data) == 1:
            # Only one station has data, use it as the imputed value
            return list(available_data.values())[0]
        else:
            # Calculate weighted sum using available data and normalized weights
            weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
            return weighted_sum if weighted_sum != 0 else np.nan
    else:
        return row['Thorak Cemetery']

# Apply the imputation
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Calculate MAE and RMSE for the imputed values, excluding NaNs
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Filter out NaN values from known_values and imputed_values
valid_mask = known_values.notna() & imputed_values.notna()
mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation, distances, and completeness:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Normalized Weights based on Spearman correlation, distances, and completeness: {'Darwin Airport': 0.404122481011266, 'CSIRO': 0.595877518988734}
Mean Absolute Error (MAE): 5.709640999887143
Root Mean Squared Error (RMSE): 11.887565195813679


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load data
df = pd.read_csv('STATIONS.csv', parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Define the distances for Darwin Airport and CSIRO
stations = ['Darwin Airport', 'CSIRO']  # List the specific stations to consider
distances = [8, 4.8]  # Corresponding distances to Thorak Cemetery

# Calculate Spearman correlation between Thorak Cemetery and the specified stations
correlations = df[stations].corrwith(df['Thorak Cemetery'], method='spearman')

# Calculate completeness of data for each station
completeness = df[stations].notnull().mean()  # Fraction of non-missing values for each specified station

# Calculate weights based on correlation, distance, and data completeness
weights = {station: (correlations[station] / distances[idx]) * completeness[station] for idx, station in enumerate(stations)}

# Normalize weights so they sum to 1
total_weight = sum(weights.values())
normalized_weights = {station: weight / total_weight for station, weight in weights.items()}

# Function to impute missing values with weighted average
def weighted_impute(row):
    if pd.isna(row['Thorak Cemetery']):
        available_data = {station: row[station] for station in stations if pd.notna(row[station])}
        if not available_data:
            return np.nan  # No data available at all for imputation
        elif len(available_data) == 1:
            # Only one station has data, use it as the imputed value
            return list(available_data.values())[0]
        else:
            # Calculate weighted sum using available data and normalized weights
            weighted_sum = sum(row[station] * normalized_weights[station] for station in available_data if station in normalized_weights)
            return weighted_sum if weighted_sum != 0 else np.nan
    else:
        return row['Thorak Cemetery']

# Evaluate the imputation (need actual known values, simulating missing data here as an example)
known_values = df['Thorak Cemetery'].copy()  # Assume we know all the original values
mask = np.random.rand(len(df)) < 0.1  # Randomly select 10% of the data
df.loc[mask, 'Thorak Cemetery'] = np.nan  # Introduce missing values

# Apply the imputation
df['Thorak Cemetery'] = df.apply(weighted_impute, axis=1)

# Calculate MAE and RMSE for the imputed values, excluding NaNs
imputed_values = df['Thorak Cemetery'][mask]
known_values = known_values[mask]

# Filter out NaN values from known_values and imputed_values
valid_mask = known_values.notna() & imputed_values.notna()
mae = mean_absolute_error(known_values[valid_mask], imputed_values[valid_mask])
rmse = np.sqrt(mean_squared_error(known_values[valid_mask], imputed_values[valid_mask]))

# Print normalized weights for review
print("Normalized Weights based on Spearman correlation, distances, and completeness:", normalized_weights)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Given example values
print("Example MAE values: 2 and 5")
print("Example RMSE values: 6 and 11")


Normalized Weights based on Spearman correlation, distances, and completeness: {'Darwin Airport': 0.40438082216718024, 'CSIRO': 0.5956191778328198}
Mean Absolute Error (MAE): 5.603738849121125
Root Mean Squared Error (RMSE): 11.265911078752266
Example MAE values: 2 and 5
Example RMSE values: 6 and 11
