# README
- This notebook checks, for each year, how many stations appear in both the previous and the following year's data but are missing from that year's data.

In [1]:
import pandas as pd

In [2]:
# Location of the yearly cleaned GS parquet files and the span of years to load.
# Kept together here so the machine-specific path and the year range can be
# changed in one place instead of inside the loop body.
# NOTE(review): "intermidiate" is reproduced as-is; presumably it matches the
# folder name on disk, so do not "fix" the spelling without renaming the folder.
GS_CLEANED_DIR = "C:/Users/31155/Dropbox/EV-GasDualNetwork/Data/intermediate/yiwei/intermidiate/GS_cleaned"
GS_FIRST_YEAR = 2013
GS_LAST_YEAR = 2025

# {year: DataFrame} with one cleaned GS table per year.
GS_data_set = {}

for year in range(GS_FIRST_YEAR, GS_LAST_YEAR + 1):
    GS_data_set[year] = pd.read_parquet(f"{GS_CLEANED_DIR}/GS_data_cleaned_{year}.parquet")
    # Print the row count per year as a quick sanity check on the load.
    print(year," GS ",len(GS_data_set[year]))

2013  GS  98458
2014  GS  102642
2015  GS  117307
2016  GS  118338
2017  GS  118193
2018  GS  104177
2019  GS  109464
2020  GS  113399
2021  GS  116279
2022  GS  105157
2023  GS  112000
2024  GS  113219
2025  GS  114421


In [None]:
def _station_keys(frame, columns):
    """Return the set of distinct station keys in ``frame``, one tuple per station."""
    unique_rows = frame[columns].drop_duplicates()
    # copy=True: cells are overwritten below, so work on our own array.
    key_values = unique_rows.to_numpy(dtype=object, copy=True)
    # NaN does not compare equal to NaN, so a station with a blank field would
    # never match itself across years. Map every missing cell to None, which
    # hashes and compares consistently.
    key_values[unique_rows.isna().to_numpy()] = None
    return set(map(tuple, key_values))


def check_missing_stations(data_set, matching_columns, start_year=2013, end_year=2025):
    """
    Count stations that exist in year-1 and year+1 but are missing in the year itself.

    A station is identified by the combination of values in ``matching_columns``;
    missing values (NaN/None) in those columns are treated as equal to each other.

    Parameters:
    - data_set: dict, {year: DataFrame}; must contain every year from
      start_year to end_year whenever at least one year is checked
    - matching_columns: list or str, column(s) that identify a unique station
    - start_year: int, first year of data (only used as a "previous" year)
    - end_year: int, last year of data (only used as a "next" year)

    The years actually checked are start_year + 1 .. end_year - 1, because each
    checked year needs both a neighbour before and a neighbour after it.

    Returns:
    - results_df: DataFrame with one row per checked year and the columns
      year, missing_count, current_count (distinct stations in that year) and
      ratio (missing_count / current_count as a string with 4 decimals;
      "0.0000" when current_count is 0)

    Raises:
    - KeyError: if a required year is absent from data_set or a matching
      column is absent from one of the frames
    """
    # A bare column name must become a one-element list; otherwise the column
    # selection yields a Series and the keys are no longer row tuples.
    if isinstance(matching_columns, str):
        columns = [matching_columns]
    else:
        columns = list(matching_columns)

    # Each year serves as "previous", "current" and "next" in turn, so build
    # its key set once (lazily) instead of three times.
    keys_by_year = {}

    def keys_for(year):
        if year not in keys_by_year:
            keys_by_year[year] = _station_keys(data_set[year], columns)
        return keys_by_year[year]

    results = []

    for year in range(start_year + 1, end_year):
        prev_keys = keys_for(year - 1)
        curr_keys = keys_for(year)
        next_keys = keys_for(year + 1)

        # Stations in both neighbouring years but not in the current one
        missing_stations = (prev_keys & next_keys) - curr_keys

        missing_count = len(missing_stations)
        current_count = len(curr_keys)
        ratio = missing_count / current_count if current_count > 0 else 0

        results.append({
            'year': year,
            'missing_count': missing_count,
            'current_count': current_count,
            'ratio': f"{ratio:.4f}"
        })

        # year - 1 is never looked at again; release it to bound memory use.
        keys_by_year.pop(year - 1, None)

    # Explicit columns keep the schema stable even when no year was checked.
    return pd.DataFrame(results, columns=['year', 'missing_count', 'current_count', 'ratio'])

# Run the check on the GS data. A station is identified by the combination of
# all eight columns listed here.
matching_columns = [
    'pname', 'cityname', 'adname', 'address', 'name',
    '大类', '中类', '小类',  # literally: major / medium / minor category
]
results_df = check_missing_stations(GS_data_set, matching_columns)
print(results_df)

    year  missing_count  current_count   ratio
0   2014             13          98283  0.0001
1   2015              0         116561  0.0000
2   2016              0         117586  0.0000
3   2017           1602         117814  0.0136
4   2018           2478         104016  0.0238
5   2019           2820         109259  0.0258
6   2020           1333         113216  0.0118
7   2021           6280         114813  0.0547
8   2022           3496         105135  0.0333
9   2023           1697         111973  0.0152
10  2024           1343         113190  0.0119


: 