# README
- This notebook checks all province and city names in the dataset (some errors require manual checking) and then translates them into English.

In [1]:
import pandas as pd

In [2]:
# Root folder of the intermediate data; each network's yearly parquet files sit in
# its own sub-folder (GS holds gas-station rows; CS is presumably charging stations).
base_dir = "C:/Users/31155/Dropbox/EV-GasDualNetwork/Data/intermediate/yiwei/intermidiate"
CS_data_dir, GS_data_dir = (f"{base_dir}/{network}/" for network in ("CS", "GS"))

In [3]:
# Load one parquet snapshot per year into a {year: DataFrame} dict for each network.
CS_data_set = {}
for year in range(2015, 2026):
    frame = pd.read_parquet(f"{CS_data_dir}/{year}.parquet")
    CS_data_set[year] = frame
    print(f"CS data {year} loaded. Length: {len(frame)}")

GS_data_set = {}
for year in range(2013, 2026):
    frame = pd.read_parquet(f"{GS_data_dir}/{year}.parquet")
    GS_data_set[year] = frame
    print(f"GS data {year} loaded. Length: {len(frame)}")

CS data 2015 loaded. Length: 2039
CS data 2016 loaded. Length: 2092
CS data 2017 loaded. Length: 4533
CS data 2018 loaded. Length: 33972
CS data 2019 loaded. Length: 57113
CS data 2020 loaded. Length: 73320
CS data 2021 loaded. Length: 94209
CS data 2022 loaded. Length: 98813
CS data 2023 loaded. Length: 120660
CS data 2024 loaded. Length: 185617
CS data 2025 loaded. Length: 225899
GS data 2013 loaded. Length: 101816
GS data 2014 loaded. Length: 104542
GS data 2015 loaded. Length: 118645
GS data 2016 loaded. Length: 120030
GS data 2017 loaded. Length: 120669
GS data 2018 loaded. Length: 107356
GS data 2019 loaded. Length: 113770
GS data 2020 loaded. Length: 120313
GS data 2021 loaded. Length: 122005
GS data 2022 loaded. Length: 111608
GS data 2023 loaded. Length: 119012
GS data 2024 loaded. Length: 119029
GS data 2025 loaded. Length: 107755


In [4]:
# Get all unique province names and city names from both datasets
def get_unique_names(data_dict):
    """Collect the distinct non-null province and city names across all frames.

    Args:
        data_dict: Dictionary of dataframes by year

    Returns:
        Tuple (province_names, city_names), each a sorted list. A frame that
        lacks the 'pname' or 'cityname' column simply contributes nothing
        for that column.
    """
    provinces, cities = set(), set()

    for frame in data_dict.values():
        for column, bucket in (('pname', provinces), ('cityname', cities)):
            if column in frame.columns:
                bucket.update(frame[column].dropna().unique())

    return sorted(provinces), sorted(cities)

# Distinct names seen in each network's yearly frames
cs_pnames, cs_citynames = get_unique_names(CS_data_set)
gs_pnames, gs_citynames = get_unique_names(GS_data_set)

# Union across both networks, sorted so the review files have a stable order
all_pnames = sorted(set(cs_pnames) | set(gs_pnames))
all_citynames = sorted(set(cs_citynames) | set(gs_citynames))

print(f"Total unique province names: {len(all_pnames)}")
print(f"Total unique city names: {len(all_citynames)}")

Total unique province names: 49
Total unique city names: 486


In [5]:
# Check missing years for each province name and city name
def check_missing_years(data_dict, names_list, name_column, available_years):
    """Report, for every name, the years in which it does and does not appear.

    Args:
        data_dict: Dictionary of dataframes by year
        names_list: Names to look up
        name_column: Column of each dataframe that holds the names
        available_years: Iterable of years to check; a year with no dataframe
            in data_dict, or whose dataframe lacks name_column, counts as
            missing for every name

    Returns:
        Dictionary mapping each name to a dict with keys 'present_years',
        'missing_years' (both in the order of available_years),
        'total_present' and 'total_missing'.
    """
    # Materialise the years so a one-shot iterator can be walked once per name.
    years = list(available_years)

    # Build the set of names present in each year once, instead of rescanning
    # the whole column for every (name, year) pair.
    names_by_year = {}
    for year in years:
        df = data_dict.get(year)
        if df is not None and name_column in df.columns:
            names_by_year[year] = set(df[name_column].dropna().unique())
        else:
            names_by_year[year] = set()

    missing_info = {}

    for name in names_list:
        present_years = [year for year in years if name in names_by_year[year]]
        missing_years = [year for year in years if name not in names_by_year[year]]
        missing_info[name] = {
            'present_years': present_years,
            'missing_years': missing_years,
            'total_present': len(present_years),
            'total_missing': len(missing_years)
        }

    return missing_info

# Years covered by each network's snapshots
cs_years = list(range(2015, 2026))  # 2015-2025
gs_years = list(range(2013, 2026))  # 2013-2025

# Presence/absence of every name in the CS snapshots
cs_pname_missing = check_missing_years(
    data_dict=CS_data_set, names_list=all_pnames,
    name_column='pname', available_years=cs_years)
cs_cityname_missing = check_missing_years(
    data_dict=CS_data_set, names_list=all_citynames,
    name_column='cityname', available_years=cs_years)

# Presence/absence of every name in the GS snapshots
gs_pname_missing = check_missing_years(
    data_dict=GS_data_set, names_list=all_pnames,
    name_column='pname', available_years=gs_years)
gs_cityname_missing = check_missing_years(
    data_dict=GS_data_set, names_list=all_citynames,
    name_column='cityname', available_years=gs_years)

print("Missing year analysis completed!")

Missing year analysis completed!


In [6]:
# One review record per province name: where it appears in each network, plus a
# 'correct' field pre-filled with the name itself, to be edited by hand.
pname_combined_info = {
    pname: {
        'cs_present_years': cs_pname_missing[pname]['present_years'],
        'cs_missing_years': cs_pname_missing[pname]['missing_years'],
        'gs_present_years': gs_pname_missing[pname]['present_years'],
        'gs_missing_years': gs_pname_missing[pname]['missing_years'],
        'correct': pname,
    }
    for pname in all_pnames
}

print(f"Combined province name information created for {len(pname_combined_info)} provinces")
print("Sample entry:", list(pname_combined_info.items())[0])

Combined province name information created for 49 provinces
Sample entry: (' ', {'cs_present_years': [2022], 'cs_missing_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2025], 'gs_present_years': [2022], 'gs_missing_years': [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2025], 'correct': ' '})


In [7]:
# One review record per city name: where it appears in each network, plus a
# 'correct' field pre-filled with the name itself, to be edited by hand.
cityname_combined_info = {
    cityname: {
        'cs_present_years': cs_cityname_missing[cityname]['present_years'],
        'cs_missing_years': cs_cityname_missing[cityname]['missing_years'],
        'gs_present_years': gs_cityname_missing[cityname]['present_years'],
        'gs_missing_years': gs_cityname_missing[cityname]['missing_years'],
        'correct': cityname,
    }
    for cityname in all_citynames
}

print(f"Combined city name information created for {len(cityname_combined_info)} cities")
print("Sample entry:", list(cityname_combined_info.items())[0])

Combined city name information created for 486 cities
Sample entry: ('七台河', {'cs_present_years': [], 'cs_missing_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'gs_present_years': [2013], 'gs_missing_years': [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'correct': '七台河'})


In [8]:
import json
# Custom JSON encoder for compact lists
def write_compact_json(data, filename):
    """Write data as UTF-8 JSON: dicts indented by 4 spaces, lists on one line.

    The layout is produced structurally rather than by regex clean-up of the
    serialised text, so whitespace, brackets and commas inside string values
    and keys are written exactly as given.

    Args:
        data: JSON-serialisable object (typically a dict of dicts)
        filename: Path of the file to create or overwrite

    Raises:
        TypeError: If a value or a dictionary key is not JSON-serialisable
    """
    indent_unit = ' ' * 4

    def _render_key(key):
        # Mirror json.dumps key coercion: str stays, int/float/bool/None become text.
        if isinstance(key, str):
            return json.dumps(key, ensure_ascii=False)
        if key is None or isinstance(key, (bool, int, float)):
            return json.dumps(json.dumps(key))
        raise TypeError(
            f"keys must be str, int, float, bool or None, not {type(key).__name__}"
        )

    def _render(value, depth):
        if isinstance(value, dict) and value:
            inner = indent_unit * (depth + 1)
            members = [
                f"{inner}{_render_key(key)}: {_render(item, depth + 1)}"
                for key, item in value.items()
            ]
            return "{\n" + ",\n".join(members) + "\n" + indent_unit * depth + "}"
        # Lists, scalars and empty dicts go through json.dumps without indent,
        # which keeps each list (including anything nested in it) on one line.
        return json.dumps(value, ensure_ascii=False)

    # Serialise before opening so a failure does not leave a truncated file.
    json_str = _render(data, 0)

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json_str)

import os

# Save files with compact formatting.
# The same files are corrected by hand and read back by the next cell, so an
# existing file is kept: re-running this cell must not wipe manual corrections.
# Delete a file to have it regenerated.
for review_info, review_file in (
    (pname_combined_info, "pname_correct_info.json"),
    (cityname_combined_info, "cityname_correct_info.json"),
):
    if os.path.exists(review_file):
        print(f"{review_file} already exists; keeping it so manual corrections are not overwritten")
    else:
        write_compact_json(review_info, review_file)

## NOTICE
- At this point the JSON files for `pname` and `cityname` have been written. They must be checked manually and any mistakes corrected (by editing each entry's `correct` field) before the next cell is run.

In [9]:
# Reload the review files (after manual correction) and show the first entry of each.
with open("pname_correct_info.json", "r", encoding='utf-8') as pname_file:
    pname_correct_info_v1 = json.load(pname_file)
with open("cityname_correct_info.json", "r", encoding='utf-8') as cityname_file:
    cityname_correct_info_v1 = json.load(cityname_file)

for reviewed in (pname_correct_info_v1, cityname_correct_info_v1):
    print(list(reviewed.items())[0])

(' ', {'cs_present_years': [2022], 'cs_missing_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2025], 'gs_present_years': [2022], 'gs_missing_years': [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2025], 'correct': '上海市'})
('七台河', {'cs_present_years': [], 'cs_missing_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'gs_present_years': [2013], 'gs_missing_years': [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'correct': '七台河市'})


In [10]:
# Update pname and cityname in datasets using vectorized operations
def update_names_vectorized(data_dict, pname_mapping, cityname_mapping):
    """
    Rewrite the 'pname' and 'cityname' columns of every dataframe in place.

    Values found in the relevant mapping are replaced by their corrected name;
    all other values (including missing ones) are left as they are. A column
    that a dataframe does not have is skipped.

    Args:
        data_dict: Dictionary of dataframes by year
        pname_mapping: Dictionary mapping old pname -> correct pname
        cityname_mapping: Dictionary mapping old cityname -> correct cityname
    """
    column_fixes = (('pname', pname_mapping), ('cityname', cityname_mapping))

    for year in data_dict:
        frame = data_dict[year]
        print(f"Updating year {year}...")

        for column, mapping in column_fixes:
            if column not in frame.columns:
                continue
            current = frame[column]
            # map() yields NaN for unmapped names; fall back to the current value there
            frame[column] = current.map(mapping).fillna(current)

    print("All datasets updated successfully!")

# Build old -> corrected lookups from the reviewed files, keeping only the
# entries whose 'correct' value differs from the original name.
pname_mapping = {}
for old_name, info in pname_correct_info_v1.items():
    if info['correct'] != old_name:
        pname_mapping[old_name] = info['correct']

cityname_mapping = {}
for old_name, info in cityname_correct_info_v1.items():
    if info['correct'] != old_name:
        cityname_mapping[old_name] = info['correct']

print(f"Created pname mapping for {len(pname_mapping)} entries")
print(f"Created cityname mapping for {len(cityname_mapping)} entries")

# Apply the corrections to every yearly frame of both networks
print("Updating CS datasets...")
update_names_vectorized(CS_data_set, pname_mapping, cityname_mapping)

print("Updating GS datasets...")
update_names_vectorized(GS_data_set, pname_mapping, cityname_mapping)

print("All updates completed!")

Created pname mapping for 15 entries
Created cityname mapping for 108 entries
Updating CS datasets...
Updating year 2015...
Updating year 2016...
Updating year 2017...
Updating year 2018...
Updating year 2019...
Updating year 2020...
Updating year 2021...
Updating year 2022...
Updating year 2023...
Updating year 2024...
Updating year 2025...
All datasets updated successfully!
Updating GS datasets...
Updating year 2013...
Updating year 2014...
Updating year 2015...
Updating year 2016...
Updating year 2017...
Updating year 2018...
Updating year 2019...
Updating year 2020...
Updating year 2021...
Updating year 2022...
Updating year 2023...
Updating year 2024...
Updating year 2025...
All datasets updated successfully!
All updates completed!


In [11]:
# Distinct names seen in each network's yearly frames
cs_pnames, cs_citynames = get_unique_names(CS_data_set)
gs_pnames, gs_citynames = get_unique_names(GS_data_set)

# Union across both networks, sorted for stable output
all_pnames = sorted(set(cs_pnames) | set(gs_pnames))
all_citynames = sorted(set(cs_citynames) | set(gs_citynames))

print(f"Total unique province names: {len(all_pnames)}")
print(f"Total unique city names: {len(all_citynames)}")

Total unique province names: 34
Total unique city names: 378


In [12]:
# Spot-check the 2023 province labels of both networks
for data_set in (CS_data_set, GS_data_set):
    print(data_set[2023]['pname'].unique())

['黑龙江省' '海南省' '福建省' '河南省' '上海市' '江西省' '广东省' '山东省' '宁夏回族自治区' '甘肃省' '山西省'
 '云南省' '辽宁省' '浙江省' '内蒙古自治区' '新疆维吾尔自治区' '四川省' '安徽省' '湖北省' '河北省' '贵州省' '北京市'
 '广西壮族自治区' '江苏省' '台湾省' '吉林省' '陕西省' '天津市' '湖南省' '西藏自治区' '青海省' '澳门特别行政区'
 '重庆市' '香港特别行政区']
['黑龙江省' '海南省' '福建省' '河南省' '上海市' '江西省' '广东省' '山东省' '宁夏回族自治区' '甘肃省' '山西省'
 '云南省' '辽宁省' '浙江省' '内蒙古自治区' '新疆维吾尔自治区' '四川省' '安徽省' '湖北省' '河北省' '贵州省' '北京市'
 '广西壮族自治区' '江苏省' '台湾省' '吉林省' '陕西省' '天津市' '湖南省' '西藏自治区' '青海省' '澳门特别行政区'
 '重庆市' '香港特别行政区']


In [13]:
# Remove rows with specific province names (Hong Kong, Macau, Taiwan) using vectorized operations
def remove_specific_provinces(data_dict, provinces_to_remove):
    """
    Remove rows with specific province names from all datasets

    Each affected entry of data_dict is replaced by a filtered dataframe; the
    original dataframe objects are not modified. Dataframes without a 'pname'
    column are left untouched and get no entry in the returned statistics.

    Args:
        data_dict: Dictionary of dataframes by year
        provinces_to_remove: List of province names to remove

    Returns:
        Dictionary mapping year -> {'original', 'remaining', 'removed'} row counts
    """
    removal_stats = {}

    for year, df in data_dict.items():
        if 'pname' not in df.columns:
            continue

        original_length = len(df)

        # Keep only rows whose province is not in the removal list.
        mask = ~df['pname'].isin(provinces_to_remove)
        # .copy() makes the result an independent frame, so later in-place
        # assignments on it do not raise SettingWithCopyWarning.
        filtered_df = df[mask].copy()

        # Replace the dictionary entry (same key, so safe while iterating)
        data_dict[year] = filtered_df

        removed_count = original_length - len(filtered_df)
        removal_stats[year] = {
            'original': original_length,
            'remaining': len(filtered_df),
            'removed': removed_count
        }

        print(f"Year {year}: Removed {removed_count} rows, {len(filtered_df)} rows remaining")

    return removal_stats

# Province labels whose rows are dropped from both networks
provinces_to_remove = ['香港特别行政区', '澳门特别行政区', '台湾省']

print("Removing specified provinces from CS datasets...")
cs_removal_stats = remove_specific_provinces(CS_data_set, provinces_to_remove)

print("\nRemoving specified provinces from GS datasets...")
gs_removal_stats = remove_specific_provinces(GS_data_set, provinces_to_remove)

# Add up the per-year removal counts for the summary
total_cs_removed = sum(year_stats['removed'] for year_stats in cs_removal_stats.values())
total_gs_removed = sum(year_stats['removed'] for year_stats in gs_removal_stats.values())
total_removed = total_cs_removed + total_gs_removed

print(f"\nSummary:")
print(f"Total rows removed from CS datasets: {total_cs_removed}")
print(f"Total rows removed from GS datasets: {total_gs_removed}")
print(f"Total rows removed overall: {total_removed}")

Removing specified provinces from CS datasets...
Year 2015: Removed 183 rows, 1856 rows remaining
Year 2016: Removed 183 rows, 1909 rows remaining
Year 2017: Removed 281 rows, 4252 rows remaining
Year 2018: Removed 420 rows, 33552 rows remaining
Year 2019: Removed 452 rows, 56661 rows remaining
Year 2020: Removed 457 rows, 72863 rows remaining
Year 2021: Removed 820 rows, 93389 rows remaining
Year 2022: Removed 917 rows, 97896 rows remaining
Year 2023: Removed 870 rows, 119790 rows remaining
Year 2024: Removed 854 rows, 184763 rows remaining
Year 2025: Removed 641 rows, 225258 rows remaining

Removing specified provinces from GS datasets...
Year 2013: Removed 225 rows, 101591 rows remaining
Year 2014: Removed 215 rows, 104327 rows remaining
Year 2015: Removed 634 rows, 118011 rows remaining
Year 2016: Removed 637 rows, 119393 rows remaining
Year 2017: Removed 2228 rows, 118441 rows remaining
Year 2018: Removed 273 rows, 107083 rows remaining
Year 2019: Removed 432 rows, 113338 rows rem

In [14]:
# Spot-check the 2023 province labels again now that the listed provinces were removed
for data_set in (CS_data_set, GS_data_set):
    print(data_set[2023]['pname'].unique())

['黑龙江省' '海南省' '福建省' '河南省' '上海市' '江西省' '广东省' '山东省' '宁夏回族自治区' '甘肃省' '山西省'
 '云南省' '辽宁省' '浙江省' '内蒙古自治区' '新疆维吾尔自治区' '四川省' '安徽省' '湖北省' '河北省' '贵州省' '北京市'
 '广西壮族自治区' '江苏省' '吉林省' '陕西省' '天津市' '湖南省' '西藏自治区' '青海省' '重庆市']
['黑龙江省' '海南省' '福建省' '河南省' '上海市' '江西省' '广东省' '山东省' '宁夏回族自治区' '甘肃省' '山西省'
 '云南省' '辽宁省' '浙江省' '内蒙古自治区' '新疆维吾尔自治区' '四川省' '安徽省' '湖北省' '河北省' '贵州省' '北京市'
 '广西壮族自治区' '江苏省' '吉林省' '陕西省' '天津市' '湖南省' '西藏自治区' '青海省' '重庆市']


In [15]:
# Distinct names seen in each network's yearly frames
cs_pnames, cs_citynames = get_unique_names(CS_data_set)
gs_pnames, gs_citynames = get_unique_names(GS_data_set)

# Union across both networks, sorted for stable output
all_pnames = sorted(set(cs_pnames) | set(gs_pnames))
all_citynames = sorted(set(cs_citynames) | set(gs_citynames))

print(f"Total unique province names: {len(all_pnames)}")
print(f"Total unique city names: {len(all_citynames)}")

Total unique province names: 31
Total unique city names: 375


In [16]:
# Years covered by each network's snapshots
cs_years = list(range(2015, 2026))  # 2015-2025
gs_years = list(range(2013, 2026))  # 2013-2025

# Presence/absence of every name in the CS snapshots
cs_pname_missing = check_missing_years(
    data_dict=CS_data_set, names_list=all_pnames,
    name_column='pname', available_years=cs_years)
cs_cityname_missing = check_missing_years(
    data_dict=CS_data_set, names_list=all_citynames,
    name_column='cityname', available_years=cs_years)

# Presence/absence of every name in the GS snapshots
gs_pname_missing = check_missing_years(
    data_dict=GS_data_set, names_list=all_pnames,
    name_column='pname', available_years=gs_years)
gs_cityname_missing = check_missing_years(
    data_dict=GS_data_set, names_list=all_citynames,
    name_column='cityname', available_years=gs_years)

print("Missing year analysis completed!")

Missing year analysis completed!


In [17]:
# Rebuild the per-province review records from the latest missing-year analysis;
# 'correct' is pre-filled with the name itself.
pname_combined_info = {
    pname: {
        'cs_present_years': cs_pname_missing[pname]['present_years'],
        'cs_missing_years': cs_pname_missing[pname]['missing_years'],
        'gs_present_years': gs_pname_missing[pname]['present_years'],
        'gs_missing_years': gs_pname_missing[pname]['missing_years'],
        'correct': pname,
    }
    for pname in all_pnames
}

print(f"Combined province name information created for {len(pname_combined_info)} provinces")
print("Sample entry:", list(pname_combined_info.items())[0])

# Same record layout for every city name
cityname_combined_info = {
    cityname: {
        'cs_present_years': cs_cityname_missing[cityname]['present_years'],
        'cs_missing_years': cs_cityname_missing[cityname]['missing_years'],
        'gs_present_years': gs_cityname_missing[cityname]['present_years'],
        'gs_missing_years': gs_cityname_missing[cityname]['missing_years'],
        'correct': cityname,
    }
    for cityname in all_citynames
}

print(f"Combined city name information created for {len(cityname_combined_info)} cities")
print("Sample entry:", list(cityname_combined_info.items())[0])

Combined province name information created for 31 provinces
Sample entry: ('上海市', {'cs_present_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'cs_missing_years': [], 'gs_present_years': [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'gs_missing_years': [], 'correct': '上海市'})
Combined city name information created for 375 cities
Sample entry: ('七台河市', {'cs_present_years': [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'cs_missing_years': [], 'gs_present_years': [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025], 'gs_missing_years': [], 'correct': '七台河市'})


In [21]:
def check_missing_names(data_dict):
    """Print and return the entries that have missing years in BOTH CS and GS.

    Args:
        data_dict: Dictionary mapping name -> record with 'cs_missing_years'
            and 'gs_missing_years' lists

    Returns:
        Dictionary holding only the records whose two missing-year lists are
        both non-empty (the record objects themselves, not copies).
    """
    flagged = {}
    for name, info in data_dict.items():
        # Names complete in either network are not reported
        if len(info['cs_missing_years']) == 0:
            continue
        if len(info['gs_missing_years']) == 0:
            continue
        print(f"{name}: CS missing years: {info['cs_missing_years']}, GS missing years: {info['gs_missing_years']}")
        flagged[name] = info
    return flagged


In [22]:
# Names that are absent from at least one year in both networks
pname_missing, cityname_missing = (
    check_missing_names(pname_combined_info),
    check_missing_names(cityname_combined_info),
)

广东省: CS missing years: [2025], GS missing years: [2025]
广西壮族自治区: CS missing years: [2025], GS missing years: [2025]
万宁市: CS missing years: [2015, 2016, 2017, 2019], GS missing years: [2013, 2014, 2015, 2016, 2017]
三沙市: CS missing years: [2015, 2016, 2017, 2018, 2019, 2020, 2022], GS missing years: [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
东方市: CS missing years: [2015, 2016, 2017, 2019], GS missing years: [2013, 2014, 2015, 2016, 2017]
东莞市: CS missing years: [2025], GS missing years: [2025]
中山市: CS missing years: [2025], GS missing years: [2025]
临高县: CS missing years: [2015, 2016, 2017], GS missing years: [2013, 2014, 2015, 2016, 2017]
乐东黎族自治县: CS missing years: [2015, 2016, 2017], GS missing years: [2013, 2014, 2015, 2016, 2017]
云浮市: CS missing years: [2025], GS missing years: [2025]
五家渠市: CS missing years: [2015, 2016, 2017], GS missing years: [2013, 2014, 2015, 2016, 2017]
五指山市: CS missing years: [2015, 2016, 2017], GS missing years: [2013, 2014, 

In [24]:
# How many province / city names were flagged
for flagged in (pname_missing, cityname_missing):
    print(len(flagged))

2
74


In [23]:
# Save the flagged names with compact list formatting
for flagged, out_name in (
    (pname_missing, "pname_missing_info_v2.json"),
    (cityname_missing, "cityname_missing_info_v2.json"),
):
    write_compact_json(flagged, out_name)


In [30]:
# Inspect the 2016 GS rows whose cityname holds the province label
gs_2016 = GS_data_set[2016]
gs_2016[gs_2016['cityname'] == '新疆维吾尔自治区']

Unnamed: 0,name,address,wgs84_x,wgs84_y,tel,pname,cityname,adname,大类,中类,小类
4649,十六团加油站,207省道,80.835777,40.495087,,新疆维吾尔自治区,新疆维吾尔自治区,阿拉尔市,汽车服务,加油站,加油站
4650,金源加油站(黑孜洞派出所东),207省道附近,80.797974,40.644142,,新疆维吾尔自治区,新疆维吾尔自治区,阿拉尔市,汽车服务,加油站,加油站
4651,阿拉尔南口加气站,207省道附近,81.315575,40.510029,,新疆维吾尔自治区,新疆维吾尔自治区,阿拉尔市,汽车服务,加气站,加气站
4652,中国石油加油站(口九段),南口镇;长安路与308省道交叉口西北方向,81.313881,40.511959,,新疆维吾尔自治区,新疆维吾尔自治区,阿拉尔市,汽车服务,加油站,中国石油
4653,超祥加油站,207省道依兰勒克路口;九团水管站附近,81.178398,40.557003,,新疆维吾尔自治区,新疆维吾尔自治区,阿拉尔市,汽车服务,加油站,加油站
...,...,...,...,...,...,...,...,...,...,...,...
86744,中国石油一八三团加油站,军垦路附近,88.085930,47.264969,,新疆维吾尔自治区,新疆维吾尔自治区,北屯市,汽车服务,加油站,中国石油
86745,中国石油青河农场加油站,一八三团东场附近,88.334648,47.231735,,新疆维吾尔自治区,新疆维吾尔自治区,北屯市,汽车服务,加油站,中国石油
86746,广汇能源北屯站,319省道附近,87.783295,47.371914,,新疆维吾尔自治区,新疆维吾尔自治区,北屯市,汽车服务,加气站,加气站
86747,新疆新捷北屯加气站,318省道附近,87.774384,47.324032,,新疆维吾尔自治区,新疆维吾尔自治区,北屯市,汽车服务,加气站,加气站


In [33]:
# For rows whose cityname holds one of these province labels, take the city
# from adname instead; the old label '那曲地区' is renamed to '那曲市'.
province_placeholders = ['新疆维吾尔自治区', '河南省', '海南省', '湖北省']

for year, df in GS_data_set.items():
    for placeholder in province_placeholders:
        mask = df['cityname'] == placeholder
        df.loc[mask, 'cityname'] = df.loc[mask, 'adname']
        print(f"Year {year}: Updated {mask.sum()} rows")
    mask = df['cityname'] == '那曲地区'
    df.loc[mask, 'cityname'] = '那曲市'
    print(f"Year {year}: Updated {mask.sum()} rows")

Year 2013: Updated 76 rows
Year 2013: Updated 102 rows
Year 2013: Updated 267 rows
Year 2013: Updated 177 rows
Year 2013: Updated 11 rows
Year 2014: Updated 89 rows
Year 2014: Updated 105 rows
Year 2014: Updated 388 rows
Year 2014: Updated 229 rows
Year 2014: Updated 11 rows
Year 2015: Updated 87 rows
Year 2015: Updated 110 rows
Year 2015: Updated 365 rows
Year 2015: Updated 283 rows
Year 2015: Updated 29 rows
Year 2016: Updated 87 rows
Year 2016: Updated 111 rows
Year 2016: Updated 371 rows
Year 2016: Updated 285 rows
Year 2016: Updated 29 rows
Year 2017: Updated 104 rows
Year 2017: Updated 105 rows
Year 2017: Updated 351 rows
Year 2017: Updated 293 rows
Year 2017: Updated 34 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2020: Updated 0 rows
Year 2020: Updated 0 ro

In [34]:
# For rows whose cityname holds one of these province labels, take the city
# from adname instead; the old label '那曲地区' is renamed to '那曲市'.
province_placeholders = ['新疆维吾尔自治区', '河南省', '海南省', '湖北省']

for year, df in CS_data_set.items():
    for placeholder in province_placeholders:
        mask = df['cityname'] == placeholder
        df.loc[mask, 'cityname'] = df.loc[mask, 'adname']
        print(f"Year {year}: Updated {mask.sum()} rows")
    mask = df['cityname'] == '那曲地区'
    df.loc[mask, 'cityname'] = '那曲市'
    print(f"Year {year}: Updated {mask.sum()} rows")

Year 2015: Updated 1 rows
Year 2015: Updated 3 rows
Year 2015: Updated 4 rows
Year 2015: Updated 1 rows
Year 2015: Updated 3 rows
Year 2016: Updated 1 rows
Year 2016: Updated 3 rows
Year 2016: Updated 4 rows
Year 2016: Updated 1 rows
Year 2016: Updated 3 rows
Year 2017: Updated 1 rows
Year 2017: Updated 3 rows
Year 2017: Updated 18 rows
Year 2017: Updated 2 rows
Year 2017: Updated 2 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2020: Updated 0 rows
Year 2020: Updated 0 rows
Year 2020: Updated 0 rows
Year 2020: Updated 0 rows
Year 2020: Updated 0 rows
Year 2021: Updated 0 rows
Year 2021: Updated 0 rows
Year 2021: Updated 0 rows
Year 2021: Updated 0 rows
Year 2021: Updated 0 rows
Year 2022: Updated 0 rows
Year 2022: Updated 0 rows
Year 2022: Updated 0 rows
Year 2022: 

In [35]:
# Cleaned frames are written next to the inputs, in one "<network>_cleaned" folder each.
base_dir = "C:/Users/31155/Dropbox/EV-GasDualNetwork/Data/intermediate/yiwei/intermidiate"
CS_output_data_dir, GS_output_data_dir = (
    f"{base_dir}/{network}_cleaned/" for network in ("CS", "GS")
)

In [36]:
# Write each cleaned yearly frame to its network's output folder (index not stored).
for year, frame in CS_data_set.items():
    frame.to_parquet(f"{CS_output_data_dir}/{year}.parquet", index=False)
    print(f"CS cleaned data {year} saved. Length: {len(frame)}")

for year, frame in GS_data_set.items():
    frame.to_parquet(f"{GS_output_data_dir}/{year}.parquet", index=False)
    print(f"GS cleaned data {year} saved. Length: {len(frame)}")


CS cleaned data 2015 saved. Length: 1856
CS cleaned data 2016 saved. Length: 1909
CS cleaned data 2017 saved. Length: 4252
CS cleaned data 2018 saved. Length: 33552
CS cleaned data 2019 saved. Length: 56661
CS cleaned data 2020 saved. Length: 72863
CS cleaned data 2021 saved. Length: 93389
CS cleaned data 2022 saved. Length: 97896
CS cleaned data 2023 saved. Length: 119790
CS cleaned data 2024 saved. Length: 184763
CS cleaned data 2025 saved. Length: 225258
GS cleaned data 2013 saved. Length: 101591
GS cleaned data 2014 saved. Length: 104327
GS cleaned data 2015 saved. Length: 118011
GS cleaned data 2016 saved. Length: 119393
GS cleaned data 2017 saved. Length: 118441
GS cleaned data 2018 saved. Length: 107083
GS cleaned data 2019 saved. Length: 113338
GS cleaned data 2020 saved. Length: 119894
GS cleaned data 2021 saved. Length: 121589
GS cleaned data 2022 saved. Length: 110988
GS cleaned data 2023 saved. Length: 118521
GS cleaned data 2024 saved. Length: 118537
GS cleaned data 2025 s

In [37]:
# Distinct names seen in each network's yearly frames
cs_pnames, cs_citynames = get_unique_names(CS_data_set)
gs_pnames, gs_citynames = get_unique_names(GS_data_set)

# Union across both networks, sorted for stable output
all_pnames = sorted(set(cs_pnames) | set(gs_pnames))
all_citynames = sorted(set(cs_citynames) | set(gs_citynames))

print(f"Total unique province names: {len(all_pnames)}")
print(f"Total unique city names: {len(all_citynames)}")

Total unique province names: 31
Total unique city names: 385


In [38]:
# Print the full sorted list of city names so remaining variants can be spotted by eye
print(all_citynames)

['七台河市', '万宁市', '三亚市', '三明市', '三沙市', '三门峡市', '上海市', '上饶市', '东方市', '东莞市', '东营市', '中卫市', '中山市', '临夏回族自治州', '临汾市', '临沂市', '临沧市', '临高县', '丹东市', '丽水市', '丽江市', '乌兰察布市', '乌海市', '乌鲁木齐市', '乐东黎族', '乐东黎族自治', '乐东黎族自治县', '乐山市', '九江市', '云浮市', '五家渠市', '五指山市', '亳州市', '仙桃市', '伊春市', '伊犁哈萨克自治州', '佛山市', '佳木斯市', '保亭黎族', '保亭黎族苗族', '保亭黎族苗族自治县', '保定市', '保山市', '信阳市', '儋州市', '克孜勒苏柯尔克孜自治州', '克拉玛依市', '六安市', '六盘水市', '兰州市', '兴安盟', '内江市', '凉山彝族自治州', '包头市', '北京市', '北屯市', '北海市', '十堰市', '南京市', '南充市', '南宁市', '南平市', '南昌市', '南通市', '南阳市', '博尔塔拉蒙古自治州', '厦门市', '双河市', '双鸭山市', '可克达拉', '可克达拉市', '台州市', '合肥市', '吉安市', '吉林市', '吐鲁番市', '吕梁市', '吴忠市', '周口市', '呼伦贝尔市', '呼和浩特市', '和田地区', '咸宁市', '咸阳市', '哈密市', '哈尔滨市', '唐山市', '商丘市', '商洛市', '喀什地区', '嘉兴市', '嘉峪关市', '四平市', '固原市', '图木舒克', '图木舒克市', '塔城地区', '大兴安岭地区', '大同市', '大庆市', '大理白族自治州', '大连市', '天水市', '天津市', '天门市', '太原市', '威海市', '娄底市', '孝感市', '宁德市', '宁波市', '安庆市', '安康市', '安阳市', '安顺市', '定安县', '定西市', '宜宾市', '宜昌市', '宜春市', '宝鸡市', '宣城市', '宿州市', '宿迁市', '屯昌县', '山南市', '岳阳市', '崇左市', '巴中市', '巴彦淖尔市', '巴音郭楞蒙

In [42]:
# Full city/county name -> shortened variants found in the data. Every variant
# is rewritten to the full name, first in all CS frames and then in all GS frames.
truncated_city_names = {
    '乐东黎族自治县': ['乐东黎族', '乐东黎族自治'],
    '保亭黎族苗族自治县': ['保亭黎族', '保亭黎族苗族'],
    '可克达拉市': ['可克达拉'],
    '图木舒克市': ['图木舒克'],
    '昌江黎族自治县': ['昌江黎族', '昌江黎族自治'],
    '琼中黎族苗族自治县': ['琼中黎族', '琼中黎族苗族'],
    '白沙黎族苗族自治县': ['白沙黎族', '白沙黎族苗族'],
    '神农架林区': ['神农架林'],
    '陵水黎族自治县': ['陵水黎族', '陵水黎族自治'],
}

for data_set in (CS_data_set, GS_data_set):
    for year, df in data_set.items():
        for full_name, variants in truncated_city_names.items():
            mask = df['cityname'].isin(variants)
            df.loc[mask, 'cityname'] = full_name
            print(f"Year {year}: Updated {mask.sum()} rows")

Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2015: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2016: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 0 rows
Year 2017: Updated 5 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2018: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: Updated 0 rows
Year 2019: U

In [43]:
# Distinct names seen in each network's yearly frames
cs_pnames, cs_citynames = get_unique_names(CS_data_set)
gs_pnames, gs_citynames = get_unique_names(GS_data_set)

# Union across both networks, sorted for stable output
all_pnames = sorted(set(cs_pnames) | set(gs_pnames))
all_citynames = sorted(set(cs_citynames) | set(gs_citynames))

print(f"Total unique province names: {len(all_pnames)}")
print(f"Total unique city names: {len(all_citynames)}")

Total unique province names: 31
Total unique city names: 372


In [44]:
# Write each cleaned yearly frame again, replacing the files saved earlier (index not stored).
for year, frame in CS_data_set.items():
    frame.to_parquet(f"{CS_output_data_dir}/{year}.parquet", index=False)
    print(f"CS cleaned data {year} saved. Length: {len(frame)}")

for year, frame in GS_data_set.items():
    frame.to_parquet(f"{GS_output_data_dir}/{year}.parquet", index=False)
    print(f"GS cleaned data {year} saved. Length: {len(frame)}")


CS cleaned data 2015 saved. Length: 1856
CS cleaned data 2016 saved. Length: 1909
CS cleaned data 2017 saved. Length: 4252
CS cleaned data 2018 saved. Length: 33552
CS cleaned data 2019 saved. Length: 56661
CS cleaned data 2020 saved. Length: 72863
CS cleaned data 2021 saved. Length: 93389
CS cleaned data 2022 saved. Length: 97896
CS cleaned data 2023 saved. Length: 119790
CS cleaned data 2024 saved. Length: 184763
CS cleaned data 2025 saved. Length: 225258
GS cleaned data 2013 saved. Length: 101591
GS cleaned data 2014 saved. Length: 104327
GS cleaned data 2015 saved. Length: 118011
GS cleaned data 2016 saved. Length: 119393
GS cleaned data 2017 saved. Length: 118441
GS cleaned data 2018 saved. Length: 107083
GS cleaned data 2019 saved. Length: 113338
GS cleaned data 2020 saved. Length: 119894
GS cleaned data 2021 saved. Length: 121589
GS cleaned data 2022 saved. Length: 110988
GS cleaned data 2023 saved. Length: 118521
GS cleaned data 2024 saved. Length: 118537
GS cleaned data 2025 s