In [1]:
import pandas as pd
import difflib
from difflib import SequenceMatcher
import warnings
warnings.filterwarnings("ignore")
import os
import re
import json

### Matching labels for each pair of years

In [48]:
# Base years to process; each base year is compared against the following one
# (the loop below reads the files for `year` and `year + 1`).
# range(2014, 2015) yields only 2014, so this run covers the 2014 -> 2015 pair.
years = range(2014, 2015)
# Directory path string. It is not referenced in the portion of the notebook shown
# here; presumably the output location for the year-to-year results -- TODO confirm.
st_directory = f'../YtoY_diff'

for year in years:

    # File paths for the year1 and year2 data
    file_year1_path = f'../Metadata/ACS-ST5Y{year}-Metadata-type.csv'
    file_year2_path = f'../Metadata/ACS-ST5Y{year+1}-Metadata-type.csv'

    # Read the files into dataframes (no rows are skipped; the first line is used as the header)
    df_year1 = pd.read_csv(file_year1_path)
    df_year2 = pd.read_csv(file_year2_path)
    
    # Make all label and concept columns uppercase
    df_year1['label'] = df_year1['label'].str.upper()
    df_year2['label'] = df_year2['label'].str.upper()
    
    df_year1['concept'] = df_year1['concept'].str.upper()
    df_year2['concept'] = df_year2['concept'].str.upper()
    
    # Regular expression pattern to remove '[YEAR] INFLATION-ADJUSTED DOLLARS'
    pattern1 = r'\(IN 20(1[0-9]|2[0-9]) INFLATION-ADJUSTED DOLLARS\)'
    pattern2 = r'\(IN 20(1[0-9]|2[0-9]) INFLATION ADJUSTED DOLLARS\)'

    # Replace the matched pattern with an empty string
    df_year1['label'] = df_year1['label'].replace(pattern1, '', regex=True)
    df_year2['label'] = df_year2['label'].replace(pattern1, '', regex=True)
    df_year1['label'] = df_year1['label'].replace(pattern2, '', regex=True)
    df_year2['label'] = df_year2['label'].replace(pattern2, '', regex=True)

    df_year1['concept'] = df_year1['concept'].replace(pattern1, '', regex=True)
    df_year2['concept'] = df_year2['concept'].replace(pattern1, '', regex=True)
    df_year1['concept'] = df_year1['concept'].replace(pattern2, '', regex=True)
    df_year2['concept'] = df_year2['concept'].replace(pattern2, '', regex=True)

    # Collapse every run of two or more spaces into a single space.
    # A literal two-space pattern only halves longer runs (three spaces would
    # still leave two behind), so the quantified form is used instead.
    multi_space = r' {2,}'
    df_year1['label'] = df_year1['label'].replace(multi_space, ' ', regex=True)
    df_year2['label'] = df_year2['label'].replace(multi_space, ' ', regex=True)
    df_year1['concept'] = df_year1['concept'].replace(multi_space, ' ', regex=True)
    df_year2['concept'] = df_year2['concept'].replace(multi_space, ' ', regex=True)

    # This is done because 2015~2016 concept is named differently from 2017~2022
    if year == 2016:
        df_year1.loc[df_year1['concept'] == 'PRIVATE HEALTH INSURANCE COVERAGE BY TYPE', 'concept'] = 'PRIVATE HEALTH INSURANCE COVERAGE BY TYPE AND SELECTED CHARACTERISTICS'
        df_year1.loc[df_year1['concept'] == 'PUBLIC HEALTH INSURANCE COVERAGE BY TYPE', 'concept'] = 'PUBLIC HEALTH INSURANCE COVERAGE BY TYPE AND SELECTED CHARACTERISTICS'
    
    # Filter out rows whose first column (the variable ID) contains "PR" (refers to Puerto Rico)
    df_year1 = df_year1[~df_year1.iloc[:, 0].str.contains(("PR"))]
    df_year2 = df_year2[~df_year2.iloc[:, 0].str.contains(("PR"))]
    
    # Filter out 'name' containing 'S2601B' (Only exist in 2015/2016)
    df_year1 = df_year1[~df_year1['name'].str.contains('S2601B')]   
    
    # Function to swap the first and second text segments
    def second_to_first(text):
        """Return `text` with its first two '!!'-separated segments swapped.

        Example: 'TOTAL!!ESTIMATE!!AGE' -> 'ESTIMATE!!TOTAL!!AGE'.
        A label with fewer than two segments has nothing to swap and is
        returned unchanged instead of raising IndexError.
        """
        parts = text.split('!!')
        if len(parts) < 2:
            return text
        parts[0], parts[1] = parts[1], parts[0]
        return '!!'.join(parts)
    
    # Function to move the third text segment to the front (the first and second segments shift one place right)
    def third_to_first(text):
        """Move the third '!!'-separated segment of `text` to the front.

        This is a rotation of the first three segments, not a swap: the first
        and second segments each shift one position to the right, e.g.
        'A!!B!!C!!D' -> 'C!!A!!B!!D'. Segments after the third are untouched.
        A label with fewer than three segments is returned unchanged instead
        of raising IndexError.
        """
        parts = text.split('!!')
        if len(parts) < 3:
            return text
        parts[:3] = [parts[2], parts[0], parts[1]]
        return '!!'.join(parts)

    # Bring the 'ESTIMATE' segment to the front of each label when it sits in the
    # second or the third '!!'-separated position. The second-position pass runs
    # first, and the third-position pass then works on its result.
    def _has_estimate_at(label, position):
        """True if `label` has a segment at `position` and it contains 'ESTIMATE'.

        Labels with too few segments simply yield False, so they are left
        untouched instead of raising IndexError during the position lookup.
        """
        parts = label.split('!!')
        return len(parts) > position and 'ESTIMATE' in parts[position]

    df_year1['label'] = df_year1['label'].apply(lambda x: second_to_first(x) if _has_estimate_at(x, 1) else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: second_to_first(x) if _has_estimate_at(x, 1) else x)

    df_year1['label'] = df_year1['label'].apply(lambda x: third_to_first(x) if _has_estimate_at(x, 2) else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: third_to_first(x) if _has_estimate_at(x, 2) else x)
        
    # Replace colons with a space in rows containing 'A.M.' or 'P.M.'; remove colons outright in rows containing 'GRADE', 'EARNINGS' or 'BIRTH'
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace(':', ' ') if 'A.M.' in x or 'P.M.' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace(':', ' ') if 'A.M.' in x or 'P.M.' in x else x)

    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace(':', '') if 'GRADE' in x or 'EARNINGS' in x or 'BIRTH' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace(':', '') if 'GRADE' in x or 'EARNINGS' in x or 'BIRTH' in x else x)
    
    # Change all plural MALES, FEMALES to singular
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace('MALES', 'MALE') if 'MALES' in x or 'FEMALES' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace('MALES', 'MALE') if 'MALES' in x or 'FEMALES' in x else x)
    
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace('CIVILIAN GRANDCHILDREN', 'GRANDCHILDREN'))
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace('CIVILIAN GRANDCHILDREN', 'GRANDCHILDREN'))
    
    df_year1['label'] = df_year1['label'].replace('BOTH SPOUSES', 'HUSBAND AND WIFE', regex=True)
    df_year2['label'] = df_year2['label'].replace('BOTH SPOUSES', 'HUSBAND AND WIFE', regex=True)
    
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace('SPOUSE', 'HUSBAND') if 'FEMALE HOUSEHOLDER' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace('SPOUSE', 'HUSBAND') if 'FEMALE HOUSEHOLDER' in x else x)
    
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace('SPOUSE', 'WIFE') if ' MALE HOUSEHOLDER' in x or '!!MALE HOUSEHOLDER' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace('SPOUSE', 'WIFE') if ' MALE HOUSEHOLDER' in x or '!!MALE HOUSEHOLDER' in x else x)
    
    # Remove unnecessary colons
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace(':', '') if 'OTHER FAMILY:' in x or 'ASSISTANCE:' in x or 'OCCUPATIONS:' in x or 'UTILITIES:' in x or 'SERVICES:' in x or 'MINING:' in x or 'LEASING:' in x or 'WORKERS:' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace(':', '') if 'OTHER FAMILY:' in x or 'ASSISTANCE:' in x or 'OCCUPATIONS:' in x or 'UTILITIES:' in x or 'SERVICES:' in x or 'MINING:' in x or 'LEASING:' in x or 'WORKERS:' in x else x)
    
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace(':', '') if 'DEVICES:' in x or 'SUBSCRIPTION:' in x or '$20,000:' in x or 'MORE:' in x or '$74,999:' in x or '12 MONTHS:' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace(':', '') if 'DEVICES:' in x or 'SUBSCRIPTION:' in x or '$20,000:' in x or 'MORE:' in x or '$74,999:' in x or '12 MONTHS:' in x else x)
        
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace('FROM HOME', 'AT HOME') if 'FROM HOME' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace('FROM HOME', 'AT HOME') if 'FROM HOME' in x else x)
    
    df_year1['label'] = df_year1['label'].apply(lambda x: x.replace('TIME OF DEPARTURE TO GO TO WORK', 'TIME LEAVING HOME TO GO TO WORK') if 'TIME OF DEPARTURE TO GO TO WORK' in x else x)
    df_year2['label'] = df_year2['label'].apply(lambda x: x.replace('TIME OF DEPARTURE TO GO TO WORK', 'TIME LEAVING HOME TO GO TO WORK') if 'TIME OF DEPARTURE TO GO TO WORK' in x else x)
    
    df_year1['label'] = df_year1['label'].replace('!!MALE REFERENCE PERSON, NO SPOUSE PRESENT', '!!MALE HOUSEHOLDER, NO WIFE PRESENT', regex=True)
    df_year2['label'] = df_year2['label'].replace('!!MALE REFERENCE PERSON, NO SPOUSE PRESENT', '!!MALE HOUSEHOLDER, NO WIFE PRESENT', regex=True)
    
    df_year1['label'] = df_year1['label'].replace('FEMALE REFERENCE PERSON, NO SPOUSE PRESENT', 'FEMALE HOUSEHOLDER, NO HUSBAND PRESENT', regex=True)
    df_year2['label'] = df_year2['label'].replace('FEMALE REFERENCE PERSON, NO SPOUSE PRESENT', 'FEMALE HOUSEHOLDER, NO HUSBAND PRESENT', regex=True)
    
    df_year1['label'] = df_year1['label'].replace('FIRE FIGHTING', 'FIREFIGHTING', regex=True)
    df_year2['label'] = df_year2['label'].replace('FIRE FIGHTING', 'FIREFIGHTING', regex=True)
    
    df_year1['label'] = df_year1['label'].replace('HEALTHCARE PRACTITIONERS', 'HEALTHCARE PRACTITIONER', regex=True)
    df_year2['label'] = df_year2['label'].replace('HEALTHCARE PRACTITIONERS', 'HEALTHCARE PRACTITIONER', regex=True)
    
    df_year1['label'] = df_year1['label'].replace('EDUCATIONAL INSTRUCTION', 'EDUCATION, TRAINING', regex=True)
    df_year2['label'] = df_year2['label'].replace('EDUCATIONAL INSTRUCTION', 'EDUCATION, TRAINING', regex=True)

    df_year1['label'] = df_year1['label'].replace('SOCIAL SERVICE OCCUPATIONS', 'SOCIAL SERVICES OCCUPATIONS', regex=True)
    df_year2['label'] = df_year2['label'].replace('SOCIAL SERVICE OCCUPATIONS', 'SOCIAL SERVICES OCCUPATIONS', regex=True)

    df_year1['label'] = df_year1['label'].replace('PERCENT IMPUTED', 'PERCENT ALLOCATED', regex=True)
    df_year2['label'] = df_year2['label'].replace('PERCENT IMPUTED', 'PERCENT ALLOCATED', regex=True)

    df_year1['label'] = df_year1['label'].replace('!!MARRIED-COUPLE FAMILIES', '!!OPPOSITE-SEX MARRIED-COUPLE FAMILIES', regex=True)
    df_year2['label'] = df_year2['label'].replace('!!MARRIED-COUPLE FAMILIES', '!!OPPOSITE-SEX MARRIED-COUPLE FAMILIES', regex=True)

    df_year1['label'] = df_year1['label'].replace('TIME ARRIVING AT WORK AT HOME!!', 'TIME ARRIVING AT WORK!!', regex=True)
    df_year2['label'] = df_year2['label'].replace('TIME ARRIVING AT WORK AT HOME!!', 'TIME ARRIVING AT WORK!!', regex=True)

    df_year1.loc[df_year1['concept'] == 'DISABILITY CHARACTERISTICS', 'label'] = df_year1['label'].str.replace('SUBJECT', 'TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATION')
    df_year2.loc[df_year2['concept'] == 'DISABILITY CHARACTERISTICS', 'label'] = df_year2['label'].str.replace('SUBJECT', 'TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATION')

    df_year1.loc[df_year1['concept'] == 'TYPES OF COMPUTERS AND INTERNET SUBSCRIPTIONS', 'label'] = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT!!', 'ESTIMATE!!PERCENT!!TOTAL HOUSEHOLDS!!') if 'TOTAL HOUSEHOLDS' not in x else x)
    df_year2.loc[df_year2['concept'] == 'TYPES OF COMPUTERS AND INTERNET SUBSCRIPTIONS', 'label'] = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT!!', 'ESTIMATE!!PERCENT!!TOTAL HOUSEHOLDS!!') if 'TOTAL HOUSEHOLDS' not in x else x)
    df_year1.loc[df_year1['concept'] == 'TYPES OF COMPUTERS AND INTERNET SUBSCRIPTIONS', 'label'] = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!TOTAL!!', 'ESTIMATE!!TOTAL!!TOTAL HOUSEHOLDS!!') if 'TOTAL HOUSEHOLDS' not in x else x)
    df_year2.loc[df_year2['concept'] == 'TYPES OF COMPUTERS AND INTERNET SUBSCRIPTIONS', 'label'] = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!TOTAL!!', 'ESTIMATE!!TOTAL!!TOTAL HOUSEHOLDS!!') if 'TOTAL HOUSEHOLDS' not in x else x)

    df_year1.loc[df_year1['concept'] == 'EDUCATIONAL ATTAINMENT', 'label'] = df_year1['label'].apply(lambda x: x.replace('!!POPULATION', '!!AGE BY EDUCATIONAL ATTAINMENT!!POPULATION') if 'AGE BY EDUCATIONAL ATTAINMENT' not in x else x)
    df_year2.loc[df_year2['concept'] == 'EDUCATIONAL ATTAINMENT', 'label'] = df_year2['label'].apply(lambda x: x.replace('!!POPULATION', '!!AGE BY EDUCATIONAL ATTAINMENT!!POPULATION') if 'AGE BY EDUCATIONAL ATTAINMENT' not in x else x)

    df_year1.loc[df_year1['concept'] == 'EDUCATIONAL ATTAINMENT', 'label'] = df_year1['label'].apply(lambda x: x.replace('AGE BY EDUCATIONAL ATTAINMENT!!', 'AGE BY EDUCATIONAL ATTAINMENT!!POPULATION 25 YEARS AND OVER!!') if 'POPULATION 25 YEARS AND OVER' not in x else x)
    df_year2.loc[df_year2['concept'] == 'EDUCATIONAL ATTAINMENT', 'label'] = df_year2['label'].apply(lambda x: x.replace('AGE BY EDUCATIONAL ATTAINMENT!!', 'AGE BY EDUCATIONAL ATTAINMENT!!POPULATION 25 YEARS AND OVER!!') if 'POPULATION 25 YEARS AND OVER' not in x else x)

    df_year1.loc[df_year1['concept'] == 'AGE AND SEX', 'label'] = df_year1['label'].apply(lambda x: x.replace('PERCENT ALLOCATED!!', 'TOTAL POPULATION!!PERCENT ALLOCATED!!') if 'TOTAL POPULATION!!' not in x else x)
    df_year2.loc[df_year2['concept'] == 'AGE AND SEX', 'label'] = df_year2['label'].apply(lambda x: x.replace('PERCENT ALLOCATED!!', 'TOTAL POPULATION!!PERCENT ALLOCATED!!') if 'TOTAL POPULATION!!' not in x else x)

    df_year1.loc[df_year1['concept'] == 'LANGUAGE SPOKEN AT HOME', 'label'] = df_year1['label'].str.replace('!!POPULATION 5 YEARS AND OVER!!', '!!CITIZENS 18 YEARS AND OVER!!')
    df_year2.loc[df_year2['concept'] == 'LANGUAGE SPOKEN AT HOME', 'label'] = df_year2['label'].str.replace('!!POPULATION 5 YEARS AND OVER!!', '!!CITIZENS 18 YEARS AND OVER!!')

    df_year1.loc[(df_year1['concept'] == 'FINANCIAL CHARACTERISTICS FOR HOUSING UNITS WITHOUT A MORTGAGE') & (df_year1['type'] == 'float'), 'label'] \
                = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!', 'ESTIMATE!!PERCENT OWNER-OCCUPIED HOUSING UNITS WITHOUT A MORTGAGE!!') if 'PERCENT OWNER-OCCUPIED HOUSING UNITS WITHOUT A MORTGAGE' not in x else x)
    df_year2.loc[(df_year2['concept'] == 'FINANCIAL CHARACTERISTICS FOR HOUSING UNITS WITHOUT A MORTGAGE') & (df_year2['type'] == 'float'), 'label'] \
                = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!', 'ESTIMATE!!PERCENT OWNER-OCCUPIED HOUSING UNITS WITHOUT A MORTGAGE!!') if 'PERCENT OWNER-OCCUPIED HOUSING UNITS WITHOUT A MORTGAGE' not in x else x)
    
    df_year1.loc[(df_year1['concept'] == 'FINANCIAL CHARACTERISTICS FOR HOUSING UNITS WITH A MORTGAGE') & (df_year1['type'] == 'float'), 'label'] \
                = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!', 'ESTIMATE!!PERCENT OWNER-OCCUPIED HOUSING UNITS WITH A MORTGAGE!!') if 'PERCENT OWNER-OCCUPIED HOUSING UNITS WITH A MORTGAGE' not in x else x)
    df_year2.loc[(df_year2['concept'] == 'FINANCIAL CHARACTERISTICS FOR HOUSING UNITS WITH A MORTGAGE') & (df_year2['type'] == 'float'), 'label'] \
                = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!', 'ESTIMATE!!PERCENT OWNER-OCCUPIED HOUSING UNITS WITH A MORTGAGE!!') if 'PERCENT OWNER-OCCUPIED HOUSING UNITS WITH A MORTGAGE' not in x else x)

    df_year1['label'] = df_year1['label'].replace('!!RENTER-OCCUPIED HOUSING UNITS!!', '!!PERCENT RENTER-OCCUPIED HOUSING UNITS!!OCCUPIED HOUSING UNITS!!', regex=True)
    df_year2['label'] = df_year2['label'].replace('!!RENTER-OCCUPIED HOUSING UNITS!!', '!!PERCENT RENTER-OCCUPIED HOUSING UNITS!!OCCUPIED HOUSING UNITS!!', regex=True)

    df_year1['label'] = df_year1['label'].replace('!!OWNER-OCCUPIED HOUSING UNITS!!', '!!PERCENT OWNER-OCCUPIED HOUSING UNITS!!OCCUPIED HOUSING UNITS!!', regex=True)
    df_year2['label'] = df_year2['label'].replace('!!OWNER-OCCUPIED HOUSING UNITS!!', '!!PERCENT OWNER-OCCUPIED HOUSING UNITS!!OCCUPIED HOUSING UNITS!!', regex=True)

    # Drop the "--" (and any whitespace before it) that trails the "ONE RACE" segment.
    # The pattern is a raw string: "\s" inside a normal string literal is an invalid
    # escape sequence, which Python reports as a DeprecationWarning (a SyntaxWarning
    # from 3.12 on) and which the notebook's blanket warning filter would otherwise hide.
    df_year1['label'] = df_year1['label'].replace(r'!!ONE RACE\s*--!!', '!!ONE RACE!!', regex=True)
    df_year2['label'] = df_year2['label'].replace(r'!!ONE RACE\s*--!!', '!!ONE RACE!!', regex=True)

    df_year1.loc[df_year1['type'] == 'float', 'label'] = df_year1['label'].str.replace('!!OCCUPIED HOUSING UNITS!!', '!!PERCENT OCCUPIED HOUSING UNITS!!OCCUPIED HOUSING UNITS!!')
    df_year2.loc[df_year2['type'] == 'float', 'label'] = df_year2['label'].str.replace('!!OCCUPIED HOUSING UNITS!!', '!!PERCENT OCCUPIED HOUSING UNITS!!OCCUPIED HOUSING UNITS!!')

    df_year1['label'] = df_year1['label'].replace('!!UNINSURED POPULATION!!', '!!TOTAL UNINSURED!!', regex=True)
    df_year2['label'] = df_year2['label'].replace('!!UNINSURED POPULATION!!', '!!TOTAL UNINSURED!!', regex=True)

    df_year1.loc[(df_year1['name'].str.contains('S0101')) & (df_year1['type'] == 'float'), 'label'] = df_year1['label'].replace('ESTIMATE!!TOTAL!!', 'ESTIMATE!!PERCENT!!', regex=True)
    df_year2.loc[(df_year2['name'].str.contains('S0101')) & (df_year2['type'] == 'float'), 'label'] = df_year2['label'].replace('ESTIMATE!!TOTAL!!', 'ESTIMATE!!PERCENT!!', regex=True)
    df_year1.loc[(df_year1['name'].str.contains('S0101')) & (df_year1['type'] == 'float'), 'label'] = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT!!', 'ESTIMATE!!PERCENT!!TOTAL POPULATION!!') if '!!TOTAL POPULATION!!' not in x else x)
    df_year2.loc[(df_year2['name'].str.contains('S0101')) & (df_year2['type'] == 'float'), 'label'] = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT!!', 'ESTIMATE!!PERCENT!!TOTAL POPULATION!!') if '!!TOTAL POPULATION!!' not in x else x)

    df_year1.loc[(df_year1['name'].str.contains('S0101')) & (df_year1['type'] == 'float'), 'label'] = df_year1['label'].replace('ESTIMATE!!MALE!!', 'ESTIMATE!!PERCENT MALE!!', regex=True)
    df_year2.loc[(df_year2['name'].str.contains('S0101')) & (df_year2['type'] == 'float'), 'label'] = df_year2['label'].replace('ESTIMATE!!MALE!!', 'ESTIMATE!!PERCENT MALE!!', regex=True)
    df_year1.loc[(df_year1['name'].str.contains('S0101')) & (df_year1['type'] == 'float'), 'label'] = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT MALE!!', 'ESTIMATE!!PERCENT MALE!!TOTAL POPULATION!!') if '!!TOTAL POPULATION!!' not in x else x)
    df_year2.loc[(df_year2['name'].str.contains('S0101')) & (df_year2['type'] == 'float'), 'label'] = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT MALE!!', 'ESTIMATE!!PERCENT MALE!!TOTAL POPULATION!!') if '!!TOTAL POPULATION!!' not in x else x)

    df_year1.loc[(df_year1['name'].str.contains('S0101')) & (df_year1['type'] == 'float'), 'label'] = df_year1['label'].replace('ESTIMATE!!FEMALE!!', 'ESTIMATE!!PERCENT FEMALE!!', regex=True)
    df_year2.loc[(df_year2['name'].str.contains('S0101')) & (df_year2['type'] == 'float'), 'label'] = df_year2['label'].replace('ESTIMATE!!FEMALE!!', 'ESTIMATE!!PERCENT FEMALE!!', regex=True)
    df_year1.loc[(df_year1['name'].str.contains('S0101')) & (df_year1['type'] == 'float'), 'label'] = df_year1['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT FEMALE!!', 'ESTIMATE!!PERCENT FEMALE!!TOTAL POPULATION!!') if '!!TOTAL POPULATION!!' not in x else x)
    df_year2.loc[(df_year2['name'].str.contains('S0101')) & (df_year2['type'] == 'float'), 'label'] = df_year2['label'].apply(lambda x: x.replace('ESTIMATE!!PERCENT FEMALE!!', 'ESTIMATE!!PERCENT FEMALE!!TOTAL POPULATION!!') if '!!TOTAL POPULATION!!' not in x else x)


    df_year1.loc[(df_year1['name'].str.contains('S2201')), 'label'] = df_year1['label'].replace('!!HOUSEHOLD TYPE!!', '!!HOUSEHOLDS!!', regex=True)
    df_year2.loc[(df_year2['name'].str.contains('S2201')), 'label'] = df_year2['label'].replace('!!HOUSEHOLD TYPE!!', '!!HOUSEHOLDS!!', regex=True)

    df_year1['label'] = df_year1['label'].replace('FULL-TIME, ', 'FULL-TIME/', regex=True)
    df_year2['label'] = df_year2['label'].replace('FULL-TIME, ', 'FULL-TIME/', regex=True)

    df_year1['label'] = df_year1['label'].replace('PERCENT (NONVETERANS|VETERANS)', 'PERCENT', regex=True)
    df_year2['label'] = df_year2['label'].replace('PERCENT (NONVETERANS|VETERANS)', 'PERCENT', regex=True)      

    df_year1['label'] = df_year1['label'].replace('PERCENT TOTAL', 'PERCENT', regex=True)
    df_year2['label'] = df_year2['label'].replace('PERCENT TOTAL', 'PERCENT', regex=True)  

    df_year1['label'] = df_year1['label'].replace(' ?--', '', regex=True)
    df_year2['label'] = df_year2['label'].replace(' ?--', '', regex=True)

    df_year1['label'] = df_year1['label'].replace('PLACE OF BIRTH, NATIVITY AND CITIZENSHIP STATUS, AND YEAR OF ENTRY', 'PLACE OF BIRTH/ NATIVITY AND CITIZENSHIP STATUS/ AND YEAR OF ENTRY', regex=True)
    df_year2['label'] = df_year2['label'].replace('PLACE OF BIRTH, NATIVITY AND CITIZENSHIP STATUS, AND YEAR OF ENTRY', 'PLACE OF BIRTH/ NATIVITY AND CITIZENSHIP STATUS/ AND YEAR OF ENTRY', regex=True)

    ########################## This is very specific to 2016 and should be re-evaluated for its validity ##########################
    if year == 2016:
        df_year1.loc[(df_year1['name'].str.contains('S2701|S2702|S2703')), 'label'] = df_year1['label'].replace('18 TO 64 YEARS', '19 TO 64 YEARS', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2701')), 'label'] = df_year1['label'].replace('25 YEARS AND OVER', '26 YEARS AND OVER', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2701')), 'label'] = df_year1['label'].replace('18 YEARS AND OVER', '19 TO 64 YEARS', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2701|S2702|S2703')), 'label'] = df_year1['label'].replace('UNDER 18 YEARS', 'UNDER 19 YEARS', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2703')), 'label'] = df_year1['label'].replace('6 TO 17 YEARS', '6 TO 18 YEARS', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2703')), 'label'] = df_year1['label'].replace('18 TO 24 YEARS', '19 TO 25 YEARS', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2703')), 'label'] = df_year1['label'].replace('25 TO 34 YEARS', '26 TO 34 YEARS', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2703')), 'label'] = df_year1['label'].replace('UNDER 18', 'UNDER 19', regex=True)
        df_year1.loc[(df_year1['name'].str.contains('S2703')), 'label'] = df_year1['label'].replace('TRICARE/MILITARY HEALTH COVERAGE', 'TRICARE/MILITARY HEALTH INSURANCE', regex=True)     

    # Clean up the labels in 2014
    # Only runs when the base year is 2014 and only touches df_year1 (the 2014 frame).
    # Every call uses regex=True, so each pattern is replaced wherever it occurs as a
    # substring of the cell, not only when the whole cell equals it.
    # Presumably the replacement texts are the wordings used in the 2015 metadata, so
    # that the concept/label comparison against year 2 can succeed -- TODO confirm.
    if year == 2014:
        df_year1['concept'] = df_year1['concept'].replace('HEALTH INSURANCE COVERAGE STATUS', 'SELECTED CHARACTERISTICS OF HEALTH INSURANCE COVERAGE IN THE UNITED STATES', regex=True)     
        df_year1['concept'] = df_year1['concept'].replace('FOOD STAMPS/SNAP', 'FOOD STAMPS/SUPPLEMENTAL NUTRITION ASSISTANCE PROGRAM (SNAP)', regex=True)  
        # The next six rules drop 'AND MEDIAN EARNINGS IN THE PAST 12 MONTHS' from the
        # occupation / industry / class-of-worker concepts.
        df_year1['concept'] = df_year1['concept'].replace('OCCUPATION BY SEX AND MEDIAN EARNINGS IN THE PAST 12 MONTHS FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', 'OCCUPATION BY SEX FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', regex=True)     
        # NOTE(review): unlike the INDUSTRY and CLASS OF WORKER full-time rules below, this
        # replacement also drops 'FULL-TIME, YEAR-ROUND', so the full-time occupation concept
        # becomes identical to the all-workers one produced by the rule above. Its pattern
        # also reads 'FOR FULL-TIME' (no 'THE'), unlike its siblings. Confirm both are intended.
        df_year1['concept'] = df_year1['concept'].replace('OCCUPATION BY SEX AND MEDIAN EARNINGS IN THE PAST 12 MONTHS FOR FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', 'OCCUPATION BY SEX FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', regex=True)     
        df_year1['concept'] = df_year1['concept'].replace('INDUSTRY BY SEX AND MEDIAN EARNINGS IN THE PAST 12 MONTHS FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', 'INDUSTRY BY SEX FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', regex=True)             
        df_year1['concept'] = df_year1['concept'].replace('INDUSTRY BY SEX AND MEDIAN EARNINGS IN THE PAST 12 MONTHS FOR THE FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', 'INDUSTRY BY SEX FOR THE FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', regex=True)             
        df_year1['concept'] = df_year1['concept'].replace('CLASS OF WORKER BY SEX AND MEDIAN EARNINGS IN THE PAST 12 MONTHS FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', 'CLASS OF WORKER BY SEX FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', regex=True)             
        df_year1['concept'] = df_year1['concept'].replace('CLASS OF WORKER BY SEX AND MEDIAN EARNINGS IN THE PAST 12 MONTHS FOR THE FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', 'CLASS OF WORKER BY SEX FOR THE FULL-TIME, YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER', regex=True)             
        
        
        # Label rewrites. Several of these extend a phrase with text around it; because they
        # are substring replacements, a label that already carries the longer wording would
        # get the extra words a second time -- assumes 2014 labels never do (TODO confirm).
        df_year1['label'] = df_year1['label'].replace('RELATED CHILDREN', 'RELATED CHILDREN OF THE HOUSEHOLDER', regex=True)        
        # df_year1['label'] = df_year1['label'].replace('HOUSEHOLDS WITH NO ONE AGE 14 AND OVER WHO SPEAKS ENGLISH ONLY OR SPEAKS ENGLISH \VERY WELL\', 'LIMITED ENGLISH SPEAKING HOUSEHOLDS', regex=True)        
        # The raw-string regex below matches a plain double quote before VERY WELL and a
        # backslash followed by a double quote after it; presumably this mirrors how the
        # label is stored in the metadata file -- verify against the CSV.
        df_year1['label'] = df_year1['label'].replace(r'HOUSEHOLDS WITH NO ONE AGE 14 AND OVER WHO SPEAKS ENGLISH ONLY OR SPEAKS ENGLISH "VERY WELL\\\"', "LIMITED ENGLISH SPEAKING HOUSEHOLDS", regex=True)
        df_year1['label'] = df_year1['label'].replace('LIVING WITH OWN GRANDCHILDREN', 'GRANDPARENTS LIVING WITH OWN GRANDCHILDREN', regex=True) 
        df_year1['label'] = df_year1['label'].replace('HOUSEHOLDS WITH OWN CHILDREN', 'HOUSEHOLDS WITH OWN CHILDREN OF THE HOUSEHOLDER', regex=True)   
        df_year1['label'] = df_year1['label'].replace('WITH AN OWN CHILD', 'WITH AN OWN CHILD OF THE HOUSEHOLDER', regex=True)   



    # Rename columns for better look + understanding
    # Assigning a four-item list requires df_year1 to have exactly four columns. The code
    # above refers to them as 'name', 'label', 'concept' and 'type'; they are assumed to
    # appear in that order in the CSV -- TODO confirm. After this line df_year1 can no
    # longer be indexed by the old column names, only by the year-prefixed ones.
    df_year1.columns = [f'{year}_ID', f'{year}_Label', f'{year}_Concept', f'{year}_Type']        
    
    
    # Function that goes through all year1 variables and find match in year2
    def match_labels(df_year1, df_year2):
        # Add columns for year2 ID, label, and flag in year1 dataframe
        df_year1[f'{year+1}_ID'] = None
        df_year1[f'{year+1}_Label'] = None
        df_year1[f'{year+1}_Concept'] = None
        df_year1[f'{year+1}_Type'] = None
        df_year1['flag'] = 0  # Flag column for whether labels of year1 and year2 are same (1: same, 0: not same)

        used_id = []  # To track used id in df_year2
        used_id_year1 = [] # To track used id in df_year1
        
        ########################### PART 1 (EXACT MATCH) ############################
        # This for loop goes through year1 and find EXACT match from year2
        for index_year1, row_year1 in df_year1.iterrows():
            id_year1 = row_year1[0]
            label_year1 = row_year1[1]
            concept_year1 = row_year1[2]
            type_year1 = row_year1[3]

            parts_year1 = label_year1.split('!!')       

            # Filter df_year2 for the same concept and type
            year2_same_concept = df_year2[(df_year2['concept'] == concept_year1) & (df_year2['type'] == type_year1)]

            # Check for exact matches in filtered year2 dataframe
            exact_matches = year2_same_concept[~year2_same_concept['name'].isin(used_id) & (year2_same_concept['label'] == label_year1)]

            if not exact_matches.empty:
                
                match_index = exact_matches['name'].values[0]
                
                used_id_year1.append(id_year1)
                used_id.append(match_index)
                df_year1.at[index_year1, f'{year+1}_ID'] = exact_matches.iloc[0]['name']
                df_year1.at[index_year1, f'{year+1}_Label'] = exact_matches.iloc[0]['label']
                df_year1.at[index_year1, f'{year+1}_Concept'] = exact_matches.iloc[0]['concept']
                df_year1.at[index_year1, f'{year+1}_Type'] = exact_matches.iloc[0]['type']
                df_year1.at[index_year1, 'flag'] = 1

            else:
                
                year2_same_concept['parts_year2'] = year2_same_concept['label'].str.split("!!").apply(lambda x: set(x))
                sub_exact_matches = year2_same_concept[~year2_same_concept['name'].isin(used_id) & (year2_same_concept['parts_year2'] == set(parts_year1))]

                if not sub_exact_matches.empty:
                    match_index = sub_exact_matches['name'].values[0]
                    
                    used_id_year1.append(id_year1)
                    used_id.append(match_index)
                    df_year1.at[index_year1, f'{year+1}_ID'] = sub_exact_matches.iloc[0]['name']
                    df_year1.at[index_year1, f'{year+1}_Label'] = sub_exact_matches.iloc[0]['label']
                    df_year1.at[index_year1, f'{year+1}_Concept'] = sub_exact_matches.iloc[0]['concept']
                    df_year1.at[index_year1, f'{year+1}_Type'] = sub_exact_matches.iloc[0]['type']
                    df_year1.at[index_year1, 'flag'] = 0.9

            # # Iterate over filtered year2 dataframe
            # for index_year2, row_year2 in year2_same_concept.iterrows():
                
            #     id_year2 = row_year2[0]
            #     label_year2 = row_year2[1]
            #     concept_year2 = row_year2[2]
            #     type_year2 = row_year2[3]

            #     parts_year2 = label_year2.split('!!')
            #     first_one_year2 = '!!'.join(parts_year2[:1])
            #     first_two_year2 = '!!'.join(parts_year2[:2])
            #     first_three_year2 = '!!'.join(parts_year2[:3])    
            #     last_one_year2 = parts_year2[-1]
            #     last_two_year2 = '!!'.join(parts_year2[-2:])
            #     last_three_year2 = '!!'.join(parts_year2[-3:])
                
            #     # Skip IDs that are used in exact match phase
            #     if id_year2 in used_id:
            #         continue

            #     # Check if all the parts are the same (but in different order) + same type
            #     if set(parts_year1) == set(parts_year2) and type_year1 == type_year2:
            #         used_id.append(id_year2)
            #         df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
            #         df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
            #         df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
            #         df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
            #         df_year1.at[index_year1, 'flag'] = 0.9 
            #         break
                
        ########################### PART 2 (NON-EXACT MATCH) ############################
        # Now we should deal with NON-EXACT matches and find how to match year1 and year2
        for index_year1, row_year1 in df_year1.iterrows():
            
            id_year1 = row_year1[0]
            label_year1 = row_year1[1]
            concept_year1 = row_year1[2]
            type_year1 = row_year1[3]
            
            # Skip IDs that are used in exact match phase
            if id_year1 in used_id_year1:
                continue
                    
            # Filter df_year2 for the same concept
            year2_same_concept = df_year2[df_year2['concept'] == concept_year1]
            
            parts_year1 = label_year1.split('!!')        
            first_one_year1 = '!!'.join(parts_year1[:1])           
            first_two_year1 = '!!'.join(parts_year1[:2])
            first_three_year1 = '!!'.join(parts_year1[:3])         
            last_one_year1 = parts_year1[-1]
            last_two_year1 = '!!'.join(parts_year1[-2:])
            last_three_year1 = '!!'.join(parts_year1[-3:])

            # Iterate over filtered year2 dataframe
            for index_year2, row_year2 in year2_same_concept.iterrows():
                
                id_year2 = row_year2[0]
                label_year2 = row_year2[1]
                concept_year2 = row_year2[2]
                type_year2 = row_year2[3]
                
                # Skip IDs that are used in exact match phase
                if id_year2 in used_id:
                    continue
                    
                else:     

                    if ((any(sub in label_year1 for sub in ["!!MALE!!", "!!MALE", " MALE!!"]) and 
                         any(sub in label_year2 for sub in ["!!MALE!!", "!!MALE", " MALE!!"])) or 
                        ("FEMALE!!" in label_year1 and "FEMALE!!" in label_year2) or 
                        ("MALE!!" not in label_year1 and "!!MALE" not in label_year1 
                         and "MALE!!" not in label_year2 and "!!MALE" not in label_year2)):        
                    
                        parts_year2 = label_year2.split('!!')
                        first_one_year2 = '!!'.join(parts_year2[:1])
                        first_two_year2 = '!!'.join(parts_year2[:2])
                        first_three_year2 = '!!'.join(parts_year2[:3])    
                        last_one_year2 = parts_year2[-1]
                        last_two_year2 = '!!'.join(parts_year2[-2:])
                        last_three_year2 = '!!'.join(parts_year2[-3:])

                        flag_value = df_year1.at[index_year1, 'flag']
                        
                        # Check if the first two and the last three parts match + same type
                        if first_two_year1 == first_two_year2 and last_three_year1 == last_three_year2 and type_year1 == type_year2 and (pd.isna(flag_value) or flag_value < 0.8):
                            used_id.append(id_year2)
                            df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
                            df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
                            df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
                            df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
                            df_year1.at[index_year1, 'flag'] = 0.8 
                            break

                        # Check if the first THREE and the last TWO parts match + same type
                        elif first_three_year1 == first_three_year2 and last_two_year1 == last_two_year2 and type_year1 == type_year2 and (pd.isna(flag_value) or flag_value < 0.7):
                            used_id.append(id_year2)
                            df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
                            df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
                            df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
                            df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
                            df_year1.at[index_year1, 'flag'] = 0.7
                            break

                        # Check if the first two and the last two parts match + same type
                        elif first_two_year1 == first_two_year2 and last_two_year1 == last_two_year2 and type_year1 == type_year2 and (pd.isna(flag_value) or flag_value < 0.6):
                            used_id.append(id_year2)
                            df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
                            df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
                            df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
                            df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
                            df_year1.at[index_year1, 'flag'] = 0.6 
                            break

                        # Check if the first two and the last one part match + same type
                        elif first_two_year1 == first_two_year2 and last_one_year1 == last_one_year2 and type_year1 == type_year2 and (pd.isna(flag_value) or flag_value < 0.5):
                            used_id.append(id_year2)
                            df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
                            df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
                            df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
                            df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
                            df_year1.at[index_year1, 'flag'] = 0.5
                            break

                        # Check if the first one and the last two parts match + same type
                        elif first_one_year1 == first_one_year2 and last_two_year1 == last_two_year2 and type_year1 == type_year2 and (pd.isna(flag_value) or flag_value < 0.3):
                            used_id.append(id_year2)
                            df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
                            df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
                            df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
                            df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
                            df_year1.at[index_year1, 'flag'] = 0.3
                            # break

                        # Check if the first one and the last one part match + same type
                        elif first_one_year1 == first_one_year2 and last_one_year1 == last_one_year2 and type_year1 == type_year2 and pd.isna(flag_value):

                            if (("PRIVATE SCHOOL!!" in label_year1 and "PRIVATE SCHOOL!!" in label_year2) or 
                                ("PUBLIC SCHOOL!!" in label_year1 and "PUBLIC SCHOOL!!" in label_year2) or 
                                ("PUBLIC SCHOOL!!" not in label_year1 and "PRIVATE SCHOOL" not in label_year1 
                                and "PUBLIC SCHOOL!!" not in label_year2 and "PRIVATE SCHOOL" not in label_year2)):      

                                used_id.append(id_year2)
                                df_year1.at[index_year1, f'{year+1}_ID'] = id_year2
                                df_year1.at[index_year1, f'{year+1}_Label'] = label_year2
                                df_year1.at[index_year1, f'{year+1}_Concept'] = concept_year2
                                df_year1.at[index_year1, f'{year+1}_Type'] = type_year2
                                df_year1.at[index_year1, 'flag'] = 0.2
                                # break

        return df_year1          


    # Match labels with match_labels function
    df = match_labels(df_year1, df_year2)

    # Flag columns (1: same, 0: not same) comparing the year1 value with the
    # matched year2 value.  The comparison is done column-wise; a missing
    # value on either side compares as "not same", exactly as the scalar
    # `==` on NaN does.
    df['id_flag'] = (df[f'{year}_ID'] == df[f'{year+1}_ID']).astype(int)
    df['type_flag'] = (df[f'{year}_Type'] == df[f'{year+1}_Type']).astype(int)

    # Construct the file name and the path including the directory
    file_name = f'match_{year}_{year+1}.xlsx'
    file_path = os.path.join(st_directory, file_name)

    # Make sure the output directory exists before writing; to_excel does not
    # create missing parent directories.
    os.makedirs(st_directory, exist_ok=True)

    # Save the DataFrame to an Excel file in the respective directory
    df.to_excel(file_path, index=False)

### The following is used only for matching 2016-2017 and 2018-2019

In [41]:
import pandas as pd

# First year of each pair that receives the extra label harmonisation below
years = [2016, 2018]

for year in years:

    # Same directory and file-name pattern as the workbook saved by the
    # label-matching cell for this pair of years
    file_path = f'../YtoY_diff/match_{year}_{year+1}.xlsx'

    # Load the matched labels for (year, year+1)
    df = pd.read_excel(file_path)

    # Modifying the function to return differences as strings without set notation (curly brackets)
    def find_difference(row, col1, col2):
        """Return the '!!'-separated label parts that are unique to each column.

        Parameters
        ----------
        row : mapping-like
            Anything indexable by column name (a DataFrame row, a dict, ...).
        col1, col2 : hashable
            Keys of the two label cells to compare.

        Returns
        -------
        tuple of (str, str)
            ``(diff1, diff2)`` where ``diff1`` lists the parts found only in
            ``row[col1]`` and ``diff2`` the parts found only in ``row[col2]``.
            Each is a ", "-joined string in sorted order; an empty string
            means there is nothing unique on that side.  A missing cell
            (NaN/None) is treated as having no parts.
        """
        def label_parts(value):
            # Missing cells contribute no parts at all
            return set(str(value).split("!!")) if pd.notna(value) else set()

        parts1 = label_parts(row[col1])
        parts2 = label_parts(row[col2])

        # Sort before joining: iteration order of a set of strings is not
        # stable between interpreter runs, which would make the saved diff
        # columns differ from run to run.
        diff1 = ", ".join(sorted(parts1 - parts2))  # Present in col1 but not in col2
        diff2 = ", ".join(sorted(parts2 - parts1))  # Present in col2 but not in col1

        return diff1, diff2

    # Applying the find_difference function and storing the results
    df[[f'{year}_diff', f'{year+1}_diff']] = df.apply(lambda row: find_difference(row, f'{year}_Label', f'{year+1}_Label'), axis=1, result_type="expand")

    if year == 2016:

        def _both_labels_are_text(row):
            """True when the 2016 and 2017 label cells of `row` are both strings."""
            return isinstance(row['2016_Label'], str) and isinstance(row['2017_Label'], str)

        def _retitle(id_col, id_pattern, label_col, old, new, applies):
            """Replace every `old` by `new` in `label_col` on selected rows of `df`.

            A row is selected when its `id_col` value matches the regular
            expression `id_pattern` AND `applies(row)` is true.  Rows with a
            missing ID never match (na=False), so the boolean mask is always
            valid for `.loc`.  `applies` must only be true for rows whose
            `label_col` cell is a string.
            """
            in_table = df[id_col].str.contains(id_pattern, na=False)
            selected = in_table & df.apply(applies, axis=1).astype(bool)
            df.loc[selected, label_col] = df.loc[selected, label_col].map(
                lambda label: label.replace(old, new)
            )

        # The order of the calls matters: each one sees the labels as left by
        # the previous one.

        # S1902 / S1903, column C01: give the 2016 label the header used by
        # its 2017 match ('PERCENT DISTRIBUTION!!' or 'NUMBER!!') instead of 'TOTAL!!'.
        income_tables = 'S1902_C01|S1903_C01'
        _retitle('2016_ID', income_tables, '2016_Label', 'TOTAL!!', 'PERCENT DISTRIBUTION!!',
                 lambda x: _both_labels_are_text(x) and 'PERCENT DISTRIBUTION!!' in x['2017_Label'])
        _retitle('2016_ID', income_tables, '2016_Label', 'TOTAL!!', 'NUMBER!!',
                 lambda x: _both_labels_are_text(x) and 'NUMBER!!' in x['2017_Label'])

        # S2702, column C01: here the 2017 label is rewritten to carry the
        # longer header present in the 2016 label.
        _retitle('2017_ID', 'S2702_C01', '2017_Label', 'TOTAL!!', 'TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATION!!',
                 lambda x: _both_labels_are_text(x) and 'TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATION!!' in x['2016_Label'])

        # S2001: prefix the 2016 column headers with PERCENT when that header
        # shows up in the 2016-only diff computed above.  A non-empty diff
        # implies the 2016 label is a string, so no isinstance guard is needed.
        _retitle('2016_ID', 'S2001', '2016_Label', '!!TOTAL!!', '!!PERCENT!!',
                 lambda x: 'TOTAL' in x['2016_diff'])
        _retitle('2016_ID', 'S2001', '2016_Label', '!!MALE!!', '!!PERCENT MALE!!',
                 lambda x: 'MALE' in x['2016_diff'] and 'FEMALE' not in x['2016_diff'])
        _retitle('2016_ID', 'S2001', '2016_Label', '!!FEMALE!!', '!!PERCENT FEMALE!!',
                 lambda x: 'FEMALE' in x['2016_diff'])

        # Applying the find_difference function again so the diff columns
        # reflect the rewritten labels
        df[[f'{year}_diff', f'{year+1}_diff']] = df.apply(lambda row: find_difference(row, f'{year}_Label', f'{year+1}_Label'), axis=1, result_type="expand")

    # Label parts that may be present on only one side of a 2016/2017 match
    # and still be treated as the same label.  check_subset_1617 below tests
    # each ', '-separated diff item for exact membership in this list, so
    # every character of an entry (including trailing spaces) is significant.
    # Both lists are rebuilt on every iteration of the year loop.
    phrases_1617 = ['AGE',
                    'SEX',
                    'TOTAL',
                    'FAMILIES',
                    'INDUSTRY',
                    'HOUSEHOLDS',
                    'OCCUPATION',
                    'INDIVIDUALS',
                    'FOREIGN BORN',
                    'SEX AND AGE',
                    'PLACE OF WORK',
                    'ALL HOUSEHOLDS',
                    'HOUSING TENURE',
                    'CLASS OF WORKER',
                    'TOTAL HOUSEHOLDS',
                    'HOUSEHOLD INCOME',
                    'TOTAL POPULATION',
                    'DISABILITY STATUS',
                    'YEAR-ROUND WORKERS',
                    'WOMEN 15 TO 50 YEARS',
                    'EDUCATIONAL ATTAINMENT',
                    'OCCUPIED HOUSING UNITS',
                    'FOREIGN-BORN POPULATION',
                    'POPULATION 16 TO 19 YEARS',
                    'POPULATION 15 TO 19 YEARS',
                    'POPULATION 16 TO 64 YEARS',
                    'POPULATION 20 TO 64 YEARS',
                    'WORKERS 16 YEARS AND OVER',
                    'CITIZENS 18 YEARS AND OVER',
                    'EARNINGS IN PAST 12 MONTHS',
                    'POPULATION 1 YEAR AND OVER',
                    'POPULATION AGE 16 AND OVER',
                    'WORK STATUS CHARACTERISTICS',
                    'POPULATION 5 YEARS AND OVER',
                    'LANGUAGE OTHER THAN ENGLISH',
                    'POPULATION 15 YEARS AND OVER',
                    'POPULATION 16 YEARS AND OVER',
                    'POPULATION 25 YEARS AND OVER',
                    'AGE BY EDUCATIONAL ATTAINMENT',
                    # NOTE(review): trailing space; presumably what remains after the
                    # '(IN 20XX INFLATION-ADJUSTED DOLLARS)' suffix is stripped - confirm
                    'INCOME IN THE PAST 12 MONTHS ',
                    # NOTE(review): reads like a regex, but membership is tested
                    # literally, so this only matches the text '1[5-6]' itself
                    'POPULATION 1[5-6] TO 19 YEARS',
                    'PERCENT OCCUPIED HOUSING UNITS',
                    'CITIZENSHIP AND PERIOD OF ENTRY',
                    'EMPLOYMENT STATUS CHARACTERISTICS',
                    'TOTAL POPULATION 5 YEARS AND OVER',
                    'FAMILIES WITH A HOUSEHOLDER WHO IS',
                    'RACE AND HISPANIC OR LATINO ORIGIN',
                    'EMPLOYED POPULATION AGE 16 AND OVER',
                    'NATIVITY AND U.S. CITIZENSHIP STATUS',
                    'PERCENT OWNER-OCCUPIED HOUSING UNITS',
                    'POVERTY STATUS IN THE PAST 12 MONTHS',
                    'CHILDREN UNDER 18 YEARS IN HOUSEHOLDS',
                    'CIVILIAN POPULATION 18 YEARS AND OVER',
                    'PERCENT RENTER-OCCUPIED HOUSING UNITS',
                    'POPULATION AGE 16 AND OVER WITH EARNINGS',
                    'WORKERS 16 YEARS AND OVER IN HOUSEHOLDS',
                    'WORKERS 16 YEARS AND OVER WITH EARNINGS',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION',
                    'INDIVIDUAL INCOME IN THE PAST 12 MONTHS ',  # trailing space is part of the phrase
                    'RACE AND HISPANIC ORIGIN OR LATINO ORIGIN',
                    'EARNINGS IN THE PAST 12 MONTHS FOR WORKERS',
                    'INCOME AND BENEFITS IN THE PAST 12 MONTHS ',  # trailing space is part of the phrase
                    "INDIVIDUALS' INCOME IN THE PAST 12 MONTHS ",  # trailing space is part of the phrase
                    'POPULATION 16 YEARS AND OVER WITH EARNINGS',
                    'CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER',
                    'POPULATION 3 YEARS AND OVER ENROLLED IN SCHOOL',
                    'TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATION',
                    'PUBLIC HEALTH INSURANCE ALONE OR IN COMBINATION',
                    'POPULATION FOR WHOM POVERTY STATUS IS DETERMINED',
                    'PRIVATE HEALTH INSURANCE ALONE OR IN COMBINATION',
                    'POPULATION ENROLLED IN COLLEGE OR GRADUATE SCHOOL',
                    'RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER',
                    'WORKERS 16 YEARS AND OVER WHO DID NOT WORK AT HOME',
                    'WORKERS 16 YEARS AND OVER WHO DID NOT WORK AT HOME',  # duplicate of the previous entry; no effect on membership
                    'LANGUAGE SPOKEN AT HOME AND ABILITY TO SPEAK ENGLISH',
                    'PER CAPITA INCOME BY RACE AND HISPANIC OR LATINO ORIGIN',
                    'GRANDPARENTS LIVING WITH OWN GRANDCHILDREN UNDER 18 YEARS',
                    # NOTE(review): looks like two phrases run together ('OCCUPATION' +
                    # 'RACE AND ...') - confirm this exact text occurs in the data
                    'OCCUPATIONRACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER',
                    'UNRELATED INDIVIDUALS FOR WHOM POVERTY STATUS IS DETERMINED',
                    'CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER WITH EARNINGS',
                    'FULL-TIME/YEAR-ROUND WORKERS 16 YEARS AND OVER WITH EARNINGS',
                    'RACE AND HISPANIC OR LATINO ORIGIN BY EDUCATIONAL ATTAINMENT',
                    'EARNINGS IN THE PAST 12 MONTHS FOR FULL-TIME/YEAR-ROUND WORKERS',
                    'GRANDCHILDREN UNDER 18 YEARS LIVING WITH A GRANDPARENT HOUSEHOLDER',
                    'PLACE OF BIRTH/ NATIVITY AND CITIZENSHIP STATUS/ AND YEAR OF ENTRY',
                    'FULL-TIME/YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER',
                    'FULL-TIME/YEAR-ROUND CIVILIAN WORKERS 16 YEARS AND OVER WITH EARNINGS',
                    "TOTAL POPULATION 25 YEARS AND OVER WITH A BACHELOR'S DEGREE OR HIGHER",
                    'HOUSEHOLD INCOME BY RACE AND HISPANIC OR LATINO ORIGIN OF HOUSEHOLDER',
                    'FULL-TIME/YEAR-ROUND CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER WITH EARNINGS',
                    'GRANDCHILDREN UNDER 18 YEARS LIVING WITH A GRANDPARENT HOUSEHOLDER IN OCCUPIED HOUSING UNITS']
    
    # Same role as phrases_1617, for the 2018/2019 match; tested by
    # check_subset_1819 below, again by exact membership.
    phrases_1819 = ['SEX',
                    'AGE',
                    'SUBJECT',
                    'INDUSTRY',
                    'OCCUPATION',
                    'CLASS OF WORKER',
                    'HOUSEHOLD INCOME',
                    'WORK EXPERIENCE',
                    'EMPLOYMENT STATUS',
                    'DISABILITY STATUS',
                    'DID NOT WORK',
                    'TOTAL HOUSEHOLD POPULATION',
                    'KINDERGARTEN TO 12TH GRADE',
                    'COVERAGE ALONE OR IN COMBINATION',
                    'POPULATION 18 TO 24 YEARS',
                    'RESIDENCE 1 YEAR AGO',
                    'EDUCATIONAL ATTAINMENT',
                    'EARNINGS IN THE PAST 12 MONTHS',
                    'NATIVITY AND U.S. CITIZENSHIP STATUS',
                    'RACE AND HISPANIC OR LATINO ORIGIN',
                    'TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATION',
                    'POPULATION ENROLLED IN COLLEGE OR GRADUATE SCHOOL',
                    'RATIO OF INCOME TO POVERTY LEVEL IN THE PAST 12 MONTHS',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION 1 YEAR AND OVER',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION 25 YEARS AND OVER',
                    'CIVILIAN NONINSTITUTIONALIZED WORKERS 16 YEARS AND OVER',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION 16 YEARS AND OVER',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION 16 TO 64 YEARS',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION FOR WHOM POVERTY STATUS IS DETERMINED',
                    'CIVILIAN NONINSTITUTIONALIZED POPULATION 16 YEARS AND OVER WITH EARNINGS']

    # Pick the phrase list that belongs to this pair of years; any other year
    # is left untouched.  The list is turned into a set once, instead of
    # being converted again for every row inside issubset().
    if year == 2016:
        allowed_phrases = set(phrases_1617)
    elif year == 2018:
        allowed_phrases = set(phrases_1819)
    else:
        allowed_phrases = None

    if allowed_phrases is not None:

        def only_allowed_phrases(diff_text):
            """True when every ', '-separated item of `diff_text` is an allowed phrase.

            An empty diff yields the single item '' which is not a phrase, so
            the result is False for it.
            """
            return set(diff_text.split(", ")).issubset(allowed_phrases)

        # year+1 label only adds allowed phrases -> adopt it for year
        mask1 = (df[f'{year+1}_diff'].apply(only_allowed_phrases)) & (df[f'{year}_diff'] == '')
        df.loc[mask1, f'{year}_Label'] = df[f'{year+1}_Label']

        # year label only adds allowed phrases -> adopt it for year+1
        mask2 = (df[f'{year}_diff'].apply(only_allowed_phrases)) & (df[f'{year+1}_diff'] == '')
        df.loc[mask2, f'{year+1}_Label'] = df[f'{year}_Label']

    # Recompute both diff columns from the (possibly rewritten) labels
    diff_columns = [f'{year}_diff', f'{year+1}_diff']
    df[diff_columns] = df.apply(
        lambda row: find_difference(row, f'{year}_Label', f'{year+1}_Label'),
        axis=1,
        result_type="expand",
    )

    # Rows with nothing unique on either side are marked with flag 1
    nothing_left_in_year = df[f'{year}_diff'] == ''
    nothing_left_in_next = df[f'{year+1}_diff'] == ''
    df.loc[nothing_left_in_year & nothing_left_in_next, 'flag'] = 1

    # Write the result next to the input workbook, under a '_diff' name
    file_save_path = f'../YtoY_diff/match_{year}_{year+1}_diff.xlsx'
    df.to_excel(file_save_path, index=False)

### Standardize labels using 2022's data

In [None]:
import pandas as pd

# Load the 2021->2022 match workbook, keeping only the columns at
# zero-based positions 4-7 (the 2022 columns)
file_path = f'../YtoY_diff/match_2021_2022.xlsx'
df = pd.read_excel(file_path, usecols=[4, 5, 6, 7])

# Discard rows in which every kept cell is empty
df = df.dropna(how='all')

# Prefix the label with its upper-cased type, then with its concept:
#   <concept>!!<TYPE>!!<label>
type_and_label = df['2022_Type'].str.upper() + "!!" + df['2022_Label']
df['2022_Label'] = df['2022_Concept'] + "!!" + type_and_label

# The concept and type now live inside the label; drop their columns and save
df_final = df.drop(columns=['2022_Concept', '2022_Type'])
df_final.to_csv('label_2022.csv', index=False)