In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import dotenv_values, find_dotenv
import re

In [2]:
# this looks for your configuration file and then reads it as a dictionary
config = dotenv_values(find_dotenv())

# set path using the dictionary key for which one you want
concordancedata = os.path.abspath(config["DATA_CONCORDANCE"]) + '\\'

In [4]:
# import data

# https://econweb.ucsd.edu/~jrauch/rauch_classification.html
rauch = pd.read_csv(concordancedata + 'Rauch_classification_revised.csv')
# https://unstats.un.org/unsd/trade/hs2_2sitc3.htm
sitc_hs6 = pd.read_csv(concordancedata + 'HS6 to STIC rev3 concordance - unstats.csv')[['HS02', 'SITC, R3']]
# rename columns for merging later
sitc_hs6.rename(columns={'HS02': 'hs6', 'SITC, R3': 'sitc4'}, inplace=True)
# switch to string datatype for easier data cleaning
sitc_hs6['hs6']= sitc_hs6['hs6'].astype(str)
sitc_hs6['sitc4']= sitc_hs6['sitc4'].astype(str)
rauch['sitc4'] = rauch['sitc4'].astype(str)

In [13]:
# https://econweb.ucsd.edu/~jrauch/rauchclass/SITC_Rev_3_english_structure.txt
with open(concordancedata + 'SITC_Rev_3_english_structure.txt', 'r', errors='ignore') as file:
    lines = file.readlines()

# refine a regex pattern for splitting the lines
pattern = re.compile(r'(\S+)\s+(.+)')

# df columns
code_list = []
description_list = []

# iterate through lines and extract code and description
for line in lines:
    match = pattern.match(line)
    if match:
        code_list.append(match.group(1))
        description_list.append(match.group(2))

sitc_desc = pd.DataFrame({'sitc4': code_list[1:], 'desc': description_list[1:]})

# keep sitc 4 digit long codes
sitc_desc['sitc4'] = sitc_desc['sitc4'].str.replace('.', '', regex=True)
sitc_desc = sitc_desc[sitc_desc['sitc4'].str.len() == 4]

In [14]:
sitc_desc

Unnamed: 0,sitc4,desc
3,0011,"Bovine animals, live"
6,0012,"Sheep and goats, live"
9,0013,"Swine, live"
12,0014,"Poultry, live (i.e., fowls of the species Gall..."
15,0015,"Horses, asses, mules and hinnies, live"
...,...,...
4170,8999,"Manufactured goods, n.e.s."
4180,9110,Postal packages not classified according to kind
4183,9310,Special transactions and commodities not class...
4186,9610,"Coin (other than gold coin), not being legal t..."


In [4]:
# restrict to our product categories
sitc_hs6 = sitc_hs6.loc[sitc_hs6['hs6'].str.startswith('85') | sitc_hs6['hs6'].str.startswith('61') | sitc_hs6['hs6'].str.startswith('62')]

# fix formatting for sitc codes in order to merge with rauch codes file
sitc_hs6['sitc4'] = sitc_hs6['sitc4'].str.replace('.', '', regex=True)
sitc_hs6['sitc4'] = sitc_hs6['sitc4'].apply(lambda x: x[:-1] if len(x) == 5 else x)

In [5]:
# do the merge
concordance = pd.merge(left=sitc_hs6, right=rauch, how='left', on='sitc4')

In [6]:
# unmatched products
concordance[concordance['lib'].isna()]

Unnamed: 0,hs6,sitc4,con,lib
256,8503.0,7169,,
309,8513.9,8138,,
375,8523.2,8985,,
376,8523.3,8985,,
377,8523.9,8985,,
378,8524.1,8987,,
379,8524.31,8987,,
380,8524.32,8987,,
381,8524.39,8987,,
386,8524.6,8987,,


In [7]:
# non-differentiated products (by "liberal" classification)
concordance[(concordance['lib'] != 'n') & (concordance['lib'].notnull())]

Unnamed: 0,hs6,sitc4,con,lib
268,8505.11,7788,n,r
269,8505.19,7788,n,r
270,8505.2,7788,n,r
271,8505.3,7788,n,r
272,8505.9,7788,n,r
273,8506.1,7781,n,r
274,8506.3,7781,n,r
275,8506.4,7781,n,r
276,8506.5,7781,n,r
277,8506.6,7781,n,r


In [11]:
# the list of unmatched
sitc_unmatched = set(list(concordance[concordance['lib'].isna()]['sitc4']))

# list of products with differing "liberal" classification
sitc_conflicting_matches = set(list(concordance[(concordance['lib'] != 'n') & (concordance['lib'].notnull())]['sitc4']))

In [12]:
# there are 9 SITC codes that don't have a Rauch classification (let ChatGPT fill in the matching?)
sitc_unmatched

{'7169', '7723', '7724', '7726', '7728', '7787', '8138', '8985', '8987'}

In [13]:
# there are only 4 SITC codes where they are classified liberally as referenced price but conservatively as differentiated
sitc_conflicting_matches

{'7764', '7781', '7786', '7788'}

In [8]:
# save concordance file
concordance.to_csv(concordancedata + 'rauch_hs6_sitc.csv')