In [692]:
from collections import Counter
import os
import re
import numpy as np
import pandas as pd

import pprint

In [552]:
# Read the RoO text of every agreement; keys are the file names without extension
roo_texts = {}
roo_folder = 'RoO Non-table/roo_clean_2'

# roo_clean_2 holds the UTF-8 files ('cause Ms. Word?), roo_clean the ANSI ones.
# os.listdir returns entries in arbitrary order, so sort them to get a
# reproducible dictionary (and printing) order.
for filename in sorted(os.listdir(roo_folder)):
    path = os.path.join(roo_folder, filename)
    if not os.path.isfile(path):
        # Sub-directories (e.g. Jupyter's .ipynb_checkpoints) cannot be opened as text
        continue
    # splitext drops the extension whatever its length (the old [:-4] slice
    # assumed exactly three letters after the dot)
    fta_name = os.path.splitext(filename)[0]
    with open(path, mode='r', encoding='utf-8') as f:
        # Replace en dash with hyphen
        roo_texts[fta_name] = f.read().replace('–', '-')

In [695]:
# Count the "Section <roman numeral>" headers found in the text of each agreement
# Apparently India is missing some sections: X, XIV, XIX, XXI
pattern_count = re.compile('Section [IVX]{1,5}')
for fta in roo_texts:
    section_headers = pattern_count.findall(roo_texts[fta])
    print(fta, len(section_headers))

# Clean some misconverted HS codes
# Update: apparently there are no misconverted HS codes with the new file!
def addZero(matchobj):
    """Re-insert the zero missing in front of the last digit of a matched HS code.

    Example: the match '21.6' (a wrong HS code) is turned into '21.06'.
    Meant to be used as the replacement callback of a regex substitution.
    """
    wrong_code = matchobj[0]
    head, last_digit = wrong_code[:-1], wrong_code[-1]
    return '{}0{}'.format(head, last_digit)

#pattern_zero = re.compile(r'\b\d{1,2}\.\d\b')
#for fta, rule in roo_texts.items():
#    roo_texts[fta] = pattern_zero.sub(addZero, rule)

BRN_JPN 21
CHL_JPN 21
IDN_JPN 21
IND_JPN 17
JPN_MEX 21
JPN_MYS 21
JPN_PER 21
JPN_PHL 21
JPN_THA 21


In [711]:
# Load one HS code mapping table per HS version (keyed by the version year)
HS_maps = {
    year: pd.read_csv(csv_name)
    for year, csv_name in (
        (2002, 'HS2.csv'),
        (2007, 'HS3.csv'),
        (2012, 'HS4.csv'),
        (2017, 'HS5.csv'),
    )
}

# Map each agreement (first column of HS_ver.csv) to the table of the HS
# version it uses (second column of HS_ver.csv)
HS_map_used = {
    fta: HS_maps[hs_year]
    for fta, hs_year, *_ in pd.read_csv('HS_ver.csv').itertuples(index=False, name=None)
}

In [710]:
# FUNCTION FOR EXTRACTING HS CODES
def get_hs_codes(hs_code1, hs_code2, HS_map):
    """Return every Tier 3 HS code covered by a single code or a range of codes.

    Arguments:
        hs_code1: First (or only) code, with or without dots, e.g. '39.06'.
        hs_code2: Last code of the range (inclusive), or an empty string / None
            when only hs_code1 is requested.
        HS_map: DataFrame with a string column 'ProductCode' and a column
            'Tier' (rows with Tier == 3 are the ones returned).
    Return:
        List of 'ProductCode' values. When a bound of the range cannot be
        located in HS_map, the two codes are printed and an empty list is
        returned, so callers can always iterate over the result.
    """
    # Clean HS codes
    hs_code1 = hs_code1.replace('.', '')
    hs_code2 = (hs_code2 or '').replace('.', '')

    codes = HS_map['ProductCode']
    is_tier3 = HS_map['Tier'] == 3

    # Single code: everything below it is found by prefix
    if not hs_code2:
        return list(codes[codes.str.startswith(hs_code1) & is_tier3])

    try:
        # Row labels of the two bounds; .index[0] raises IndexError when a code is unknown
        index_1 = HS_map.loc[codes == hs_code1].index[0]
        index_2 = HS_map.loc[codes == hs_code2].index[0]
        # Label slice over the Tier 3 rows lying between the two bounds
        range_1 = HS_map.loc[is_tier3].loc[index_1:index_2]
    except (IndexError, KeyError):
        # Only the lookup errors are handled (a bare except would also hide
        # KeyboardInterrupt); report the offending pair and return no codes.
        print(hs_code1, hs_code2)
        return []

    # The slice stops at the row of hs_code2 itself, so the codes nested under
    # it are added separately
    range_2 = HS_map.loc[codes.str.startswith(hs_code2) & (codes != hs_code2) & is_tier3]

    result = pd.concat([range_1, range_2], ignore_index=True)
    return list(result['ProductCode'])

# Sanity check: codes returned for the range 39.06 through 39.07 with the 2002 table
print(get_hs_codes('39.06', '39.07', HS_maps[2002]))

['390610', '390690', '390710', '390720', '390730', '390740', '390750', '390760', '390791', '390799']


In [705]:
# CREATE A DICTIONARY FOR FASTER ACCESS TO ALL HS CODES
def expand_map(HS_map):
    """Build a lookup from every HS code to the list of Tier 3 codes within it.

    Argument:
        HS_map: DataFrame with a string column 'ProductCode' and a column
            'Tier' (1 = chapter, 2 = heading, 3 = subheading).
    Return:
        Dictionary {code: [Tier 3 codes starting with that code]}. Chapters are
        inserted first, then headings, then subheadings. A subheading maps to
        a one-element list holding itself, so every value is a list.
    """
    # NOTE(review): row[1] below reads the SECOND column of each row and uses
    # it as the code; this assumes 'ProductCode' is the second column of
    # HS_map - confirm against the csv files.
    is_subheading = HS_map['Tier'] == 3
    # Computed once instead of re-filtering the whole table for every prefix
    subheading_codes = HS_map.loc[is_subheading, 'ProductCode']

    expanded_map = {}
    # Expand chapters (Tier 1), then headings (Tier 2), by prefix
    for tier in (1, 2):
        for row in HS_map.loc[HS_map['Tier'] == tier].itertuples(index=False, name=None):
            prefix = row[1]
            expanded_map[prefix] = list(subheading_codes[subheading_codes.str.startswith(prefix)])

    # Subheadings only contain themselves; wrap in a list to match the other tiers
    for row in HS_map.loc[is_subheading].itertuples(index=False, name=None):
        expanded_map[row[1]] = [row[1]]

    return expanded_map

# Expanded lookup of every loaded HS version, keyed like HS_maps
HS_exp_maps = {version: expand_map(hs_table) for version, hs_table in HS_maps.items()}

In [757]:
# Regex building blocks for HS codes such as '39.06' and ranges such as
# '39.06-39.07'; the *_NC variants match the same text without capturing groups
HS_CODE = r'(\d+\.\d+)'
HS_CODE_NC = r'(?:\d+\.\d+)'
HS_RANGE = '(' + HS_CODE + r'(?:\-' + HS_CODE + ')?)'
HS_RANGE_NC = HS_CODE_NC + r'(?:\-' + HS_CODE_NC + ')?'

def parse_roo(roo_text):
    """Split the complete text of specific rules of origin into a nested dictionary.

    Argument:
        roo_text: complete RoO text of one agreement.
    Return:
        Dictionary {section header: {chapter header: {HS code or range: rule}}}
        in which every run of whitespace inside a rule is reduced to one space.
    """
    # A section is 'Section ' plus 1-5 roman digits; its body runs up to the next
    # such header followed by whitespace and a capital letter, or to the end of text
    section_re = re.compile(r'(Section [IVX]{1,5})\s+(.+?)(?=Section [IVX]{1,5}\s+[A-Z]|\Z)', flags=re.DOTALL)
    # Same idea for 'Chapter ' plus a 1-2 digit number
    chapter_re = re.compile(r'(Chapter \d{1,2})\s+(.+?)(?=Chapter \d{1,2}\s+[A-Z]|\Z)', flags=re.DOTALL)
    # A rule is an HS code/range, whitespace, then text that starts with a capital
    # letter and ends at a character followed by the next HS code/range or the end
    rule_re = re.compile(HS_RANGE + r'\s+([A-Z].+?\.)(?=\s+' + HS_RANGE_NC + r'|\s*\Z)', flags=re.DOTALL)
    whitespace_re = re.compile(r'\s+')

    roo = {}
    for section_name, section_body in section_re.findall(roo_text):
        chapters = {}
        for chapter_name, chapter_body in chapter_re.findall(section_body):
            rules = {}
            for found in rule_re.findall(chapter_body):
                # found[0] is the whole code/range, found[3] the rule text
                rules[found[0]] = whitespace_re.sub(' ', found[3])
            chapters[chapter_name] = rules
        roo[section_name] = chapters
    return roo

def get_roo_rules(roo, HS_map=None, complete=False):
    """Create a dictionary which only stores the rules, without the section/chapter structure.

    Arguments:
        roo: nested dictionary {section: {chapter: {HS code or range: rule}}}.
        HS_map: HS table handed to get_hs_codes; required when complete=True.
        complete: when False, the keys of the result are the code/range strings
            found in `roo`. When True, every code/range is expanded with
            get_hs_codes and the rule is stored under each returned code.
    Return:
        Dictionary {HS code or range: rule}.
    Raises:
        ValueError: if complete=True is requested without an HS_map.
    """
    if complete and HS_map is None:
        raise ValueError('HS_map is required when complete=True')

    roo_rules = {}
    # The range regex is only needed to split ranges when expanding them
    pattern_range = re.compile(HS_RANGE) if complete else None
    for chapters in roo.values():
        for rules in chapters.values():
            for hs_code_range, rule in rules.items():
                if not complete:
                    roo_rules[hs_code_range] = rule
                    continue
                # Groups 1 and 2 of the first match are the two bounds of the
                # range (the second one is '' for a single code)
                first_code, last_code = pattern_range.findall(hs_code_range)[0][1:3]
                # get_hs_codes may return None for an unknown range; treat it as
                # "no codes" instead of failing on the iteration
                hs_codes = get_hs_codes(first_code, last_code, HS_map) or []
                for hs_code in hs_codes:
                    roo_rules[hs_code] = rule
    return roo_rules


# Start every agreement with an empty rule set, then fill them in one by one
roo_rules = dict((fta, {}) for fta in roo_texts)
for fta in roo_texts:
    parsed_roo = parse_roo(roo_texts[fta])
    roo_rules[fta] = get_roo_rules(parsed_roo, HS_map_used[fta], complete=True)
    print(fta, len(roo_rules[fta]))

# NOTE!!!
# JPN_PHL, 15.16 - 15.19 -> TYPO??? CHANGE TO 15.18

# For comparison: number of Tier 3 codes in each HS version
for ver, HS_map in HS_maps.items():
    tier3_codes = HS_map.loc[HS_map['Tier'] == 3, 'ProductCode']
    print('HS', ver, ':', tier3_codes.count())

BRN_JPN 5222
CHL_JPN 5212
IDN_JPN 5210
IND_JPN 2474
JPN_MEX 5224
JPN_MYS 5158
JPN_PER 5046
JPN_PHL 5222
JPN_THA 5214
HS 2002 : 5224
HS 2007 : 5053
HS 2012 : 5205
HS 2017 : 6276


In [615]:
#one = set(get_roo_rules(parse_roo(roo_texts['IND_JPN'], get_hs_map('IND_JPN'), complete=True)))
#two = set(get_roo_rules(parse_roo(roo_texts['JPN_PER'], get_hs_map('JPN_PER'), complete=True)))
#sorted(list(two-one))
#roo_texts['IND_JPN']

In [691]:
# NEXT PROJECT: ANALYZE GRAMMAR
# One regex per known wording of a rule. Every HS code placeholder uses the
# non-capturing HS_CODE_NC, since the patterns are only used to test for a match.
# NOTE(review): the final '.' of each pattern is an unescaped regex dot, so it
# matches any character and not only a full stop - confirm whether intended.
patterns = {
    'CC': re.compile(r'A change to (?:heading|subheading) {0}(?: through {0})? from any other chapter.'.format(HS_CODE_NC)),
    'CTH': re.compile(r'A change to (?:heading|subheading) {0}(?: through {0})? from any other heading.'.format(HS_CODE_NC)),
    'CTSH': re.compile(r'A change to (?:heading|subheading) {0}(?: through {0})? from any other subheading.'.format(HS_CODE_NC)),
    'CTH_RVC': re.compile(r'A change to (?:heading|subheading) {0}(?: through {0})? from(?: subheading {0} or)? any other heading, provided there is a regional value content of not less than \d\d? percent.'.format(HS_CODE_NC)),
    'WO': re.compile(r'(?:Manufacture in which all the materials used are wholly obtained.|Goods of (?:heading|subheading) {0}(?: through {0})? are wholly obtained or produced entirely in a Party\,? as defined in Article \d\d?.)'.format(HS_CODE_NC)),
    'RVC': re.compile(r'(?:No required change in tariff classification to (?:heading|subheading) {0}(?: through {0})?, provided(?: that)? there is a (?:qualifying|regional) value content of not less than \d\d? per\s?cent.|A qualifying value content of not less than \d\d? percent.)'.format(HS_CODE_NC)),
    'WO_scrap': re.compile(r'No required change in tariff classification to (?:heading|subheading) {0}(?: through {0})?, provided(?: that)? the waste(?: and scrap are)?(?: is)? wholly obtained or produced entirely in (?:a Party|the Area of one or both Parties|the territory of a Country) as defined in Article \d\d?(?: of Chapter \d\d?)?.'.format(HS_CODE_NC)),
    'CTH_ECT': re.compile(r'A change to (?:heading|subheading) {0}(?: through {0})? from any heading outside that group.'.format(HS_CODE_NC)),
    'CTSH_ECT': re.compile(r'A change to subheading {0}(?: through {0})? from any subheading outside that group.'.format(HS_CODE_NC)),
    'Manufacture_1': re.compile(r'Manufacture from (?:yarns|fibres|fabrics|chemical materials or textile pulps), provided that necessary process stipulated in the Appendix is undertaken.'),
    'Manufacture_2': re.compile(r'Manufacture from (?:yarns|fibres|fabrics|chemical materials or textile pulps).'),

    'pattern_1': re.compile(r'A change to subheading {0} from any classification to subheading {0}, provided that there is a qualifying value content of not less than \d\d? percent.'.format(HS_CODE_NC)),
    'pattern_2': re.compile(r'All the animals of Chapter 1 shall be wholly obtained.')
}

# Subset holding only the change-of-classification rules; taken from `patterns`
# so that the two dictionaries cannot drift apart
patterns_small = {
    roo_type: patterns[roo_type]
    for roo_type in ('CC', 'CTH', 'CTSH', 'CTH_RVC', 'CTH_ECT', 'CTSH_ECT')
}

# For every agreement, count the rules that none of the known patterns matches
for fta, rules in roo_rules.items():
    begin = Counter()
    for rule in rules.values():
        # A rule matched by at least one pattern needs no further attention
        if any(pattern.search(rule) for pattern in patterns.values()):
            continue
        # Tally the remaining rules by their first 20 characters
        begin[rule[:20]] += 1
    print(fta, sum(begin.values()))
    #print(begin, end='\n\n')
    #print(begin, end='\n\n')
    

BRN_JPN 4
CHL_JPN 4
IDN_JPN 7
IND_JPN 7
JPN_MEX 57
JPN_MYS 9
JPN_PER 4
JPN_PHL 12
JPN_THA 7


In [685]:
# Assuming complete=True
# For every HS code of every agreement, look up in the expanded map the entry of
# the chapter (2 digits), heading (4 digits) or subheading (6 digits) that the
# matching change-of-classification rule refers to.
final_roo = {fta: {} for fta in roo_rules}
for fta in roo_rules:
    # NOTE(review): get_hs_map_ex is not defined in the visible cells; presumably
    # it returns the expand_map() lookup of the HS version used by `fta` - confirm.
    hs_map_ex = get_hs_map_ex(fta)

    for hs_code, rule in roo_rules[fta].items():
        for roo_type, pattern in patterns_small.items():
            if not pattern.search(rule):
                continue
            if roo_type == 'CC':
                final_roo[fta][hs_code] = hs_map_ex[hs_code[:2]]
            elif roo_type in ('CTH', 'CTH_RVC'):
                final_roo[fta][hs_code] = hs_map_ex[hs_code[:4]]
            elif roo_type == 'CTSH':
                final_roo[fta][hs_code] = hs_map_ex[hs_code[:6]]
            elif roo_type in ('CTH_ECT', 'CTSH_ECT'):
                # TODO: "outside that group" rules are not handled yet. These
                # branches had no body (a SyntaxError); no entry is stored for
                # them until the intended lookup is decided.
                pass

In [689]:
# Number of HS codes of JPN_MYS that received an entry in final_roo
len(final_roo['JPN_MYS'])

4835

In [434]:
# TRY NEW ONE
# Count types of RoO in every FTA
roo_types_count = {}
pattern_chapter = re.compile(r'A change to.+?from any other chapter', flags=re.DOTALL)
pattern_heading = re.compile(r'A change to.+?from any other heading', flags=re.DOTALL)
pattern_subheading = re.compile(r'A change to.+?from any other subheading', flags=re.DOTALL)

# (label, test) pairs tried in this order; the first test that succeeds labels the rule
roo_type_checks = (
    # Chapter level
    ('chapter', pattern_chapter.search),
    # Heading level
    ('heading', pattern_heading.search),
    # Subheading level
    # Note: There are a few specific rules related to chemicals (e.g. alkali metal; "or any other subheading")
    ('subheading', pattern_subheading.search),
    # "Wholly obtained"
    ('wholly', lambda text: 'are wholly' in text),
    # Manufacture criteria?
    ('manufacture', lambda text: 'Manufacture from' in text),
    # Value added?
    ('value', lambda text: 'A qualifying value content of not less than' in text or 'value content' in text),
    # No rules imposed, unless...
    ('no rule', lambda text: 'No required change in tariff' in text or 'No change in tariff' in text),
)

for fta, rules in roo_rules.items():
    freq = Counter()
    for rule in rules.values():
        # Rules that pass none of the tests are other notes, counted under 'else'
        label = next((name for name, test in roo_type_checks if test(rule)), 'else')
        freq[label] += 1

    roo_types_count[fta] = freq
    print(fta)
    pprint.pprint(freq, width=35)

BRN_JPN
Counter({'subheading': 3240,
         'chapter': 1392,
         'heading': 226,
         'no rule': 197,
         'else': 97,
         'wholly': 66,
         'value': 4})
CHL_JPN
Counter({'heading': 2265,
         'chapter': 1796,
         'subheading': 727,
         'else': 346,
         'wholly': 35,
         'no rule': 25,
         'value': 18})
IDN_JPN
Counter({'subheading': 2660,
         'chapter': 1562,
         'heading': 746,
         'no rule': 163,
         'else': 70,
         'wholly': 9})
IND_JPN
Counter()
JPN_MEX
Counter({'chapter': 2422,
         'heading': 1672,
         'subheading': 517,
         'else': 507,
         'value': 49})
JPN_MYS
Counter({'subheading': 2662,
         'chapter': 1351,
         'heading': 828,
         'no rule': 200,
         'else': 79,
         'value': 29})
JPN_PER
Counter()
JPN_PHL
Counter({'chapter': 1917,
         'heading': 1851,
         'subheading': 1081,
         'no rule': 200,
         'wholly': 79,
         'else': 61,


In [437]:
# Share of chapter / heading / subheading level rules in every agreement
print('FTA\t\tChapter\t\tHeading\t\tSubheading\tOther\t\tTotal')
for fta, rules in roo_rules.items():
    total = len(rules)
    counts = roo_types_count.get(fta)
    # Explicit guards instead of a bare `except`: skip agreements that have no
    # rules (the shares would divide by zero) or that were never counted
    if total == 0 or counts is None:
        continue
    chapter = counts['chapter'] / total
    heading = counts['heading'] / total
    subheading = counts['subheading'] / total
    # Everything that is not a change of chapter / heading / subheading
    other = 1 - (chapter + heading + subheading)
    print('{}\t\t{:2.2%}\t\t{:2.2%}\t\t{:2.2%}\t\t{:2.2%}\t\t{}'.format(fta, chapter, heading, subheading, other, total))

FTA		Chapter		Heading		Subheading	Other		Total
BRN_JPN		26.69%		9.92%		62.05%		1.34%		5222
CHL_JPN		34.46%		49.46%		14.58%		1.50%		5212
IDN_JPN		29.98%		18.79%		51.06%		0.17%		5210
JPN_MEX		47.42%		41.86%		10.45%		0.27%		5167
JPN_MYS		26.24%		21.50%		51.70%		0.56%		5149
JPN_PHL		36.71%		40.44%		20.70%		2.14%		5222
JPN_THA		32.05%		47.10%		19.10%		1.75%		5214


In [443]:
# OLD ONE
print('FTA\t\tChapter\t\tHeading\t\tSubheading\tOther\t\tTotal')
for fta, rules in roo_rules.items():
    total = len(rules)
    if total == 0:
        # An agreement without any rule has no shares to report (and would
        # otherwise raise ZeroDivisionError)
        continue
    counts = roo_types_count[fta]
    chapter = counts['chapter'] / total
    heading = counts['heading'] / total
    subheading = counts['subheading'] / total
    # Everything that is not a change of chapter / heading / subheading
    other = 1 - (chapter + heading + subheading)
    print('{}\t\t{:2.2%}\t\t{:2.2%}\t\t{:2.2%}\t\t{:2.2%}\t\t{}'.format(fta, chapter, heading, subheading, other, total))

FTA		Chapter		Heading		Subheading	Other		Total
BRN_JPN		38.89%		15.56%		36.39%		9.17%		360
CHL_JPN		28.06%		45.20%		22.30%		4.43%		677
IDN_JPN		51.22%		21.14%		27.37%		0.27%		369
IND_JPN		8.22%		73.74%		0.27%		17.77%		377
JPN_MEX		22.38%		58.54%		18.02%		1.06%		849
JPN_MYS		41.96%		29.76%		26.19%		2.08%		336
JPN_PER		32.88%		40.36%		19.27%		7.48%		441
JPN_PHL		29.09%		37.48%		27.13%		6.29%		715
JPN_THA		25.45%		41.36%		28.08%		5.12%		723
