In [2]:
from collections import Counter
import os
import re

import pprint

In [3]:
# Read RoO data: one text file per FTA, keyed by the file name without its extension
roo_text = {}

roo_folder = 'RoO Non-table/roo_clean'
# os.listdir() returns entries in arbitrary order; sorting keeps the FTA order
# (and therefore every table printed from this dict) identical across runs.
for filename in sorted(os.listdir(roo_folder)):
    # NOTE(review): 'ANSI' appears to resolve only on Windows (alias of the
    # 'mbcs' codec); elsewhere open() would raise LookupError. Confirm the
    # files' real code page before running this on another platform.
    with open(os.path.join(roo_folder, filename), mode='r', encoding='ANSI') as f:
        # splitext drops whatever extension the file has instead of assuming
        # it is exactly four characters long.
        fta_name = os.path.splitext(filename)[0]
        # En dashes are replaced by plain hyphens in the stored text.
        roo_text[fta_name] = f.read().replace('–', '-')

In [4]:
# Count the "Section <roman numeral>" occurrences in each agreement's text
# Apparently India is missing some sections: X, XIV, XIX, XXI
pattern_count = re.compile('Section [IVX]{1,5}')
for fta in roo_text:
    n_sections = sum(1 for _ in pattern_count.finditer(roo_text[fta]))
    print(fta, n_sections)

# Clean some misconverted HS codes
def addZero(matchobj):
    """Return the matched text with a '0' inserted before its last character.

    Intended as a ``re.sub`` callback: a match such as ' 21.6' (a wrong HS
    code) is rewritten to ' 21.06'. Any leading whitespace captured by the
    match is kept as is.
    """
    matched = matchobj.group(0)
    head, last_digit = matched[:-1], matched[-1]
    return '{}0{}'.format(head, last_digit)

# A whitespace character, one or two digits, a dot and exactly one trailing
# digit: the shape of an HS code that lost the zero after the dot.
pattern_zero = re.compile(r'\s\d{1,2}\.\d\b')
for fta in list(roo_text):
    # Overwrite each text with its repaired version (keys are unchanged)
    roo_text[fta] = pattern_zero.sub(addZero, roo_text[fta])

BRN_JPN 21
CHL_JPN 21
IDN_JPN 21
IND_JPN 17
JPN_MEX 21
JPN_MYS 21
JPN_PER 21
JPN_PHL 21
JPN_THA 21


In [5]:
# Capture sections
# NOTE(review): the lookahead ends a section body at ANY literal "Section",
# not only at a numbered header - confirm the texts never use the word inside
# a section body.
pattern_section = re.compile(r'(Section [IVX]{1,5})\s+(.+?)(?=Section|\Z)', flags=re.DOTALL)

# Dictionary storing the whole structure of RoO.
# At this stage: FTA -> {section header -> raw text of that section}.
# findall() yields (header, body) pairs; dict() keeps the last body when a
# header occurs more than once.
roo = {
    fta: dict(pattern_section.findall(text))
    for fta, text in roo_text.items()
}

# Capture chapters in every section: each section's raw text is replaced by a
# {chapter header -> raw chapter text} mapping.
pattern_chapter = re.compile(r'(Chapter \d{1,2})\s+(.+?)(?=Chapter|\Z)', flags=re.DOTALL)
for sections in roo.values():
    for section_name in list(sections):
        # findall() yields (header, body) pairs; a repeated header keeps its last body
        sections[section_name] = dict(pattern_chapter.findall(sections[section_name]))

# Capture rules in every chapter: each chapter's raw text is replaced by a
# {HS code -> rule text} mapping.
# A rule is an HS code (or two codes joined by '-'), whitespace, then text that
# starts with a capital letter and ends at a full stop followed by either the
# next HS code or the end of the chapter text.
HS_CODE = r'(\d+\.\d+(?:\-\d+\.\d+)?)'
pattern_rule = re.compile(HS_CODE + r'\s+([A-Z].+?\.)(?=\s+' + HS_CODE + r'|\s*\Z)', flags=re.DOTALL)
for sections in roo.values():
    for chapters in sections.values():
        for chapter_name in list(chapters):
            # HS_CODE appears twice in the pattern, so findall() yields 3-tuples:
            # (code, rule text, code seen by the lookahead); the third is unused.
            found = pattern_rule.findall(chapters[chapter_name])
            chapters[chapter_name] = {code: text for code, text, _following in found}

# Create a dictionary which only stores the rules without additional structures:
# FTA -> {HS code -> rule text}. If an HS code occurs in several chapters of
# the same FTA, the rule text seen last is the one kept.
roo_rules = {}
for fta, sections in roo.items():
    flat_rules = {}
    for chapters in sections.values():
        for chapter_rules in chapters.values():
            flat_rules.update(chapter_rules)
    roo_rules[fta] = flat_rules

# Number of rules captured per FTA
for fta, rules in roo_rules.items():
    print(fta, len(rules))

BRN_JPN 357
CHL_JPN 665
IDN_JPN 348
IND_JPN 377
JPN_MEX 814
JPN_MYS 336
JPN_PER 444
JPN_PHL 716
JPN_THA 726


In [10]:
# Count types of RoO in every FTA

# Ordered (label, phrases) pairs. A rule receives the label of the FIRST entry
# that has at least one phrase occurring in it, so 'chapter' is tested before
# 'heading', 'heading' before 'subheading', and so on down the table.
_RULE_TYPE_PHRASES = (
    # Chapter level
    ('chapter', ('any other chapter', 'from any chapter')),
    # Heading level ('from any heading' also covers 'from any heading outside that group')
    ('heading', ('any other heading', 'from any heading')),
    # Subheading level
    # Note: There are a few specific rules related to chemicals (e.g. alkali metal; "or any other subheading")
    ('subheading', ('any other subheading', 'subheading outside that group')),
    # "Wholly obtained"
    ('wholly', ('are wholly',)),
    # Manufacture criteria?
    ('manufacture', ('Manufacture from',)),
    # Value added? ('value content' also covers 'A qualifying value content of not less than')
    ('value', ('value content',)),
    # No rules imposed, unless...
    ('no rule', ('No required change in tariff', 'No change in tariff')),
)


def _classify_rule(rule):
    """Return the RoO type label for one rule text.

    Runs of whitespace (including line breaks) are collapsed to single spaces
    before matching, so a phrase that is wrapped across lines at any point is
    still recognised - not only the 'any\\nother ...' wraps.

    Parameters:
        rule: the text of a single rule.

    Returns:
        One of 'chapter', 'heading', 'subheading', 'wholly', 'manufacture',
        'value', 'no rule', or 'else' when no phrase matches.
    """
    normalized = ' '.join(rule.split())
    for label, phrases in _RULE_TYPE_PHRASES:
        if any(phrase in normalized for phrase in phrases):
            return label
    # Other notes
    return 'else'


roo_types_count = {}

for fta in roo_rules:
    # One label per rule; the HS codes themselves are not needed for counting
    freq = Counter(_classify_rule(rule) for rule in roo_rules[fta].values())
    roo_types_count[fta] = freq
    print(fta)
    pprint.pprint(freq, width=35)

BRN_JPN
Counter({'chapter': 136,
         'subheading': 131,
         'heading': 56,
         'wholly': 31,
         'value': 3})
CHL_JPN
Counter({'heading': 303,
         'chapter': 181,
         'subheading': 149,
         'wholly': 23,
         'no rule': 8,
         'value': 1})
IDN_JPN
Counter({'chapter': 168,
         'subheading': 101,
         'heading': 78,
         'wholly': 1})
IND_JPN
Counter({'heading': 278,
         'wholly': 44,
         'chapter': 31,
         'manufacture': 23,
         'subheading': 1})
JPN_MEX
Counter({'heading': 464,
         'chapter': 190,
         'subheading': 151,
         'value': 8,
         'else': 1})
JPN_MYS
Counter({'chapter': 140,
         'heading': 100,
         'subheading': 88,
         'value': 8})
JPN_PER
Counter({'heading': 181,
         'chapter': 145,
         'subheading': 85,
         'wholly': 28,
         'value': 5})
JPN_PHL
Counter({'heading': 268,
         'chapter': 208,
         'subheading': 198,
         'wholly': 36,

In [9]:
# Share of chapter / heading / subheading rules per FTA; "Other" is every rule
# not counted under one of those three labels.
print('FTA\t\tChapter\t\tHeading\t\tSubheading\tOther\t\tTotal')
for fta, rules in roo_rules.items():
    total = len(rules)
    counts = roo_types_count[fta]
    # An FTA with no parsed rules would otherwise raise ZeroDivisionError;
    # dividing by 1 makes every share print as 0.00% next to a Total of 0.
    denominator = total or 1
    chapter = counts['chapter'] / denominator
    heading = counts['heading'] / denominator
    subheading = counts['subheading'] / denominator
    # Take the remainder on the integer counts and divide once, rather than
    # subtracting three rounded floats from 1 (which can leave a tiny
    # floating-point residue instead of an exact share).
    other = (total - counts['chapter'] - counts['heading'] - counts['subheading']) / denominator
    print('{}\t\t{:2.2%}\t\t{:2.2%}\t\t{:2.2%}\t\t{:2.2%}\t\t{}'.format(fta, chapter, heading, subheading, other, total))

FTA		Chapter		Heading		Subheading	Other		Total
BRN_JPN		38.10%		15.69%		36.69%		9.52%		357
CHL_JPN		27.22%		45.56%		22.41%		4.81%		665
IDN_JPN		48.28%		22.41%		29.02%		0.29%		348
IND_JPN		8.22%		73.74%		0.27%		17.77%		377
JPN_MEX		23.34%		57.00%		18.55%		1.11%		814
JPN_MYS		41.67%		29.76%		26.19%		2.38%		336
JPN_PER		32.66%		40.77%		19.14%		7.43%		444
JPN_PHL		29.05%		37.43%		27.65%		5.87%		716
JPN_THA		25.34%		41.87%		28.24%		4.55%		726
