- This program merges OEC (with opportunity variables) and GTA dataset

# TODO

- OK Investigate left-only merge

- Investigate right-only merge

# Loading packages

In [3]:
import numpy as np
import pandas as pd
import os
from google.colab import drive

# Setting the ambience

In [4]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
work_dir                            = '/content/gdrive/My Drive/ip_complexity/create_finaldataset'
output_dir                          = '/content/gdrive/My Drive/ip_complexity/create_finaldataset/output'
create_dataset_gta_dir              = '/content/gdrive/My Drive/ip_complexity/create_dataset_gta/output/data'
create_variables_opportunity_dir    = '/content/gdrive/My Drive/ip_complexity/oec_data/create_variables_opportunity/output/data'

In [6]:
pd.set_option('display.float_format', '{:.4f}'.format) # only shows 2 decimal numbers

# Oppening OEC dataset

In [7]:
df_oec = pd.read_csv(create_variables_opportunity_dir + "/dataset_oec_with_opportunity.csv")

In [8]:
# Convert 'hs4_id' column to string and fill with zeros until it reachs 4 digits
df_oec["hs4_id"] = df_oec["hs4_id"].astype(str).str.zfill(4)

In [9]:
df_oec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3882912 entries, 0 to 3882911
Data columns (total 14 columns):
 #   Column               Dtype  
---  ------               -----  
 0   Unnamed: 0           int64  
 1   year                 int64  
 2   country_id           object 
 3   country              object 
 4   country_eci          float64
 5   hs4_id               object 
 6   hs4_name             object 
 7   hs4_pci              float64
 8   tv_relatedness       float64
 9   tv_rca               float64
 10  opportunity_index_1  float64
 11  opportunity_index_2  float64
 12  opportunity_index_3  float64
 13  tv_rca_bigger_1      int64  
dtypes: float64(7), int64(3), object(4)
memory usage: 414.7+ MB


In [10]:
df_oec = df_oec.sort_values(by = 'hs4_id')

# Oppening GTA dataset

In [11]:
df_gta = pd.read_csv(create_dataset_gta_dir + "/dataset_gta_intervention_product.csv", sep = ";", dtype= str)

## Cleaning

In [12]:
# Define the renaming function
rename_func = lambda x: x.lower().replace(' ', '_')
# Rename the columns using the function
df_gta = df_gta.rename(columns=rename_func)

In [13]:
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6574565 entries, 0 to 6574564
Data columns (total 11 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   state_act_id               object
 1   intervention_id            object
 2   state_act_title            object
 3   announcement_date          object
 4   gta_evaluation             object
 5   currently_in_force         object
 6   implementing_jurisdiction  object
 7   intervention_type          object
 8   mast_chapter               object
 9   affected_sectors           object
 10  affected_product           object
dtypes: object(11)
memory usage: 551.8+ MB


In [14]:
df_gta = df_gta.rename(columns = {"implementing_jurisdiction": "country",
                         "affected_product": "hs4_id"
                        })

In [15]:
# Extracting numbers before "."
df_gta['hs4_id'] = df_gta['hs4_id'].str.split('.').str[0]


In [16]:
# Convert "announcement_data" column to date type
df_gta["announcement_date"] = pd.to_datetime(df_gta["announcement_date"])

# Create a variable "year" with the year from "announcement_data"
df_gta["year"] = df_gta["announcement_date"].dt.year

In [17]:
# Extract the first four digits from 'hs4_id' to merge with OEC data
df_gta["hs4_id"] = df_gta["hs4_id"].astype(str).str[:4]

In [18]:
# drop duplicates
df_gta = df_gta.drop_duplicates()

## Normalizing countries' names

In [19]:
# Extract unique country names from both datasets
oec_countries = set(df_oec['country'])
gta_countries = set(df_gta['country'])

# Find the countries with different names
different_countries = oec_countries.symmetric_difference(gta_countries)

# Sort the different countries in alphabetical order
sorted_countries = sorted(different_countries)

In [20]:
# Create a mapping dictionary for country renaming
country_mapping = {
    'United States of America': 'United States',
    'Republic of Korea': 'South Korea',
    'Republic of the Sudan': 'Sudan',
    'Congo': 'Republic of the Congo',
    #'': 'North Korea',
    'DR Congo': 'Democratic Republic of the Congo',
    'Republic of Moldova': 'Moldova',
    'Lao': 'Laos',
    'Ivory Coast': "Cote d'Ivoire'",
    'Myanmar': 'Burma',
    'Bosnia & Herzegovina': 'Bosnia and Herzegovina'
    }

    # Add more country mappings as needed

df_gta['country'] = df_gta['country'].apply(lambda x: country_mapping.get(x, x))

In [21]:
# Drop observations that I could not normalize between both datasets
df_gta = df_gta[~(df_gta['country'].isin(gta_countries) & df_gta['country'].isin(different_countries))]

In [22]:
# Drop observations that affect no product
df_gta = df_gta[df_gta["hs4_id"] != '0nan']
df_gta = df_gta[df_gta["hs4_id"] != '000n']
df_gta = df_gta[df_gta["hs4_id"] != 'nan']

In [23]:
df_gta.hs4_id.unique()

array(['2710', '6302', '6303', ..., '8134', '8135', '8140'], dtype=object)

In [24]:
len(np.sort(df_gta.hs4_id.unique()))

1450

In [25]:
len(df_oec.hs4_id.unique()) # In fact, some products have no data on OEC about PCI

1062

In [26]:
# Extract unique country names from both datasets
oec_products = set(df_oec['hs4_id'])
gta_products = set(df_gta['hs4_id'])

# Find the countries with different names
different_products = oec_products.symmetric_difference(gta_products)
#the different countries in alphabetical order
sorted_products = sorted(different_products)

In [27]:
# Droping products that will not merge
df_oec = df_oec[~df_oec["hs4_id"].isin(different_products)]
df_gta = df_gta[~df_gta["hs4_id"].isin(different_products)]

# Merging

In [28]:
df = pd.merge(df_gta, df_oec, on=['hs4_id', 'year', 'country'], how = "outer", indicator = True)

In [29]:
df = df[df["year"] < 2022]

In [30]:
df["_merge"].value_counts() # right_only means that a product p was not protected or liberalized by country c at year j or that OEC do not have data on that product (this happens because of PCI). It is ok to drop, since we're interested only in the products that are target by policies


right_only    3549583
both          2831527
left_only          50
Name: _merge, dtype: int64

- Left_only
  - Maybe left_only this is happening because Country c enacted an act that harms product p, but Country c do not exports product p. I'll try to find an example.

In [31]:
df = df[df["_merge"] != "right_only"]

In [32]:
df[(df["_merge"] == "left_only")].year.value_counts()

2019    18
2009    13
2015    11
2010     2
2011     2
2013     1
2012     1
2016     1
2020     1
Name: year, dtype: int64

In [33]:
df[(df["_merge"] == "left_only") & (df["country"] == "Brazil")].year.value_counts()

Series([], Name: year, dtype: int64)

In [34]:
df[(df["_merge"] == "left_only") & (df['gta_evaluation'] == "Amber")].country.value_counts().head(50)

Series([], Name: country, dtype: int64)

- This list is very interesting! If my interpretation is right, these countries are using harmfull protection against products that they don't even export!

In [35]:
df[(df["_merge"] == "left_only")].hs4_id.value_counts().head(50)

6304    5
8702    4
1803    3
1804    3
0303    3
1511    3
3808    3
1801    3
8712    2
1805    2
1806    2
3920    2
2711    2
0203    1
0403    1
8543    1
0207    1
0202    1
0101    1
0301    1
0206    1
0105    1
0104    1
0103    1
0102    1
0106    1
Name: hs4_id, dtype: int64

In [36]:
df[(df["_merge"] == "left_only") & (df['gta_evaluation'] == "Red")].T

Unnamed: 0,1613768,1613770,1613771,1613772,1645268,1645269,1645278,1645572,2296443,2296444,...,2296446,2296447,2296448,2296449,2296450,2296451,2296452,2296453,2296454,2936889
state_act_id,34518,34518,34518,34518,34732,34732,34743,34769,45839,45839,...,45839,45839,45839,45839,45839,45839,45839,45839,45839,63275
intervention_id,110702,110702,110702,110702,67860,67860,67941,68270,82034,82034,...,82034,82034,82034,82034,82034,82034,82034,82034,82034,102892
state_act_title,Cote d'Ivoire: Import tariff changes in 2015,Cote d'Ivoire: Import tariff changes in 2015,Cote d'Ivoire: Import tariff changes in 2015,Cote d'Ivoire: Import tariff changes in 2015,Cote d'Ivoire: Import tariff changes in 2011,Cote d'Ivoire: Import tariff changes in 2011,Cote d'Ivoire: Import tariff changes in 2012,Cote d'Ivoire: Import tariff changes in 2016,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,...,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Import tariff changes in 2019,Ivory Coast: Hydro-alcoholic gels export ban
announcement_date,2015-01-01 00:00:00,2015-01-01 00:00:00,2015-01-01 00:00:00,2015-01-01 00:00:00,2011-12-31 00:00:00,2011-12-31 00:00:00,2012-12-31 00:00:00,2016-12-31 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,...,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2019-01-01 00:00:00,2020-03-24 00:00:00
gta_evaluation,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,...,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red
currently_in_force,no,no,no,no,yes,yes,no,yes,yes,yes,...,yes,yes,yes,yes,yes,yes,yes,yes,yes,no
country,Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',...,Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire',Cote d'Ivoire'
intervention_type,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,...,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Import tariff,Export ban
mast_chapter,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,...,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,Tariff measures,"P3 Export licences, export quotas, export proh..."
affected_sectors,211,211,211,211,334,363,499,216,"212, 271, 469, 491",271,...,"212, 271, 469, 491","212, 491","271, 491","212, 271, 469, 491",491,212,"212, 491","212, 271, 469, 491",222,346


In [37]:
#df = df.drop(columns = '_merge')

In [38]:
#Drop the DataFrame from memory
del df_gta
del df_oec

In [39]:
df.describe()

Unnamed: 0.1,year,Unnamed: 0,country_eci,hs4_pci,tv_relatedness,tv_rca,opportunity_index_1,opportunity_index_2,opportunity_index_3,tv_rca_bigger_1
count,2831577.0,2831527.0,2830644.0,2817610.0,2831527.0,2831527.0,2817610.0,2817610.0,2817610.0,2831527.0
mean,2015.7483,1385699.7301,0.915,0.2091,0.3449,1.3733,0.5539,0.2966,0.3177,0.3834
std,3.6411,1356115.953,0.5551,1.0168,0.1324,3.5284,1.026,0.3471,0.2297,0.4862
min,2007.0,55.0,-2.3705,-3.5582,0.0013,0.0,-3.4778,-1.1232,-0.6528,0.0
25%,2013.0,197360.0,0.5183,-0.5748,0.2588,0.264,-0.2329,0.0329,0.1515,0.0
50%,2015.0,582528.0,0.9741,0.4386,0.3664,0.6921,0.7649,0.3513,0.3302,0.0
75%,2019.0,2985021.5,1.3951,1.0044,0.4415,1.4835,1.3504,0.5649,0.489,1.0
max,2021.0,3696247.0,2.2607,3.1534,0.9479,639.2207,3.6705,1.3819,1.0543,1.0


# Saving

In [40]:
# Specify the desired column order
column_order = ['year', 'country', 'country_id', 'country_eci', 'state_act_id', 'intervention_id','gta_evaluation']  # Add more variables as needed

# Reorder the DataFrame columns
df = df[column_order + list(df.columns.difference(column_order))]

In [41]:
df = df.drop(columns = ['tv_rca_bigger_1', 'announcement_date'])

In [42]:
df.to_csv(output_dir + '/data/oec_gta_dataset.csv', index = False)