In [1]:
# This notebook merges the OEC dataset (with opportunity variables) with the GTA dataset

# TODO

- Investigate the `left_only` rows of the merge (GTA records with no matching OEC observation)

# Loading packages

In [2]:
import numpy as np
import pandas as pd
import os
from google.colab import drive

# Setting up the environment

In [3]:
# Mount Google Drive in the Colab runtime so the project folders under
# '/content/gdrive' can be read and written by the cells below.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
# Project folders on the mounted Google Drive.
# Every location is derived from one project root so the paths cannot drift
# apart when the project is moved; the resulting strings are identical to the
# literals that were previously repeated on each line.
project_root_dir                    = '/content/gdrive/My Drive/ip_complexity'

work_dir                            = project_root_dir + '/create_finaldataset'
output_dir                          = work_dir + '/output'
create_dataset_gta_dir              = project_root_dir + '/create_dataset_gta/output/data'
create_variables_opportunity_dir    = project_root_dir + '/oec_data/create_variables_opportunity/output/data'

# Opening OEC dataset

In [5]:
# Load the OEC dataset that already carries the opportunity variables.
oec_csv_path = create_variables_opportunity_dir + "/dataset_oec_with_opportunity.csv"
df_oec = pd.read_csv(oec_csv_path)

In [6]:
# Store 'hs4_id' as text, left-padded with zeros until it reaches a width of
# 4 characters, so it has the same shape as the merge key built for the GTA data.
oec_hs4_text = df_oec["hs4_id"].astype(str)
df_oec["hs4_id"] = oec_hs4_text.str.zfill(4)

In [7]:
# Print the column names, dtypes and memory usage of the OEC frame.
df_oec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3882912 entries, 0 to 3882911
Data columns (total 13 columns):
 #   Column               Dtype  
---  ------               -----  
 0   year                 int64  
 1   country_id           object 
 2   country              object 
 3   country_eci          float64
 4   hs4_id               object 
 5   hs4_name             object 
 6   hs4_pci              float64
 7   tv_relatedness       float64
 8   tv_rca               float64
 9   opportunity_index_1  float64
 10  opportunity_index_2  float64
 11  opportunity_index_3  float64
 12  tv_rca_bigger_1      int64  
dtypes: float64(7), int64(2), object(4)
memory usage: 385.1+ MB


In [8]:
# Order the OEC rows by product code.
df_oec = df_oec.sort_values('hs4_id')

# Opening GTA dataset

In [9]:
# Load the GTA intervention/product file; it is ';'-delimited.
# NOTE(review): the recorded output of this cell echoes the read_csv line, which
# is how Python prints a warning. It may be a mixed-dtype warning; confirm, and
# consider passing an explicit dtype for the product-code column.
gta_csv_path = create_dataset_gta_dir + "/dataset_gta_intervention_product.csv"
df_gta = pd.read_csv(gta_csv_path, sep=";")

  df_gta = pd.read_csv(create_dataset_gta_dir + "/dataset_gta_intervention_product.csv", sep = ";")


In [10]:
def rename_func(x):
    """Return the label in lower case with every space replaced by an underscore."""
    return x.lower().replace(' ', '_')


# Normalise every column label of the GTA frame with the function above.
df_gta = df_gta.rename(columns=rename_func)

In [11]:
# Print the column names, dtypes and memory usage of the GTA frame after the
# column labels were normalised.
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6574565 entries, 0 to 6574564
Data columns (total 11 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   state_act_id               int64 
 1   intervention_id            int64 
 2   state_act_title            object
 3   announcement_date          object
 4   gta_evaluation             object
 5   currently_in_force         object
 6   implementing_jurisdiction  object
 7   intervention_type          object
 8   mast_chapter               object
 9   affected_sectors           object
 10  affected_product           object
dtypes: int64(2), object(9)
memory usage: 551.8+ MB


In [12]:
# Give the GTA columns the same names as the OEC columns they are merged on.
gta_to_oec_names = {
    "implementing_jurisdiction": "country",
    "affected_product": "hs4_id",
}
df_gta = df_gta.rename(columns=gta_to_oec_names)

In [13]:
# Parse "announcement_date" into a datetime column.
announcement_ts = pd.to_datetime(df_gta["announcement_date"])
df_gta["announcement_date"] = announcement_ts

# Derive "year" from the parsed "announcement_date".
df_gta["year"] = announcement_ts.dt.year

In [14]:
# Keep only the first four characters of the product code so it can be matched
# against the OEC 'hs4_id'.
# NOTE(review): this assumes the text form of the code still has its leading
# zero. If a longer code lost it (e.g. was parsed as a number), the first four
# characters are not the HS4 prefix. Verify against the raw file, since this
# could contribute to the unmatched (left_only) rows.
df_gta["hs4_id"] = df_gta["hs4_id"].astype(str).str.slice(stop=4)

In [15]:
# Left-pad the GTA 'hs4_id' text with zeros until it reaches a width of 4 characters.
gta_hs4_text = df_gta["hs4_id"].astype(str)
df_gta["hs4_id"] = gta_hs4_text.str.zfill(4)

In [16]:
# Print the GTA frame structure after the renames and the new 'year' column.
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6574565 entries, 0 to 6574564
Data columns (total 12 columns):
 #   Column              Dtype         
---  ------              -----         
 0   state_act_id        int64         
 1   intervention_id     int64         
 2   state_act_title     object        
 3   announcement_date   datetime64[ns]
 4   gta_evaluation      object        
 5   currently_in_force  object        
 6   country             object        
 7   intervention_type   object        
 8   mast_chapter        object        
 9   affected_sectors    object        
 10  hs4_id              object        
 11  year                int64         
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 601.9+ MB


In [17]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
df_gta = df_gta.drop_duplicates(keep="first")

In [18]:
# Print the GTA frame structure again to see how many rows remain after
# dropping duplicates.
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3469498 entries, 0 to 6574564
Data columns (total 12 columns):
 #   Column              Dtype         
---  ------              -----         
 0   state_act_id        int64         
 1   intervention_id     int64         
 2   state_act_title     object        
 3   announcement_date   datetime64[ns]
 4   gta_evaluation      object        
 5   currently_in_force  object        
 6   country             object        
 7   intervention_type   object        
 8   mast_chapter        object        
 9   affected_sectors    object        
 10  hs4_id              object        
 11  year                int64         
dtypes: datetime64[ns](1), int64(3), object(8)
memory usage: 344.1+ MB


# Merging 

In [19]:
# Outer-join GTA (left) with OEC (right) on product, year and country.
# indicator=True adds a '_merge' column telling which side each row came from.
merge_keys = ['hs4_id', 'year', 'country']
df = df_gta.merge(df_oec, how="outer", on=merge_keys, indicator=True)

In [20]:
# Count the rows by merge origin.
# 'right_only' means that a product p was not protected/liberalized by country c
# in year j. It is ok to drop those rows, since we are only interested in the
# products that are targeted by policies.
df["_merge"].value_counts()


right_only    3608565
both          2426825
left_only     1042673
Name: _merge, dtype: int64

In [33]:
# Number of GTA-only (unmatched) rows per year.
df.loc[df["_merge"] == "left_only", "year"].value_counts()

2022    300281
2015    147825
2020     92025
2014     79701
2021     60379
2013     50221
2018     49655
2019     41702
2010     40614
2012     40484
2011     38559
2023     26881
2016     25188
2009     24619
2017     20252
2008      4287
Name: year, dtype: int64

In [34]:
# Number of GTA-only (unmatched) rows per country.
df.loc[df["_merge"] == "left_only", "country"].value_counts()

United States of America    237061
France                       84611
Germany                      63064
Spain                        61911
Italy                        49530
                             ...  
Bahamas                          1
Turkmenistan                     1
Guam                             1
New Caledonia                    1
Faeroe Islands                   1
Name: country, Length: 196, dtype: int64

In [21]:
# Discard the rows that exist only in the OEC data; rows found in both sources
# and rows found only in GTA are kept.
df = df.loc[df["_merge"].ne("right_only")]

In [22]:
#df = df.drop(columns = '_merge')

In [23]:
# Release the two input frames; only the merged frame is used from here on.
del df_gta, df_oec

In [24]:
# Build the protectionist / liberal dummies from 'gta_evaluation':
#   'Red'   -> protectionist = 1, liberal = 0
#   'Green' -> protectionist = 0, liberal = 1
#   any other value (including missing) -> NaN in both columns
# Series.map with a dict is vectorised and leaves unmapped values as NaN, so it
# yields the same columns as a row-wise apply(lambda ...) without calling a
# Python function for every row.
df['protectionist'] = df['gta_evaluation'].map({'Red': 1, 'Green': 0})
df['liberal'] = df['gta_evaluation'].map({'Red': 0, 'Green': 1})

In [26]:
# Summary statistics of the numeric columns of the merged frame.
df.describe()

Unnamed: 0,state_act_id,intervention_id,year,country_eci,hs4_pci,tv_relatedness,tv_rca,opportunity_index_1,opportunity_index_2,opportunity_index_3,tv_rca_bigger_1,protectionist,liberal
count,3469498.0,3469498.0,3469498.0,2425985.0,2413428.0,2426825.0,2426825.0,2413428.0,2413428.0,2413428.0,2426825.0,3338299.0,3338299.0
mean,39522.06,72156.23,2016.323,0.850253,0.3417798,0.3397625,1.360237,0.1176235,0.03177675,0.05815142,0.3851444,0.7377727,0.2622273
std,21783.52,32484.83,3.905801,0.5384924,0.966703,0.1375071,3.426373,0.3601785,4.330025,1.355858,0.4866295,0.4398456,0.4398456
min,53.0,5688.0,2007.0,-2.343338,-3.558203,0.001312343,0.0,-1.860406,-491.182,-146.3796,0.0,0.0,0.0
25%,26511.0,58720.0,2014.0,0.4840182,-0.3286398,0.2411988,0.2576924,-0.09031292,-0.08652249,-0.02299402,0.0,0.0,0.0
50%,44370.0,81019.0,2016.0,0.8617006,0.595357,0.3578778,0.6899295,0.1378532,0.1759326,0.04076422,0.0,1.0,0.0
75%,54904.0,93078.0,2020.0,1.271406,1.064076,0.442538,1.501526,0.3491689,0.8036211,0.2901872,1.0,1.0,1.0
max,73378.0,118814.0,2023.0,2.260692,3.153356,0.9479168,253.1716,1.736715,116.2207,49.68227,1.0,1.0,1.0


# Saving

In [27]:
# Columns that should come first in the saved file (add more here as needed).
column_order = [
    'year',
    'country',
    'country_id',
    'country_eci',
    'state_act_id',
    'intervention_id',
    'protectionist',
    'liberal',
]

# Put the leading columns first, followed by every other column; the others come
# in the sorted order returned by Index.difference.
remaining_columns = df.columns.difference(column_order).tolist()
df = df[column_order + remaining_columns]

In [28]:
# Write the merged dataset to '<output_dir>/data/oec_gta_dataset.csv' without
# the index. The target folder is created first so the export does not fail
# when it does not exist yet.
# NOTE(review): the '_merge' indicator column is still part of df at this point
# (the cell that drops it is commented out); confirm it should be exported.
final_data_dir = output_dir + '/data'
os.makedirs(final_data_dir, exist_ok=True)
df.to_csv(final_data_dir + '/oec_gta_dataset.csv', index = False)