In [None]:
# This notebook merges the Relatedness, PCI and ECI datasets from the OEC into a single dataset
# Source: https://oec.world/en/rankings/pci/hs4/hs92 and https://app-nutria.oec.world/api/stats/relatedness?cube=trade_i_baci_a_92&filter_Country=aschn&Year=2021&measures=Trade+Value&parents=true&rca=Exporter+Country,HS4,Trade+Value&alias=Country,HS4&locale=en

# Loading packages

In [None]:
import pandas as pd
import os
from google.colab import drive

# Setting up the environment

In [None]:
# Mount Google Drive at /content/gdrive so the Drive folders used by this notebook are reachable.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Every input/output folder lives under one Drive root; derive them from it so
# relocating the project means editing a single line.
oec_data_dir = '/content/gdrive/My Drive/ip_complexity/oec_data'

# Folder of this notebook and the folder its results are written to.
work_dir   = f'{oec_data_dir}/create_oec_dataset'
output_dir = f'{work_dir}/output'

# Outputs of the upstream notebooks that this notebook reads.
download_relatedness_data_dir = f'{oec_data_dir}/downloads_relatedness_data/output/data'
create_dataset_complexity_dir = f'{oec_data_dir}/create_dataset_complexity/output/data'
create_dataset_eci_dir        = f'{oec_data_dir}/create_dataset_eci/output/data'

# Opening the Relatedness dataset

In [None]:
# Build the full path of every entry in the relatedness download folder.
# os.listdir() returns names in arbitrary order, so sort them to make the row
# order of the concatenated frame reproducible between runs.
relatedness_paths = sorted(
    os.path.join(download_relatedness_data_dir, entry)
    for entry in os.listdir(download_relatedness_data_dir)
)

# Read each file; sub-directories (e.g. checkpoint folders) are skipped because
# pd.read_csv cannot open them.
dfs = [pd.read_csv(path) for path in relatedness_paths if os.path.isfile(path)]

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the folder holds no files.
if not dfs:
    raise ValueError("No relatedness files found in " + download_relatedness_data_dir)

# Concatenate the dataframes into a single dataframe
df_related = pd.concat(dfs, ignore_index=True) # relatedness by product p at year j and country c


In [None]:
# Show the row count, column dtypes and memory usage of the concatenated relatedness data.
df_related.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4865634 entries, 0 to 4865633
Data columns (total 14 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Country ID               object 
 1   HS4 ID                   float64
 2   Trade Value Relatedness  float64
 3   Trade Value              float64
 4   Continent                object 
 5   Country                  object 
 6   Continent ID             object 
 7   Section                  object 
 8   HS2                      object 
 9   HS4                      object 
 10  Section ID               float64
 11  HS2 ID                   float64
 12  Trade Value RCA          float64
 13  year                     object 
dtypes: float64(6), object(8)
memory usage: 519.7+ MB


In [None]:
### Cleaning

In [None]:
# Columns kept from the raw relatedness export; every other column is discarded.
relatedness_columns = [
    "Country ID",
    "Country",
    "year",
    "HS4 ID",
    "Trade Value Relatedness",
    "Trade Value RCA",
    "Trade Value",
    "HS4",
]
df_related = df_related[relatedness_columns]

In [None]:
# Map the original export headers to the snake_case names used in the rest of the notebook.
column_renames = {
    "Country ID": "country_id",
    "Country": "country",
    "HS4 ID": "hs4_id",
    "Trade Value Relatedness": "tv_relatedness",
    "Trade Value RCA": "tv_rca",
    "HS4": "product_name_hs4",
    "Trade Value": "trade_value",
}
df_related = df_related.rename(columns=column_renames)

# Opening the Complexity dataset

In [None]:
# Load the product-complexity (PCI) table.
pci_csv_path = create_dataset_complexity_dir + "/complexity_dataset.csv"
df_pci = pd.read_csv(pci_csv_path)

# Merging Relatedness with PCI

In [None]:
# Left-join the PCI table onto the relatedness rows by product name and year.
# indicator=True adds a "_merge" column recording where each row came from.
df = df_related.merge(
    df_pci,
    how="left",
    on=['product_name_hs4', 'year'],
    indicator=True,
)

In [None]:
# Count rows by merge origin. _merge == 'left_only' means that product p at year j had no PCI
# (presumably because the product was not yet commercialized that year; unverified).
df["_merge"].value_counts()

both          3882912
left_only      982722
right_only          0
Name: _merge, dtype: int64

In [None]:
# Keep only the rows that the merge indicator flags as present on both sides.
matched_pci = df["_merge"].eq("both")
df = df.loc[matched_pci]

In [None]:
# Discard the merge flag and the left-hand (relatedness) copy of the id,
# then promote the right-hand (PCI) copy to the plain "hs4_id" name.
df = (
    df
    .drop(columns=["_merge", "hs4_id_x"])
    .rename(columns={"hs4_id_y": "hs4_id"})
)

# Opening the ECI dataset

In [None]:
# Load the ECI table (keyed by country and year in the merge that follows).
eci_csv_path = create_dataset_eci_dir + "/eci_dataset.csv"
df_eci_cj = pd.read_csv(eci_csv_path)

# Merging with ECI

In [None]:
# Left-join the ECI table by country name, country id and year.
# indicator=True adds a "_merge" column recording where each row came from.
df = df.merge(
    df_eci_cj,
    how="left",
    on=['country', 'country_id', 'year'],
    indicator=True,
)

In [None]:
# Count rows by merge origin. _merge == 'left_only' means that country c at year j had no ECI data.
df["_merge"].value_counts()

both          3462360
left_only      420552
right_only          0
Name: _merge, dtype: int64

In [None]:
# Years of the rows that found no ECI match. Expected: only years before 1998,
# for which the OEC has no ECI data.
unmatched_years = df.loc[df["_merge"] == "left_only", "year"]
unmatched_years.value_counts()

1995    140184
1997    140184
1996    140184
Name: year, dtype: int64

In [None]:
# Remove the merge indicator; rows without an ECI match are kept.
df = df.drop(columns = "_merge")

In [None]:
# Normalise hs4_id: cast to integer, render as text, then left-pad with zeros
# so that every code is a string of at least 4 characters.
hs4_codes = df["hs4_id"].astype(int).astype(str)
df["hs4_id"] = hs4_codes.str.zfill(4)

In [None]:
# Give the product-name column its final name, matching the hs4_id naming.
df = df.rename(columns = {"product_name_hs4": "hs4_name"})

In [None]:
# Final column layout of the dataset; any column not listed here is dropped.
new_order = [
    'year',
    'country_id',
    'country',
    'eci',
    'hs4_id',
    'hs4_name',
    'pci',
    'tv_relatedness',
    'tv_rca',
]
df = df.loc[:, new_order]

# Saving

In [None]:
# Write the merged dataset as a semicolon-separated CSV. The DataFrame index is
# written as well (pandas default), as an unnamed first column.
dataset_dir = output_dir + "/data"
# to_csv does not create missing folders, so make sure the target folder exists.
os.makedirs(dataset_dir, exist_ok=True)
df.to_csv(dataset_dir + "/dataset_oec.csv", sep = ";")

In [None]:
# Show the row count, column dtypes and memory usage of the final dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3882912 entries, 0 to 3882911
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   year            object 
 1   country_id      object 
 2   country         object 
 3   eci             float64
 4   hs4_id          object 
 5   hs4_name        object 
 6   pci             float64
 7   tv_relatedness  float64
 8   tv_rca          float64
dtypes: float64(4), object(5)
memory usage: 296.2+ MB
