- This program cleans Global Trade Alert data, making it ready to merge with OEC data by product and year

# Todo

# Loading packages

In [1]:
import pandas as pd
from google.colab import drive

# Setting the ambience

In [2]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
work_dir    = '/content/gdrive/My Drive/ip_complexity/create_dataset_gta'
output_dir  = '/content/gdrive/My Drive/ip_complexity/create_dataset_gta/output'

# Oppening datasets

In [4]:

chunksize = 1000  # Set the desired chunk size

# Create an empty DataFrame to store the results
df_gta = pd.DataFrame()

# Iterate over the chunks of data

# Read the dataset in chunks
reader = pd.read_csv(work_dir + '/input/GTA_all_data.csv', sep = ";", chunksize = chunksize)

for chunk in reader:
    chunk["Affected Products"] = chunk["Affected Products"].astype(str)
    df_affected_products = chunk["Affected Products"].str.split(',', expand=True)

    # Get the existing column names
    existing_columns = df_affected_products.columns

    # Create a new list for modified column names
    modified_columns = ["affected_product_" + str(column) for column in existing_columns]

    # Update the column names in the chunk
    df_affected_products.columns = modified_columns

    # Merge the chunk with the affected products DataFrame
    chunk = pd.merge(chunk, df_affected_products, left_index=True, right_index=True, how="left", indicator=True)

    # Rename the "Affected Products" column
    chunk = chunk.rename(columns={"Affected Products": "affected_product_00"})

    # Replace commas with NaN
    chunk["affected_product_00"] = chunk["affected_product_00"].replace(',', pd.NA, regex=True)

    # Convert from wide to long format
    chunk = pd.melt(chunk, id_vars=['State Act ID', 'Affected Jurisdiction', 'Intervention ID', 'State Act Title', 'Announcement Date', 'GTA Evaluation', 'Currently in force', 'Implementing Jurisdiction', 'Intervention Type', 'MAST chapter', 'Affected Sectors'],
                    value_vars=chunk.columns[chunk.columns.str.contains('affected_product')],
                    var_name='product_variable', value_name='affected_product')

    # Drop rows with NaN values in the "affected_product" column
    chunk = chunk.dropna(subset=['affected_product'])

    # concat the processed chunk to the final DataFrame
    df_gta = pd.concat([df_gta, chunk])

# Sort the final DataFrame
df_gta = df_gta.sort_values(by=["State Act ID", "affected_product"])


In [6]:
df_gta = df_gta.drop("product_variable", axis = 1)

In [7]:
#df_gta['affected_product'] = pd.to_numeric(df_gta['affected_product']) 

In [8]:
df_gta["affected_product"] = df_gta["affected_product"].str.strip() # removes leading and ending spaces

In [9]:
df_gta = df_gta.drop_duplicates()

# Saving

In [10]:
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26253307 entries, 2001 to 574
Data columns (total 12 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   State Act ID               int64 
 1   Affected Jurisdiction      object
 2   Intervention ID            int64 
 3   State Act Title            object
 4   Announcement Date          object
 5   GTA Evaluation             object
 6   Currently in force         object
 7   Implementing Jurisdiction  object
 8   Intervention Type          object
 9   MAST chapter               object
 10  Affected Sectors           object
 11  affected_product           object
dtypes: int64(2), object(10)
memory usage: 2.5+ GB


In [11]:
df_gta.to_csv(output_dir + "/data/dataset_gta_intervention_product_affectedcountry.csv", sep = ";", index = False)

In [12]:
df_gta = df_gta.drop(columns = "Affected Jurisdiction")

In [13]:
df_gta = df_gta.drop_duplicates()

In [14]:
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6574565 entries, 2001 to 570
Data columns (total 11 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   State Act ID               int64 
 1   Intervention ID            int64 
 2   State Act Title            object
 3   Announcement Date          object
 4   GTA Evaluation             object
 5   Currently in force         object
 6   Implementing Jurisdiction  object
 7   Intervention Type          object
 8   MAST chapter               object
 9   Affected Sectors           object
 10  affected_product           object
dtypes: int64(2), object(9)
memory usage: 601.9+ MB


In [15]:
df_gta.to_csv(output_dir + "/data/dataset_gta_intervention_product.csv", sep = ";", index = False)