- This program cleans Global Trade Alert data, making it ready to merge with OEC data by product and year

# Todo

# Loading packages

In [30]:
import pandas as pd
from google.colab import drive

# Setting up the environment

In [31]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# project folders referenced by the path settings are reachable.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [32]:
# Root folder of this pipeline step on the mounted Drive.
work_dir = '/content/gdrive/My Drive/ip_complexity/create_dataset_gta'
# Output folder, located directly under the step's root folder.
output_dir = work_dir + '/output'

# Opening datasets

In [33]:

chunksize = 1000  # rows parsed per chunk

# NOTE(review): nrows caps the read at the first 10,000 rows of the file.
# The value is kept unchanged here; confirm whether the full file should be
# processed before using the output downstream.
max_rows = 10000

# Columns carried unchanged from the raw file into the long-format result.
id_columns = ['State Act ID', 'Affected Jurisdiction', 'Intervention ID', 'State Act Title',
              'Announcement Date', 'GTA Evaluation', 'Currently in force',
              'Implementing Jurisdiction', 'Intervention Type', 'MAST chapter', 'Affected Sectors']

# Read the dataset in chunks; dtype=str keeps every field as text.
reader = pd.read_csv(work_dir + '/input/GTA_all_data.csv', sep=";", dtype=str,
                     nrows=max_rows, chunksize=chunksize)

# Processed chunks are collected here and concatenated once after the loop.
# Concatenating onto a growing DataFrame inside the loop copies all previously
# accumulated rows on every iteration.
processed_chunks = []

for chunk in reader:
    # One column per comma-separated entry of "Affected Products",
    # named affected_product_0, affected_product_1, ...
    df_affected_products = chunk["Affected Products"].str.split(',', expand=True)
    df_affected_products.columns = ["affected_product_" + str(column)
                                    for column in df_affected_products.columns]

    # Attach the split columns to the chunk by row index.
    chunk = pd.merge(chunk, df_affected_products, left_index=True, right_index=True, how="left")

    # Keep the raw product string as one more candidate column for the melt.
    chunk = chunk.rename(columns={"Affected Products": "affected_product_00"})

    # Blank out raw strings that list several products: with regex=True and a
    # non-string replacement, a cell containing a comma is replaced as a whole,
    # so this column only retains single-product entries.
    chunk["affected_product_00"] = chunk["affected_product_00"].replace(',', pd.NA, regex=True)

    # Convert from wide to long format: one row per record and product column.
    chunk = pd.melt(chunk, id_vars=id_columns,
                    value_vars=chunk.columns[chunk.columns.str.contains('affected_product')],
                    var_name='product_variable', value_name='affected_product')

    # Drop the padding cells created by the split (records with fewer products).
    chunk = chunk.dropna(subset=['affected_product'])

    processed_chunks.append(chunk)

# Single concatenation; ignore_index gives the result a clean, unique row index.
df_gta = pd.concat(processed_chunks, ignore_index=True)

# Strip surrounding whitespace before sorting so the order follows the product
# code itself rather than any blanks left around the comma separators.
df_gta["affected_product"] = df_gta["affected_product"].str.strip()

# Sort the final DataFrame
df_gta = df_gta.sort_values(by=["State Act ID", "affected_product"])


In [34]:
# The melt's source-column label is not needed once the data is in long format.
df_gta = df_gta.drop(columns=["product_variable"])

In [35]:
#df_gta['affected_product'] = pd.to_numeric(df_gta['affected_product']) 

In [36]:
# Trim leading and trailing whitespace from every product code.
df_gta = df_gta.assign(affected_product=df_gta["affected_product"].str.strip())

In [37]:
# Keep a single copy of rows that are identical across all columns.
df_gta = df_gta.drop_duplicates()

# Saving

In [38]:
# Print column names, non-null counts and dtypes as a check before saving.
df_gta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127156 entries, 2061 to 43
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   State Act ID               127156 non-null  object
 1   Affected Jurisdiction      126912 non-null  object
 2   Intervention ID            127156 non-null  object
 3   State Act Title            127156 non-null  object
 4   Announcement Date          127156 non-null  object
 5   GTA Evaluation             127156 non-null  object
 6   Currently in force         127156 non-null  object
 7   Implementing Jurisdiction  127156 non-null  object
 8   Intervention Type          127156 non-null  object
 9   MAST chapter               127156 non-null  object
 10  Affected Sectors           127156 non-null  object
 11  affected_product           127156 non-null  object
dtypes: object(12)
memory usage: 12.6+ MB


In [39]:
# Write the dataset that still includes the affected jurisdiction column.
affected_country_csv = output_dir + "/data/dataset_gta_intervention_product_affectedcountry.csv"
df_gta.to_csv(affected_country_csv, sep=";", index=False)

KeyboardInterrupt: ignored

In [None]:
# Remove the affected jurisdiction column from the dataset.
df_gta = df_gta.drop(columns=["Affected Jurisdiction"])

In [None]:
# Keep a single copy of rows that are identical across all remaining columns.
df_gta = df_gta.drop_duplicates()

In [None]:
# Print column names, non-null counts and dtypes as a check before saving.
df_gta.info()

In [None]:
# Write the dataset without the affected jurisdiction column.
product_csv = output_dir + "/data/dataset_gta_intervention_product.csv"
df_gta.to_csv(product_csv, sep=";", index=False)