# Extract Additional Data from EJAtlas YAML Files

As per Monalisa's request, this notebook extracts the following information from EJAtlas YAML files:

**From "Conflicts and Mobilization" section:**
- Conflict intensity
- Reaction stage

**From "Project details and actors" section:**
- Relevant government actors
- Company names or state enterprises (with expanded country information)

In [9]:
# Import required libraries
import pandas as pd
import yaml
from glob import glob
from pathlib import Path

In [10]:
# Get all YAML files
yaml_files = glob("../yaml/*.yaml")
print(f"Found {len(yaml_files)} YAML files to process")

Found 3838 YAML files to process


In [11]:
def process_yamls(filepath):
    """
    Process a single YAML file and extract the required information.
    Handles None values and missing data gracefully.
    
    Returns:
        tuple: (df_conflict_intensity, df_reaction_stage, df_relevant_government, df_company_names)
    """
    d = yaml.load(open(filepath), Loader=yaml.FullLoader)
    slug = d['presentation']['slug']
    conflict_intensity = d['mobilization']['intensity']
    reaction_stage = d['mobilization']['reaction_stage']
    relevant_government = d['details_and_actors']['govt_actors']
    company_names = d['details_and_actors']['companies']

    # Handle conflict_intensity - could be None or dict
    if conflict_intensity:
        df_conflict_intensity = pd.DataFrame.from_dict(conflict_intensity, orient='index').T
        df_conflict_intensity['slug'] = slug
    else:
        df_conflict_intensity = pd.DataFrame({'slug': [slug]})

    # Handle reaction_stage - could be None or dict
    if reaction_stage:
        df_reaction_stage = pd.DataFrame.from_dict(reaction_stage, orient='index').T
        df_reaction_stage['slug'] = slug
    else:
        df_reaction_stage = pd.DataFrame({'slug': [slug]})

    # Handle relevant_government - create dataframe
    df_relevant_government = pd.DataFrame({
        'slug': [slug],
        'relevant_government': [relevant_government]
    })
    
    # Handle company_names - could be None, empty list, or list of dicts
    if company_names:  # If there are companies
        df_company_names = pd.DataFrame(company_names)
        df_company_names['slug'] = slug

        # Check if 'country' column exists and has data
        if 'country' in df_company_names.columns and not df_company_names['country'].isna().all():
            # Expand the country column (which contains dicts) into separate columns
            country_expanded = pd.json_normalize(df_company_names['country'])
            country_expanded.columns = ['country_' + col for col in country_expanded.columns]

            # Replace the original country column with the expanded columns
            df_company_names_expanded = pd.concat([
                df_company_names.drop('country', axis=1), 
                country_expanded
            ], axis=1)
        else:
            # If no country column or all country values are NaN, just return the original dataframe
            df_company_names_expanded = df_company_names
    else:
        # If no companies, create an empty dataframe with just the slug
        df_company_names_expanded = pd.DataFrame({'slug': [slug]})

    return df_conflict_intensity, df_reaction_stage, df_relevant_government, df_company_names_expanded

In [12]:
# Process all YAML files
print(f"Processing {len(yaml_files)} YAML files...")

df_conflict_intensity_list = []
df_reaction_stage_list = []
df_relevant_government_list = []
df_company_names_list = []

successful_files = 0
failed_files = 0

for i, f in enumerate(yaml_files):
    if i % 100 == 0:
        print(f"Processing file {i+1}/{len(yaml_files)}")
    
    try:
        df_conflict_intensity, df_reaction_stage, df_relevant_government, df_company_names = process_yamls(f)
        df_conflict_intensity_list.append(df_conflict_intensity)
        df_reaction_stage_list.append(df_reaction_stage)
        df_relevant_government_list.append(df_relevant_government)
        df_company_names_list.append(df_company_names)
        successful_files += 1
    except Exception as e:
        print(f"✗ Error processing {f}: {e}")
        failed_files += 1

print(f"\nProcessing complete!")
print(f"Successfully processed: {successful_files} files")
print(f"Failed to process: {failed_files} files")

Processing 3838 YAML files...
Processing file 1/3838
Processing file 101/3838
Processing file 201/3838
Processing file 301/3838
Processing file 401/3838
Processing file 501/3838
Processing file 601/3838
Processing file 701/3838
Processing file 801/3838
Processing file 901/3838
Processing file 1001/3838
Processing file 1101/3838
Processing file 1201/3838
Processing file 1301/3838
Processing file 1401/3838
Processing file 1501/3838
Processing file 1601/3838
Processing file 1701/3838
Processing file 1801/3838
Processing file 1901/3838
Processing file 2001/3838
Processing file 2101/3838
Processing file 2201/3838
Processing file 2301/3838
Processing file 2401/3838
Processing file 2501/3838
Processing file 2601/3838
Processing file 2701/3838
Processing file 2801/3838
Processing file 2901/3838
Processing file 3001/3838
Processing file 3101/3838
Processing file 3201/3838
Processing file 3301/3838
Processing file 3401/3838
Processing file 3501/3838
Processing file 3601/3838
Processing file 3701

In [13]:
# Combine all individual DataFrames into final datasets
if df_conflict_intensity_list:
    final_conflict_intensity = pd.concat(df_conflict_intensity_list, ignore_index=True)
    final_reaction_stage = pd.concat(df_reaction_stage_list, ignore_index=True)
    final_relevant_government = pd.concat(df_relevant_government_list, ignore_index=True)
    final_company_names = pd.concat(df_company_names_list, ignore_index=True)

    print(f"Final dataset shapes:")
    print(f"Conflict intensity: {final_conflict_intensity.shape}")
    print(f"Reaction stage: {final_reaction_stage.shape}")
    print(f"Relevant government: {final_relevant_government.shape}")
    print(f"Company names: {final_company_names.shape}")
else:
    print("No files were successfully processed!")

Final dataset shapes:
Conflict intensity: (3838, 4)
Reaction stage: (3838, 4)
Relevant government: (3838, 2)
Company names: (9557, 11)


## Preview the Results

In [14]:
# Preview conflict intensity data
print("Conflict Intensity Data:")
print(f"\nColumns: {list(final_conflict_intensity.columns)}")
final_conflict_intensity.head()

Conflict Intensity Data:

Columns: ['color', 'id', 'name', 'slug']


Unnamed: 0,color,id,name,slug
0,#ff7f50,4,"MEDIUM (street protests, visible mobilization)",calcatreu-rio-negro-argentina
1,#049cdb,2,LATENT (no visible organising at the moment),water-contamination-by-rahim-yar-khan-sugar-mill
2,#ff7f50,4,"MEDIUM (street protests, visible mobilization)",oak-flat-usa
3,#ff7f50,4,"MEDIUM (street protests, visible mobilization)",new-airport-on-tioman-island
4,#ff7f50,4,"MEDIUM (street protests, visible mobilization)",moroccan-business-grabs-10000ha-for-intensive-...


In [15]:
# Preview reaction stage data
print("Reaction Stage Data:")
print(f"\nColumns: {list(final_reaction_stage.columns)}")
final_reaction_stage.head()

Reaction Stage Data:

Columns: ['color', 'id', 'name', 'slug']


Unnamed: 0,color,id,name,slug
0,#46a546,3,PREVENTIVE resistance (precautionary phase),calcatreu-rio-negro-argentina
1,#7a43b6,5,Mobilization for reparations once impacts have...,water-contamination-by-rahim-yar-khan-sugar-mill
2,#46a546,3,PREVENTIVE resistance (precautionary phase),oak-flat-usa
3,#46a546,3,PREVENTIVE resistance (precautionary phase),new-airport-on-tioman-island
4,#ee2c2c,4,In REACTION to the implementation (during cons...,moroccan-business-grabs-10000ha-for-intensive-...


In [16]:
# Preview relevant government data
print("Relevant Government Data:")
print(f"\nColumns: {list(final_relevant_government.columns)}")
final_relevant_government.head(10)

Relevant Government Data:

Columns: ['slug', 'relevant_government']


Unnamed: 0,slug,relevant_government
0,calcatreu-rio-negro-argentina,Gobierno de Río Negro.\r\nIntendencia de Ingen...
1,water-contamination-by-rahim-yar-khan-sugar-mill,
2,oak-flat-usa,USDA Forest Service\r\nCooperating Agencies: A...
3,new-airport-on-tioman-island,Government of Malaysia\nPahang Darul Makmur\nD...
4,moroccan-business-grabs-10000ha-for-intensive-...,Agence de promotion des investissements (Apix)...
5,jose-claudio-maria,Courts of Justice\nICREA (land reform institute)
6,pesticides-contaminate-drinkable-water-in-bret...,"ANSES : French Agency for Food, Environmental ..."
7,hebron-a-city-flooded-with-sewage,Hebron Governorate\nIsraeli Defense Forces
8,copper-mining-on-the-asana-river-peru,"Instituto Nacional de Desarrollo (INADE), Inst..."
9,jiaokou-feimei-aluminum-industry-co-ltd-lvlian...,-Huilong Township Government\r\n-Jiaokou Count...


In [17]:
# Preview company names data
print("Company Names Data:")
print(f"\nColumns: {list(final_company_names.columns)}")
final_company_names.head()

Company Names Data:

Columns: ['acronym', 'description', 'involvement', 'local_names', 'logo_image', 'name', 'other_products', 'slug', 'url', 'country_name', 'country_slug']


Unnamed: 0,acronym,description,involvement,local_names,logo_image,name,other_products,slug,url,country_name,country_slug
0,,,,,,Aquiline Resources,,calcatreu-rio-negro-argentina,,Canada,canada
1,,,,,,Patagonia Gold,,calcatreu-rio-negro-argentina,http://www.patagoniagold.com/,United Kingdom,united-kingdom
2,RYK,RYK Group is a leading agribusiness and renewa...,,,,Rahim Yar Khan (RYK) Sugar Mill,,water-contamination-by-rahim-yar-khan-sugar-mill,http://rykmills.com.pk/about/,Pakistan,pakistan
3,,,,,,Resolution Copper Co.,,oak-flat-usa,www.resolutioncopper.com,United States of America,united-states-of-america
4,Rio Tinto,Rio Tinto Group is an Anglo-Australian multina...,,,,Rio Tinto,,oak-flat-usa,http://www.riotinto.com/,United Kingdom,united-kingdom


In [19]:
final_conflict_intensity.slug.nunique(), final_reaction_stage.slug.nunique(), final_relevant_government.slug.nunique(), final_company_names.slug.nunique(),

(3838, 3838, 3838, 3838)

In [22]:
final_conflict_intensity.slug = final_conflict_intensity.slug.str.strip()
final_reaction_stage.slug = final_reaction_stage.slug.str.strip()
final_relevant_government.slug = final_relevant_government.slug.str.strip()
final_company_names.slug = final_company_names.slug.str.strip()
final_conflict_intensity.slug.nunique(), final_reaction_stage.slug.nunique(), final_relevant_government.slug.nunique(), final_company_names.slug.nunique(),

(3838, 3838, 3838, 3838)

In [23]:
ds1 = pd.read_csv('final/dataset_1-gold_labels.csv')
ds2 = pd.read_csv('final/dataset_2.csv')
ds3 = pd.read_csv('final/dataset_3.csv')

all_ds = pd.concat([ds1, ds2, ds3], ignore_index=True)
all_ds

Unnamed: 0,label,text,slug,annotator_1,annotator_2,annotator_3,annotator_ai,label_majority_vote
0,SIM,The 5 No. Habilashdweep Union (lowest administ...,groundwater-lifting-by-industries-leading-to-w...,,,,,
1,NÃO,Cassandra mines operate since 6th BC century a...,gold-mining-in-chalkidiki-greece,,,,,
2,NÃO,Jambi-2 power plant is a proposed mine-mouth c...,proposed-jambi-2-coal-fired-power-plant-jambi-...,,,,,
3,NÃO,"Relocation from mining sites, an influx of peo...",koidu-diamond-mining-conflict-sierra-leone,,,,,
4,NÃO,The inhabitants of Catacaos denounce that they...,comunidad-de-catacaos-vs-sodalicio,,,,,
...,...,...,...,...,...,...,...,...
3328,,The Minas-Rio Project is a big business owned ...,impacts-of-iron-ore-mining-in-conceicao-do-mat...,NÃO,SIM,NÃO TENHO CERTEZA,NÃO,NÃO
3329,,Cirali Coast consists of a 90 ha area. On 27.0...,leasing-of-the-cirali-coast-turkey,SIM,NÃO,NÃO TENHO CERTEZA,NÃO,NÃO
3330,,The construction of 120 MW wind energy and 380...,unlawful-removal-of-olive-trees-and-wind-energ...,SIM,NÃO,NÃO TENHO CERTEZA,NÃO,NÃO
3331,,The proposed Tanintharyi National Park is part...,tanintharyi-national-park-proposed-threatens-i...,NÃO,SIM,NÃO TENHO CERTEZA,NÃO,NÃO


In [24]:
all_slugs = set(all_ds['slug'].unique())
print(f"Total unique slugs in all datasets: {len(all_slugs)}")

Total unique slugs in all datasets: 3267


In [25]:
final_conflict_intensity_new = final_conflict_intensity[final_conflict_intensity['slug'].isin(all_slugs)]
print(f"Conflict intensity data for slugs in all datasets: {final_conflict_intensity_new.shape} (from {final_conflict_intensity.shape})")
print(f"Number of unique slugs retained: {final_conflict_intensity_new['slug'].nunique()}")

final_reaction_stage_new = final_reaction_stage[final_reaction_stage['slug'].isin(all_slugs)]
print(f"Reaction stage data for slugs in all datasets: {final_reaction_stage_new.shape} (from {final_reaction_stage.shape})")
print(f"Number of unique slugs retained: {final_reaction_stage_new['slug'].nunique()}")

final_relevant_government_new = final_relevant_government[final_relevant_government['slug'].isin(all_slugs)]
print(f"Relevant government data for slugs in all datasets: {final_relevant_government_new.shape} (from {final_relevant_government.shape})")
print(f"Number of unique slugs retained: {final_relevant_government_new['slug'].nunique()}")

final_company_names_new = final_company_names[final_company_names['slug'].isin(all_slugs)]
print(f"Company names data for slugs in all datasets: {final_company_names_new.shape} (from {final_company_names.shape})")
print(f"Number of unique slugs retained: {final_company_names_new['slug'].nunique()}")


Conflict intensity data for slugs in all datasets: (3266, 4) (from (3838, 4))
Number of unique slugs retained: 3266
Reaction stage data for slugs in all datasets: (3266, 4) (from (3838, 4))
Number of unique slugs retained: 3266
Relevant government data for slugs in all datasets: (3266, 2) (from (3838, 2))
Number of unique slugs retained: 3266
Company names data for slugs in all datasets: (8144, 11) (from (9557, 11))
Number of unique slugs retained: 3266


## Save the Results

In [26]:
final_conflict_intensity_new.to_csv('raw/conflict_intensity.csv', index=False)
final_reaction_stage_new.to_csv('raw/reaction_stage.csv', index=False)
final_relevant_government_new.to_csv('raw/relevant_government.csv', index=False)
final_company_names_new.to_csv('raw/company_names.csv', index=False)



In [27]:
final_conflict_intensity_new.to_excel('raw/conflict_intensity.xlsx', index=False)
final_reaction_stage_new.to_excel('raw/reaction_stage.xlsx', index=False)
final_relevant_government_new.to_excel('raw/relevant_government.xlsx', index=False)
final_company_names_new.to_excel('raw/company_names.xlsx', index=False)


In [28]:
final_conflict_intensity_new.to_json('raw/conflict_intensity.json', orient='records', lines=True)
final_reaction_stage_new.to_json('raw/reaction_stage.json', orient='records', lines=True)
final_relevant_government_new.to_json('raw/relevant_government.json', orient='records', lines=True)
final_company_names_new.to_json('raw/company_names.json', orient='records', lines=True)


In [29]:
final_conflict_intensity_new = pd.read_csv('raw/conflict_intensity.csv')
final_reaction_stage_new = pd.read_csv('raw/reaction_stage.csv')
final_relevant_government_new = pd.read_csv('raw/relevant_government.csv')
final_company_names_new = pd.read_csv('raw/company_names.csv')


In [30]:
final_conflict_intensity_new.slug.nunique(), final_reaction_stage_new.slug.nunique(), final_relevant_government_new.slug.nunique(), final_company_names_new.slug.nunique(),

(3266, 3266, 3266, 3266)