In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef

Process files:

In [2]:
import pandas as pd
import json

# Function to load data from a JSON file
def load_data(filename):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever object ``json.load`` produces for the file's content.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# Function to build the document DataFrame and flatten the span annotations
def preprocess_data(data):
    """Build the document-level frame and a flattened span-annotation frame.

    Args:
        data: Parsed JSON content, normally a list of dicts. Entries that are
            flattened must carry 'id', 'text', 'category' and a list-valued
            'annotations' key.

    Returns:
        A ``(df, annotations_expanded)`` tuple. ``df`` is ``pd.DataFrame(data)``.
        ``annotations_expanded`` holds one row per annotation dict plus the
        parent entry's 'id', 'text' and 'category' under the 'parent_' prefix;
        it is an empty DataFrame when nothing could be flattened.
    """
    # Convert the data to a DataFrame first
    df = pd.DataFrame(data)

    # Returned unchanged whenever no annotation can be flattened
    annotations_expanded = pd.DataFrame()

    # Nothing to flatten when the column is absent or entirely null
    if 'annotations' not in df.columns or df['annotations'].isnull().all():
        print("No annotations found or annotations are malformed in some entries.")
        return df, annotations_expanded

    if isinstance(data, list):
        # json_normalize raises KeyError as soon as ONE entry lacks the
        # record_path key, which used to discard the annotations of every
        # entry. Keep only the entries that really carry a list of annotations.
        records = [
            entry for entry in data
            if isinstance(entry, dict) and isinstance(entry.get('annotations'), list)
        ]
        skipped = len(data) - len(records)
        if skipped:
            print(f"Skipping {skipped} entries without a list of annotations.")
        if not records:
            return df, annotations_expanded
    else:
        # Non-list input is passed through untouched, as before
        records = data

    try:
        # The 'parent_' prefix keeps the entry-level 'category' from clashing
        # with the annotation-level 'category'
        annotations_expanded = pd.json_normalize(
            records,
            record_path='annotations',
            meta=['id', 'text', 'category'],
            meta_prefix='parent_'
        )
    except Exception as e:
        # Deliberate best-effort: report the problem and return an empty frame
        print(f"Error normalizing annotations: {e}")

    return df, annotations_expanded


# Main processing function
def process_train_files(train_filename, test_filename):
    """Load two dataset files and preprocess each of them.

    Both files are read with ``load_data`` before either one is handed to
    ``preprocess_data``.

    Args:
        train_filename: Path of the first JSON dataset file.
        test_filename: Path of the second JSON dataset file.

    Returns:
        ``(train_df, train_annotations, test_df, test_annotations)``: the two
        frames returned by ``preprocess_data`` for the first file, followed by
        the two frames for the second file.
    """
    # Read everything first, then preprocess in the same order
    raw_datasets = [load_data(path) for path in (train_filename, test_filename)]
    (train_df, train_annotations), (test_df, test_annotations) = [
        preprocess_data(raw) for raw in raw_datasets
    ]

    return train_df, train_annotations, test_df, test_annotations

# Example usage: the Spanish training file is passed first, the English one second
(
    train_es_df,
    train_es_annotations,
    train_en_df,
    train_en_annotations,
) = process_train_files(
    '../Dataset-Oppositional/training/dataset_es_train.json',
    '../Dataset-Oppositional/training/dataset_en_train.json',
)

     id                                               text    category  \
0  2807  Fallo en Matrix 08/02/2022 Hoy el señor Joan R...    CRITICAL   
1  3054  Siento ya tdas las vacunas vienen contaminadas...    CRITICAL   
2   268  Veo que curiosamente te autoproclamados interl...  CONSPIRACY   
3  2669  [ Documental ] Vacunas : Una inyección en la o...    CRITICAL   
4  3205  Una sugerencia para los que se han vacunado y ...  CONSPIRACY   

                                         annotations  \
0  [{'span_text': 'el señor Joan Ramón Laporte Ro...   
1  [{'span_text': 'mi sobrina', 'category': 'VICT...   
2  [{'span_text': 'todo el grupo', 'category': 'C...   
3  [{'span_text': '[ Documental ] Vacunas : Una i...   
4  [{'span_text': 'los que se han vacunado y no q...   

                                        spacy_tokens  
0  WyJGYWxsbyIsICJlbiIsICJNYXRyaXgiLCAiMDgvMDIvMj...  
1  WyJTaWVudG8iLCAieWEiLCAidGRhcyIsICJsYXMiLCAidm...  
2  WyJWZW8iLCAicXVlIiwgImN1cmlvc2FtZW50ZSIsICJ0ZS... 

Unnamed: 0,span_text,category,annotator,start_char,end_char,start_spacy_token,end_spacy_token,parent_id,parent_text,parent_category
0,"el señor Joan Ramón Laporte Roselló , catedrát...",CAMPAIGNER,gold_label,31,109,5,16,2807,Fallo en Matrix 08/02/2022 Hoy el señor Joan R...,CRITICAL
1,Joan Ramón Laporte,CAMPAIGNER,gold_label,40,58,7,10,2807,Fallo en Matrix 08/02/2022 Hoy el señor Joan R...,CRITICAL
2,el Gobierno de España ( PSOE y Podemos ),AGENT,gold_label,178,218,29,38,2807,Fallo en Matrix 08/02/2022 Hoy el señor Joan R...,CRITICAL
3,la mayoría de la población española,VICTIM,gold_label,529,564,95,101,2807,Fallo en Matrix 08/02/2022 Hoy el señor Joan R...,CRITICAL
4,mi sobrina,VICTIM,gold_label,47,57,7,9,3054,Siento ya tdas las vacunas vienen contaminadas...,CRITICAL
...,...,...,...,...,...,...,...,...,...,...
18968,la tele,FACILITATOR,gold_label,111,118,25,27,124,"Hola , aporto prueba de curación : Mi padre , ...",CRITICAL
18969,líquido pulmón,NEGATIVE_EFFECT,gold_label,207,221,49,51,124,"Hola , aporto prueba de curación : Mi padre , ...",CRITICAL
18970,apenas caminaba,NEGATIVE_EFFECT,gold_label,224,239,52,54,124,"Hola , aporto prueba de curación : Mi padre , ...",CRITICAL
18971,Deliraba,NEGATIVE_EFFECT,gold_label,276,284,62,63,124,"Hola , aporto prueba de curación : Mi padre , ...",CRITICAL
