In [1]:
import os, sys
# Put the shared-task repository on the import path so its helper modules can
# be imported by later cells (presumably this is where `data_tools` lives --
# confirm against the repository layout).
sys.path.append("pan-clef-2024-oppositional/")

# Dataset locations, one JSON file per language and split.
TRAIN_DATASET_ES="Dataset-Oppositional/training/dataset_es_train.json"
TRAIN_DATASET_EN="Dataset-Oppositional/training/dataset_en_train.json"
TEST_DATASET_EN ="Dataset-Oppositional/test/dataset_en_official_test_nolabels.json"
# Fixed: this previously repeated the English path, so the "Spanish" test set
# was silently the English one.
TEST_DATASET_ES ="Dataset-Oppositional/test/dataset_es_official_test_nolabels.json"

In [2]:
# Load the Spanish and English data through the shared-task loader; each call
# is unpacked into a (texts, labels, ids) triple.
# NOTE(review): the loader receives only a language code -- the TRAIN_DATASET_*
# paths defined in the first cell are not passed in; confirm where the loader
# reads its files from.
# presumably string_labels=False yields numeric labels with 'conspiracy' as the
# positive class -- verify against data_tools.dataset_loaders.
from data_tools.dataset_loaders import load_dataset_classification
texts_es, labels_es, ids_es = load_dataset_classification("es", string_labels=False, positive_class='conspiracy')
texts_en, labels_en, ids_en = load_dataset_classification("en", string_labels=False, positive_class='conspiracy')


Loading official JSON es dataset
Loading official JSON en dataset


In [5]:
# Inspect the ids returned for the Spanish split (third element of the loader's result).
ids_es

0       2807
1       3054
2        268
3       2669
4       3205
        ... 
3995    1056
3996     861
3997    5248
3998    1328
3999     124
Length: 4000, dtype: object

## Representation models

- Traditional form: **Latent Semantic Analysis (LSA)**. LSA is a technique used in NLP and information retrieval to
analyze the relationships between a set of documents and the
terms they contain. It's primarily employed for text classification,
clustering, and information retrieval tasks.

- Static word embedding: **fasttext-wiki-news-subwords-300** for English text. Because this model was trained only on English text, a different model is needed to represent Spanish text well. For Spanish we are going to use a similar fastText model trained on Spanish text from Common Crawl and Wikipedia. These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives. https://fasttext.cc/docs/en/crawl-vectors.html

- Contextual word embedding: **DeBERTa**. 

In [1]:
import gensim.downloader as api # works with scipy==1.10.1
# Query the gensim downloader catalogue and list every model it offers,
# one (name, metadata) tuple per line.
info = api.info()
for model_name, model_meta in info["models"].items():
    print((model_name, model_meta))

# Load the 'fasttext-wiki-news-subwords-300' vectors through the downloader.
word_vectors = api.load('fasttext-wiki-news-subwords-300')


('fasttext-wiki-news-subwords-300', {'num_records': 999999, 'file_size': 1005007116, 'base_dataset': 'Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens)', 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/fasttext-wiki-news-subwords-300/__init__.py', 'license': 'https://creativecommons.org/licenses/by-sa/3.0/', 'parameters': {'dimension': 300}, 'description': '1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).', 'read_more': ['https://fasttext.cc/docs/en/english-vectors.html', 'https://arxiv.org/abs/1712.09405', 'https://arxiv.org/abs/1607.01759'], 'checksum': 'de2bb3a20c46ce65c9c131e1ad9a77af', 'file_name': 'fasttext-wiki-news-subwords-300.gz', 'parts': 1})
('conceptnet-numberbatch-17-06-300', {'num_records': 1917247, 'file_size': 1225497562, 'base_dataset': 'ConceptNet, word2vec, GloVe, and OpenSubtitles 2016', 'reader_code': 'https://github.com/RaRe-Technologies/ge

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
from transformers import DebertaTokenizer
from sklearn.decomposition import TruncatedSVD
import gensim.downloader as api # works with scipy==1.10.1
import string 

# NLTK resources used below: 'punkt' for word_tokenize, 'stopwords' for the
# stop-word list.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Function to preprocess text
def preprocess_text(text, stop_words=stopwords.words('english')):
    """Lower-case and tokenize ``text``, dropping stop words and non-alphanumeric tokens.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list, which is
        evaluated once when the function is defined.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Membership tests on a set are O(1); scanning the stop-word list for every
    # token made each document cost O(tokens * stop_words).
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)

    # `isalnum()` already rejects punctuation, symbols and empty strings, so the
    # former `w not in string.punctuation` substring test was redundant.
    return [token for token in word_tokenize(text.lower())
            if token.isalnum() and token not in stop_set]

# Function to calculate average vector representation of a document
def document_vector(tokens, word_vectors):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of the document.
    word_vectors : mapping-like embedding model
        Must expose ``key_to_index`` (vocabulary lookup) and ``__getitem__``
        (token -> vector). If it has a ``vector_size`` attribute, that is used
        as the embedding dimension; otherwise 300 is assumed, as before.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``vector_size``. It is all zeros when no
        token is found in the vocabulary.
    """
    # Read the dimension from the model instead of hard-coding 300, so models of
    # other sizes do not fail with a shape mismatch on the first addition.
    dim = getattr(word_vectors, "vector_size", 300)
    total = np.zeros((dim,))
    # Number of tokens that actually contributed a vector.
    hits = 0
    for word in tokens:
        if word in word_vectors.key_to_index:
            total += word_vectors[word]
            hits += 1
    # Guard against division by zero for documents with no known token.
    if hits:
        total /= hits
    return total

# Function to create LSA features
def create_lsa_features(train_texts, val_texts, n_components=100, random_state=42):
    """Build LSA features: character-trigram TF-IDF followed by truncated SVD.

    Both the vectorizer and the SVD are fitted on ``train_texts`` only and then
    applied to ``val_texts``, so no validation information reaches the fit.

    Parameters
    ----------
    train_texts : iterable of str
        Documents used to fit the TF-IDF vocabulary and the SVD.
    val_texts : iterable of str
        Documents projected with the already-fitted transformers.
    n_components : int, default 100
        Number of SVD components to keep.
    random_state : int or None, default 42
        Seed for TruncatedSVD's randomized solver. A fixed seed makes the
        features reproducible across kernel restarts; pass ``None`` to restore
        the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_lsa, val_lsa)`` as returned by the SVD's ``fit_transform`` and
        ``transform``.
    """
    # Character 3-grams with raw (non-binary) term counts.
    tfidf_vectorizer = TfidfVectorizer(binary=False, ngram_range=(3, 3), analyzer='char')
    train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
    val_tfidf = tfidf_vectorizer.transform(val_texts)

    # Seeded so repeated runs yield the same projection.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    train_lsa = svd.fit_transform(train_tfidf)
    val_lsa = svd.transform(val_tfidf)

    return train_lsa, val_lsa

def _embed_documents(frame, word_vectors):
    """Return one averaged word-vector per row of ``frame['text']``, aligned to ``frame.index``."""
    vectors = [document_vector(preprocess_text(text), word_vectors)
               for text in frame['text']]
    # Build the Series positionally against the frame's own index. A dict keyed
    # by index label would collapse duplicate labels, and assigning a raw dict
    # to a column is handled differently across pandas versions.
    return pd.Series(vectors, index=frame.index, dtype=object)


def static_embedding(train_texts, val_texts, word_vectors=None):
    """Add a ``'fasttext'`` column of averaged word vectors to both DataFrames.

    Parameters
    ----------
    train_texts, val_texts : pandas.DataFrame
        Frames with a ``'text'`` column. Both are modified in place, as before.
    word_vectors : embedding model, optional
        Pre-loaded vectors to use. When omitted, the
        'fasttext-wiki-news-subwords-300' model is loaded through gensim's
        downloader, as before. Passing an already-loaded model avoids
        reloading it on every call.

    Returns
    -------
    tuple
        ``(train_texts, val_texts)``, the same objects that were passed in,
        returned for convenience and for symmetry with ``create_lsa_features``.
        Callers that ignored the previous ``None`` result are unaffected.
    """
    if word_vectors is None:
        word_vectors = api.load('fasttext-wiki-news-subwords-300')

    # The same procedure applies to both splits; only the target frame differs.
    train_texts['fasttext'] = _embed_documents(train_texts, word_vectors)
    val_texts['fasttext'] = _embed_documents(val_texts, word_vectors)

    return train_texts, val_texts

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dandr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
