## Code for inferring the sentiment of wiki pages, using the RoBERTa language model.
- To be run in Colab
- When inferring, the output sentiments are saved to the file ```sentiments.pickle``` after each article is processed, so you can stop and restart the cell without losing progress (provided the Colab environment does not delete the files or reload a new environment).
- Prior to inferring, please import the tar/zip file of wikispeedia text articles ```wikispeedia_articles_plaintext.tar``` and uncomment the corresponding loading lines

In [None]:
# Install the Hugging Face Transformers package (the tokenizer/model classes are imported from it below).
!pip install transformers
# Install PyTorch (imported below as `torch`).
!pip install torch

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import os
import glob
import tarfile
import zipfile
import torch
import pickle

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Hand-picked list of article file names (44 entries).
# NOTE(review): none of the cells visible here reads this list -- confirm
# whether it is still needed or should drive the inference loop.
articles_to_analyze = [
    "1755_Lisbon_earthquake.txt",
    "1896_Summer_Olympics.txt",
    "1997_Pacific_hurricane_season.txt",
    "Actinium.txt",
    "Barracuda.txt",
    "Basketball.txt",
    "Bath_School_disaster.txt",
    "Chicago.txt",
    "Chocolate.txt",
    "Diamond.txt",
    "Dice.txt",
    "Drinking_water.txt",
    "Duchenne_muscular_dystrophy.txt",
    "Geography_of_Ireland.txt",
    "George_S_Richardson_engineer.txt",
    "Giraffe.txt",
    "Gunpowder.txt",
    "Ordinal_number.txt",
    "Osama_bin_Laden.txt",
    "Palm_oil.txt",
    "Peace.txt",
    "Pellagra.txt",
    "Phishing.txt",
    "Plant.txt",
    "Plato.txt",
    "Pneumonia.txt",
    "Poison_gas_in_World_War_I.txt",
    "Politics.txt",
    "Pollution.txt",
    "Pompeii.txt",
    "Recycling.txt",
    "Red_Kite.txt",
    "Rice.txt",
    "Rio_de_Janeiro.txt",
    "Robert_K_Beck.txt",
    "Romeo_and_Juliet.txt",
    "Rugby_World_Cup.txt",
    "Rwandan_Genocide.txt",
    "Salt.txt",
    "Sand.txt",
    "Santa_Claus.txt",
    "Scooby-Doo.txt",
    "Seed.txt",
    "Sequoia.txt",
]



In [None]:
def preprocess(text):
    """Mask user mentions and links in ``text`` with fixed placeholder tokens.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. Tokens are re-joined with single spaces, so the
    original spacing between tokens is kept.
    """

    def _mask(token):
        # A lone "@" is left untouched; only real mentions are masked.
        if len(token) > 1 and token.startswith('@'):
            token = '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def chunk_text(text, max_length):
    """Yield consecutive pieces of ``text`` holding at most ``max_length`` words.

    Words come from ``str.split()`` (any run of whitespace separates words),
    and each yielded piece joins its words with single spaces. Note that the
    limit counts whitespace-separated words, not model tokens.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_length)
    yield from (' '.join(tokens[begin:begin + max_length]) for begin in starts)

def adjust_score_with_lexicon(text, score, positive_words, negative_words, max_adjustment=0.5):
    """Shift a sentiment ``score`` according to lexicon words found in ``text``.

    Args:
        text: Text whose whitespace-separated words are looked up in the
            lexicons. Matching is exact (case- and punctuation-sensitive).
        score: Sentiment score to adjust. Scores with ``abs(score) < 0.2``
            are returned unchanged.
        positive_words: Container of positive lexicon entries.
        negative_words: Container of negative lexicon entries.
        max_adjustment: Largest absolute shift applied to ``score``. A
            non-positive value disables the adjustment.

    Returns:
        ``score`` itself when ``abs(score) < 0.2``; otherwise the shifted
        score clipped to ``[-1, 1]``.
    """
    # Weak scores are left alone.
    if abs(score) < 0.2:
        return score

    words = text.split()
    positive_count = sum(word in positive_words for word in words)
    negative_count = sum(word in negative_words for word in words)
    difference = positive_count - negative_count

    if max_adjustment > 0 and difference != 0:
        # NOTE(review): with the default max_adjustment=0.5, a difference of
        # one word already gives log(2)/log(1.5) ~= 1.71, which is clipped to
        # 0.5 -- so any non-zero difference saturates. Confirm this is the
        # intended scaling.
        adjustment = (np.log(abs(difference) + 1) / np.log(max_adjustment + 1)) * np.sign(difference)
        adjustment = np.clip(adjustment, -max_adjustment, max_adjustment)
    else:
        # Either the lexicon counts tie or no shift is allowed. This branch
        # also avoids dividing by log(1) == 0 when max_adjustment is 0, which
        # previously produced NaN.
        adjustment = 0.0

    return np.clip(score + adjustment, -1, 1)

In [None]:
# Directory the article archive is unpacked into.
extract_dir = '/content/extracted_articles'

# Alternative input: uncomment these lines to load the articles from a zip
# archive instead of the tar archive below.
# zip_file_path = '/content/plaintext_articles.zip'

# Extract the zip file
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_dir)

tar_file_path = '/content/wikispeedia_articles_plaintext.tar'

# Extract the tar file ('r:' opens it as an uncompressed archive).
with tarfile.open(tar_file_path, 'r:') as tar:
    if hasattr(tarfile, 'data_filter'):
        # Extraction filters are available: refuse members that would escape
        # extract_dir (absolute paths, "..", unsafe links).
        tar.extractall(extract_dir, filter='data')
    else:
        # Older Python without extraction filters: plain extraction.
        tar.extractall(extract_dir)

In [None]:
# Folder holding the article text files; presumably the top-level folder
# contained in the extracted archive -- verify against the archive layout.
articles_subdir = "plaintext_articles"
text_files_dir = os.path.join(extract_dir, articles_subdir)

In [None]:
# ---------------------------------------------------------------------------
# Sentiment inference over every file in text_files_dir.
#
# Results are kept in `sentiments` (file name -> [negative, neutral, positive])
# and checkpointed to sentiments.pickle after each newly scored article, so an
# interrupted run resumes where it stopped.
# ---------------------------------------------------------------------------

# Define the task and model
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Load tokenizer and model from Hugging Face once; they do not change between
# articles, so reloading them per article only wastes time.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

# Maximum length passed to the tokenizer, also used as the chunk size in words.
# NOTE(review): chunk_text counts whitespace-separated words while the
# tokenizer truncates at max_length tokens, so the tail of a chunk that
# tokenizes to more than max_length tokens is dropped. Consider a smaller
# word budget per chunk.
max_length = 512

# Names for the three model outputs, in the order used for the stored scores.
# NOTE(review): hard-coded rather than read from the model config -- confirm
# the order matches the checkpoint.
labels = ["negative", "neutral", "positive"]


def _load_word_list(path):
    """Return the lines of ``path`` as a set, or an empty set if it is absent."""
    if not os.path.isfile(path):
        return set()
    with open(path, encoding='utf-8') as handle:
        return set(handle.read().splitlines())


def _save_sentiments(results, path='sentiments.pickle'):
    """Write ``results`` to ``path`` via a temp file so an interrupt cannot corrupt it."""
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as handle:
        pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(tmp_path, path)


# Positive and negative word lists. They are only needed by the optional
# lexicon adjustment (commented out below), so a missing file yields an empty
# set instead of aborting the run.
positive_words = _load_word_list('/content/Positive words 4.txt')
negative_words = _load_word_list('/content/Negative words 4.txt')

sentiments = dict()

if os.path.isfile("sentiments.pickle"):
    # NOTE: unpickling can execute arbitrary code; only resume from a
    # checkpoint file written by this notebook.
    with open('sentiments.pickle', 'rb') as handle:
        sentiments = pickle.load(handle)

count = 0
for file_path in os.listdir(text_files_dir):
    if file_path not in sentiments:
        path = os.path.join(text_files_dir, file_path)
        print("On article: ", file_path)
        with open(path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Sum the per-chunk class probabilities; inference needs no gradients.
        final_scores = np.zeros(len(labels))
        with torch.no_grad():
            for chunk in chunk_text(preprocess(text), max_length):
                encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_length).to(device)
                output = model(**encoded_input)
                output = output[0][0].detach().to("cpu").numpy()
                final_scores += softmax(output)

        total = final_scores.sum()
        if total > 0:
            # Normalise so the three scores sum to 1.
            final_scores /= total
        else:
            # No chunk was scored (empty article): record NaN explicitly
            # rather than relying on a silent 0/0 division.
            print("No text to score in article: ", file_path)
            final_scores = np.full(len(labels), np.nan)

        sentiment_score = list(final_scores) # [negative score, neutral score, positive score]

        # Optional scalar score with lexicon adjustment:
        # sentiment_score = final_scores[2] - final_scores[0]  # Positive score minus negative score
        # sentiment_score = adjust_score_with_lexicon(preprocess(text), sentiment_score, positive_words, negative_words)

        sentiments[file_path] = sentiment_score
        _save_sentiments(sentiments)

    # Report the score stored for THIS file, whether it was just computed or
    # restored from the checkpoint.
    print(f"File {count}: {os.path.basename(file_path)} - Sentiment score (neg-neut-pos): {sentiments[file_path]}")
    count += 1

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
File 1183: Popular_culture_studies.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462]
File 1184: Rocky_Mountains.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462]
File 1185: Yorkshire_Dales.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462]
File 1186: William_Pitt_the_Younger.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462]
File 1187: Bioinformatics.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462]
File 1188: Osiris.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462]
File 1189: Three_Laws_of_Robotics.txt - Sentiment score (neg-neut-pos): [0.050570643186387355, 0.773255496485128, 0.17617386032848462