In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import os

In [2]:
# File names (one per line) of the articles to score. None of the names
# contains whitespace, so splitting the block on whitespace yields exactly
# one list entry per line.
articles_to_analyze = """
    1755_Lisbon_earthquake.txt
    1896_Summer_Olympics.txt
    1997_Pacific_hurricane_season.txt
    Actinium.txt
    Barracuda.txt
    Basketball.txt
    Bath_School_disaster.txt
    Chicago.txt
    Chocolate.txt
    Diamond.txt
    Dice.txt
    Drinking_water.txt
    Duchenne_muscular_dystrophy.txt
    Geography_of_Ireland.txt
    George_S_Richardson_engineer.txt
    Giraffe.txt
    Gunpowder.txt
    Ordinal_number.txt
    Osama_bin_Laden.txt
    Palm_oil.txt
    Peace.txt
    Pellagra.txt
    Phishing.txt
    Plant.txt
    Plato.txt
    Pneumonia.txt
    Poison_gas_in_World_War_I.txt
    Politics.txt
    Pollution.txt
    Pompeii.txt
    Recycling.txt
    Red_Kite.txt
    Rice.txt
    Rio_de_Janeiro.txt
    Robert_K_Beck.txt
    Romeo_and_Juliet.txt
    Rugby_World_Cup.txt
    Rwandan_Genocide.txt
    Salt.txt
    Sand.txt
    Santa_Claus.txt
    Scooby-Doo.txt
    Seed.txt
    Sequoia.txt
""".split()



In [4]:
# Function to preprocess the text
def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``'@user'``; a token that starts with
    ``http`` becomes ``'http'``. All other tokens are kept as they are, and
    the tokens are joined back with single spaces.
    """
    def _mask(token):
        # Mention check first, link check second (same order as the
        # replacements are applied to each token).
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Function to chunk text into manageable parts
def chunk_text(text, max_length):
    """Lazily yield ``text`` in pieces of at most ``max_length`` words.

    Words are obtained by splitting on any whitespace; each yielded piece is
    the next run of words joined with single spaces. Nothing is yielded for
    text that contains no words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_length)
    yield from (' '.join(tokens[start:start + max_length]) for start in starts)

# Function to adjust score with the lexicon
def adjust_score_with_lexicon(text, score, positive_words, negative_words, max_adjustment=0.5, neutral_threshold=0.2):
    """Nudge a sentiment ``score`` using positive/negative lexicon word counts.

    Parameters
    ----------
    text : str
        Text whose whitespace-separated words are looked up in the lexicons
        (exact, case-sensitive matches only).
    score : float
        Model sentiment score; the adjusted result is clipped to [-1, 1].
    positive_words, negative_words : container of str
        Lexicons supporting ``in`` (sets give O(1) lookups).
    max_adjustment : float, default 0.5
        Largest absolute amount that may be added to ``score``.
    neutral_threshold : float, default 0.2
        Scores with ``abs(score)`` below this value are returned unchanged.

    Returns
    -------
    float
        ``score`` unchanged when it lies inside the neutral band, otherwise
        ``score`` plus the lexicon adjustment, clipped to [-1, 1].

    Notes
    -----
    The raw adjustment is ``log(|diff| + 1) / log(max_adjustment + 1)`` with
    the sign of ``diff = positives - negatives``, then clipped to
    ``±max_adjustment``. For ``max_adjustment <= 1`` (including the default)
    the raw value for ``|diff| == 1`` already reaches ``max_adjustment``, so
    any nonzero difference results in the full ``±max_adjustment``.
    """
    # Neutral scores are returned untouched, so skip the lexicon scan for
    # them. Written as "not >=" so a NaN score is also returned as-is.
    if not abs(score) >= neutral_threshold:
        return score

    # log(max_adjustment + 1) is zero for max_adjustment == 0, which would
    # make the ratio below 0/0 (NaN) when the counts are equal. No adjustment
    # is possible in that case, so only the final clipping applies.
    if max_adjustment <= 0:
        return np.clip(score, -1, 1)

    words = text.split()
    positive_count = sum(word in positive_words for word in words)
    negative_count = sum(word in negative_words for word in words)
    difference = positive_count - negative_count

    adjustment = (np.log(abs(difference) + 1) / np.log(max_adjustment + 1)) * np.sign(difference)
    adjustment = np.clip(adjustment, -max_adjustment, max_adjustment)

    return np.clip(score + adjustment, -1, 1)

In [None]:
import glob
import os
import zipfile

import torch

# --- Configuration ----------------------------------------------------------
zip_file_path = '/content/plaintext_articles.zip'
extract_dir = '/content/extracted_articles'
# Folder (inside the extracted archive) that holds the article text files.
content_dir = os.path.join(extract_dir, 'plaintext_articles')

# Define the task and model
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Token limit passed to the tokenizer; also used as the words-per-chunk size.
# NOTE(review): a chunk of 512 *words* presumably encodes to more than 512
# *tokens*, in which case truncation=True silently drops the tail of each
# chunk. Consider a smaller words-per-chunk value; left unchanged here so
# the scores stay comparable with earlier runs.
max_length = 512

# Class order of the model output; the score below relies on index 0 being
# "negative" and index 2 being "positive".
labels = ["negative", "neutral", "positive"]

# --- Extract the archive and locate the requested articles ------------------
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extracted files:", os.listdir(extract_dir))
extracted_files = os.listdir(content_dir)
print("Number of extracted articles:", len(extracted_files))

# Only analyze requested articles that are actually present in the archive,
# and report the ones that are not instead of dropping them silently.
available_files = set(extracted_files)
missing_files = [file for file in articles_to_analyze if file not in available_files]
if missing_files:
    print("Requested articles not found (skipped):", missing_files)
text_files = [os.path.join(content_dir, file) for file in articles_to_analyze if file in available_files]
print(text_files)

# --- Load loop-invariant resources once --------------------------------------
# Load tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.eval()  # inference only

# Load positive and negative word lists (one word per line)
with open('/content/Positive words.txt') as lexicon_file:
    positive_words = set(lexicon_file.read().splitlines())
with open('/content/Negative words.txt') as lexicon_file:
    negative_words = set(lexicon_file.read().splitlines())

# --- Score every article ------------------------------------------------------
for file_path in text_files:

    with open(file_path, 'r') as file:
        text = file.read()

    # Preprocess once; the result feeds both the model and the lexicon pass.
    clean_text = preprocess(text)

    chunks = chunk_text(clean_text, max_length)
    final_scores = np.zeros(len(labels))
    # No gradients are needed for inference.
    with torch.no_grad():
        for chunk in chunks:
            encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_length)
            output = model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            final_scores += scores

    # An article without any words produces no chunks; normalizing would then
    # divide 0 by 0 and report a NaN score.
    total_score = final_scores.sum()
    if total_score == 0:
        print(f"File: {os.path.basename(file_path)} - no text to score, skipped")
        continue
    final_scores /= total_score

    sentiment_score = final_scores[2] - final_scores[0]  # Positive score minus negative score
    sentiment_score = adjust_score_with_lexicon(clean_text, sentiment_score, positive_words, negative_words)

    print(f"File: {os.path.basename(file_path)} - Sentiment score: {sentiment_score:.2f}")

['../data/plaintext_articles/1755_Lisbon_earthquake.txt', '../data/plaintext_articles/1896_Summer_Olympics.txt', '../data/plaintext_articles/1997_Pacific_hurricane_season.txt', '../data/plaintext_articles/Actinium.txt', '../data/plaintext_articles/Barracuda.txt', '../data/plaintext_articles/Basketball.txt', '../data/plaintext_articles/Bath_School_disaster.txt', '../data/plaintext_articles/Chicago.txt', '../data/plaintext_articles/Chocolate.txt', '../data/plaintext_articles/Diamond.txt', '../data/plaintext_articles/Dice.txt', '../data/plaintext_articles/Drinking_water.txt', '../data/plaintext_articles/Duchenne_muscular_dystrophy.txt', '../data/plaintext_articles/Geography_of_Ireland.txt', '../data/plaintext_articles/George_S_Richardson_engineer.txt', '../data/plaintext_articles/Giraffe.txt', '../data/plaintext_articles/Gunpowder.txt', '../data/plaintext_articles/Ordinal_number.txt', '../data/plaintext_articles/Osama_bin_Laden.txt', '../data/plaintext_articles/Palm_oil.txt', '../data/pla