In [None]:
#! pip install transformers
#! pip install vaderSentiment

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
from tqdm.auto import tqdm

## 1. SentiWordNet

In [None]:
from sentiment_analysis_1 import *


In [None]:
# Load the article titles; load_titles is presumably provided by the
# `from sentiment_analysis_1 import *` star-import above — TODO confirm.
articles = load_titles()
print(len(articles), "articles found")

4604 articles found


In [None]:
# Load the content of every article (load_html presumably comes from the star-import above).
content = load_html(articles)
# Sanity check: collect the distinct first elements of all loaded contents.
set(content[k][0] for k in content.keys())

{'\n'}

## Load sentiment data

In [None]:
sentiment_lexicon_path = "../sentiwordnet/SentiWordNet_3.0.0.txt"
# Tab-separated lexicon. The first 25 lines and the last line of the file are skipped
# (presumably the comment header and a trailing marker line — TODO confirm against the file).
# skipfooter is only supported by the Python parsing engine, hence engine="python".
df = pd.read_csv(sentiment_lexicon_path, sep="\t", skiprows=25, skipfooter=1, engine="python")
df

Unnamed: 0,# POS,ID,PosScore,NegScore,SynsetTerms,Gloss
0,a,1740,0.125,0.000,able#1,(usually followed by `to') having the necessar...
1,a,2098,0.000,0.750,unable#1,(usually followed by `to') not having the nece...
2,a,2312,0.000,0.000,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...
3,a,2527,0.000,0.000,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...
4,a,2730,0.000,0.000,acroscopic#1,facing or on the side toward the apex
...,...,...,...,...,...,...
117654,v,2771756,0.000,0.000,run_dry#1 dry_out#2,"become empty of water; ""The river runs dry in ..."
117655,v,2771888,0.000,0.125,fog_up#1,"get foggy; ""The windshield fogged up"""
117656,v,2771997,0.000,0.000,coal#1 char#1,"burn to charcoal; ""Without a drenching rain, t..."
117657,v,2772202,0.125,0.250,haze#1,"become hazy, dull, or cloudy"


In [None]:
def parse_synset_terms(row):
    """
    Turn a SynsetTerms cell into the list of bare words it contains.

    Args:
        row: str, whitespace-separated terms, each optionally followed by '#<sense number>'
    Returns:
        list of str, the terms with everything from the first '#' onwards removed
    """
    return [term.partition('#')[0] for term in row.split()]

# Replace each SynsetTerms string by its list of words, then give every word its own row
# (explode repeats the other columns for each word) and renumber the rows from 0.
df['SynsetTerms'] = df['SynsetTerms'].apply(parse_synset_terms)
df = df.explode('SynsetTerms').reset_index(drop=True)
df

Unnamed: 0,# POS,ID,PosScore,NegScore,SynsetTerms,Gloss
0,a,1740,0.125,0.000,able,(usually followed by `to') having the necessar...
1,a,2098,0.000,0.750,unable,(usually followed by `to') not having the nece...
2,a,2312,0.000,0.000,dorsal,facing away from the axis of an organ or organ...
3,a,2312,0.000,0.000,abaxial,facing away from the axis of an organ or organ...
4,a,2527,0.000,0.000,ventral,nearest to or facing toward the axis of an org...
...,...,...,...,...,...,...
206936,v,2771888,0.000,0.125,fog_up,"get foggy; ""The windshield fogged up"""
206937,v,2771997,0.000,0.000,coal,"burn to charcoal; ""Without a drenching rain, t..."
206938,v,2771997,0.000,0.000,char,"burn to charcoal; ""Without a drenching rain, t..."
206939,v,2772202,0.125,0.250,haze,"become hazy, dull, or cloudy"


In [None]:
# Word -> score lookup tables built from the exploded lexicon.
# NOTE(review): a word that occurs on several rows (one row per synset it belongs to) keeps only
# the value of its last row, because dict(zip(...)) overwrites earlier keys — confirm this is intended.
positiveness = dict(zip(df.SynsetTerms, round(df.PosScore - df.NegScore,5)))     # positiveness/negativeness score on [-1,1]
extremeness = dict(zip(df.SynsetTerms, (df.PosScore + df.NegScore)/2))  # extremeness score on [0,1]

## Load validation data

In [None]:
# Validation labels: tab-separated file without a header row, read as columns (article, sentiment).
validation = pd.read_csv("../validation.txt", sep="\t", header=None, names=["article", "sentiment"])
# Index by article title so that per-article results can be looked up by title.
validation = validation.set_index("article")
validation.head()

Unnamed: 0_level_0,sentiment
article,Unnamed: 1_level_1
1755_Lisbon_earthquake,-1
1896_Summer_Olympics,1
1997_Pacific_hurricane_season,-1
Actinium,0
Barracuda,-1


## Model 
Since there is no training we can go straight to validation data

In [None]:
def model(articles_content, lexicon):
    """
    Score each article by averaging the lexicon scores of the words it contains.

    The content is split on whitespace; words that are not keys of the lexicon are ignored.

    Args:
        articles_content: dict(), dictionary of (article-title: article-content) pairs
        lexicon: dict(), dictionary of (word: sentiment) pairs, with sentiment a sentiment score
    Returns:
        dict(), dictionary of (article-title: average-score) pairs; the score is None for an
        article in which no word of the lexicon occurs
    """
    def _mean_score(text):
        # Scores of every occurrence of a known word, in reading order.
        hits = [lexicon[token] for token in text.split() if token in lexicon]
        if not hits:
            return None
        return sum(hits) / len(hits)

    return {title: _mean_score(text) for title, text in articles_content.items()}

In [None]:
# Keep only the content of the articles that appear in the validation set.
validation_content = {title: content[title] for title in validation.index}

In [None]:
# Average positiveness score per validation article (None when no lexicon word is found).
scores = model(validation_content, positiveness)

In [None]:
# scores is a dict keyed by article title.
# NOTE(review): this relies on pandas aligning the dict keys with the `article` index — confirm for the pandas version in use.
validation["prediction"] = scores

In [None]:
# Show the first rows together with summary statistics of the labels and predictions.
validation.head(), validation.describe()

(                               sentiment  prediction
 article                                             
 1755_Lisbon_earthquake                -1   -0.050156
 1896_Summer_Olympics                   1   -0.006281
 1997_Pacific_hurricane_season         -1   -0.037893
 Actinium                               0   -0.005826
 Barracuda                             -1   -0.002747,
        sentiment  prediction
 count  35.000000   35.000000
 mean   -0.085714   -0.007911
 std     0.853072    0.016548
 min    -1.000000   -0.051758
 25%    -1.000000   -0.011520
 50%     0.000000   -0.005826
 75%     1.000000    0.001189
 max     1.000000    0.017951)

## Score predictions by the sign

In [None]:
# Count the articles whose label and prediction have the same sign (product strictly positive).
# Articles labelled 0 can never be counted here, because their product is always 0.
print(sum((validation["sentiment"] * validation["prediction"]) > 0), "/", len(validation))

15 / 35


## Prediction corrected by validation average (but better if we correct with training average)

In [None]:
# Remove the global bias of the predictions by centring them on zero: the mean has to be
# SUBTRACTED (adding it, as before, pushes the predictions further away from zero mean).
# Articles labelled 0 still cannot be counted, since their product is always 0.
centered_prediction = validation["prediction"] - validation["prediction"].mean()
print(sum((validation["sentiment"] * centered_prediction) > 0), "/", len(validation))

16 / 35


In [None]:
# Save the labels and predictions.
# NOTE(review): `validation` is indexed by article title and index=False drops the index,
# so the written rows do not contain the article titles — confirm this is intended.
validation.to_csv('sentiment_analysis_1.csv', index=False)

# 2. Pattern

In [None]:
# Titles of the articles to score (without the ".txt" extension, which is appended when the files are opened).
file_names = ['1755_Lisbon_earthquake','1896_Summer_Olympics','1997_Pacific_hurricane_season','Actinium','Barracuda','Basketball','Bath_School_disaster','Chicago','Chocolate','Diamond','Dice','Drinking_water','Duchenne_muscular_dystrophy','Geography_of_Ireland','Giraffe','Gunpowder','Osama_bin_Laden','Palm_oil','Peace','Pellagra','Phishing','Plant','Plato','Pneumonia','Poison_gas_in_World_War_I','Politics','Pollution','Red_Kite','Rice','Rio_de_Janeiro','Romeo_and_Juliet','Rugby_World_Cup','Rwandan_Genocide','Santa_Claus','Scooby-Doo']
len(file_names)

35

### MODEL DESCRIPTION
Pattern is a multipurpose library that is capable of handling NLP operations, data mining, machine learning etc. It also contains sentiment analysis functionality which is suitable for our task.
The `sentiment` function of the `pattern.text.en` module calculates the sentiment of a given text. It takes a sentence as input, which can be a string, Synset, word or document, and returns a (polarity, subjectivity) tuple with polarity between -1.0 and +1.0 and subjectivity between 0.0 and 1.0. Polarity describes the emotional leaning of the text (negative to positive), while subjectivity describes how opinionated, as opposed to factual, the text is.

In our usage, the input is a string containing the entire article loaded from the Wikispeedia dataset. In this case, it first tokenizes the text into words (punctuation, spaces and abbreviations are handled at this stage), then lowercases each word because the sentiment lookup is case-insensitive. Next, it calculates the sentiment of each word by consulting the predefined sentiment [dictionary](https://github.com/clips/pattern/blob/master/pattern/text/en/en-sentiment.xml) (modifiers and negations are also taken into account at this stage). Finally, it returns the average over all the words as the sentiment of the text.

Pattern is a classic, well-known, non-commercial library for sentiment analysis; the project has existed for over 10 years and has 8.6k stars on GitHub. It provides detailed results (polarity and subjectivity) for many mainstream languages, and its API is fast and easy to use, with other NLP preprocessing tools built in. More details can be found in the [official documentation](https://digiasset.org/html/pattern.html) and the [repository](https://github.com/clips/pattern).




### METHOD && RESULTS
Each selected article is loaded and fed into the `sentiment` function; the polarity is then printed. The results are shown below.

In [None]:
from pattern.text.en import sentiment

# Score every selected article with Pattern and print its polarity.
for file_name in file_names:
    article_path = 'data/plaintext_articles/' + file_name + '.txt'
    with open(article_path, 'r', encoding='utf-8') as file:
        data = file.read()
    # sentiment() yields a (polarity, subjectivity) pair; only the polarity is reported.
    polarity, subjectivity = sentiment(data)
    print(f"{file_name}: {polarity}")

1755_Lisbon_earthquake: 0.0816923282902664
1896_Summer_Olympics: 0.11173071331653418
1997_Pacific_hurricane_season: 0.051366249491249474
Actinium: 0.03542682926829269
Barracuda: 0.1132213321465658
Basketball: 0.08600218021995364
Bath_School_disaster: 0.010679336219336222
Chicago: 0.10564842500695483
Chocolate: 0.07674311830989919
Diamond: 0.12359236785162714
Dice: -0.0001423413188119043
Drinking_water: 0.12155415214866433
Duchenne_muscular_dystrophy: 0.051342562953478464
Geography_of_Ireland: 0.06101678376268537
Giraffe: 0.04892030793508625
Gunpowder: 0.01667841269841271
Osama_bin_Laden: 0.04482218734525007
Palm_oil: 0.10377811870669014
Peace: 0.07109719189365207
Pellagra: 0.013132859204287773
Phishing: 0.03130031080031078
Plant: 0.09383068133068131
Plato: 0.16051446416831025
Pneumonia: 0.05341062158293232
Poison_gas_in_World_War_I: 0.08464027042373903
Politics: 0.10395405509821987
Pollution: 0.08715013543960917
Red_Kite: 0.035212025919573085
Rice: 0.08478642004761416
Rio_de_Janeiro: 0

Compared with the human-labeled results (a ternary value in {-1, 0, 1}), the MSE is around **0.7**.

### DISCUSSION
The performance of this model is generally poor compared to the other models we tested. A possible reason is that it uses a fixed sentiment dictionary, which may be biased across different domains. We observe that the model tends to give a very small value in [0, 0.1] regardless of the article; a reason could be that Wikipedia articles are designed to be neutral and objective: they generally look at problems dialectically and rarely express strong emotions. However, within the scope of this project, we care more about the sentiment associated with the subject of the article (the word itself) than about the way the article is written. The model weighs all the words on the page evenly and may thus produce results that are not what we expect.

This is not the optimal model for our task; an ML-based approach is generally more flexible and powerful in this case.

# 3. Vader

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a sentiment analysis method attuned to expressions from social media. The model classifies the text into positive, neutral, and negative parts; then, by leveraging a lexicon and grammatical rules, it computes from the valence scores the 'compound score', which is a normalized, weighted composite score of the sentiment of the text.

Utilizing NLTK, VADER takes into consideration text nuances, informal phrases, and even non-English sentences.

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm.auto import tqdm
import os
import pandas as pd

## Load articles' titles and content

In [None]:

titles_path = "Feuille de calcul sans titre - results.csv"
html_dir = "plaintext_articles"

def load_titles(file_path=titles_path):
    """
    Load the titles of the Wikispeedia pages to analyse

    Input args:
        file_path: path to a header-less CSV file whose single column holds the article titles
    Output:
        titles: pandas DataFrame with one column named 'title'
    """
    titles = pd.read_csv(file_path, header=None)
    titles.columns = ['title']
    return titles

def load_html(titles, dir=html_dir):
    """
    Load the text of Wikispeedia pages

    For each title the file "<dir>/<title>.txt" is read as UTF-8; its first line is dropped
    and the remaining lines are kept as one string.

    Input args:
        titles: pandas Dataframe with a 'title' column listing the articles to load
        dir: path to the directory containing the articles' text files
    Output:
        titles: the same DataFrame, modified in place, with an added 'content' column.
                The content is None for a title whose file is missing (a warning is printed).
    """
    content = []
    for title in titles.title:
        path = os.path.join(dir, title + ".txt")
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as file:
                # Skip the first line of the file, keep everything after it.
                content.append("".join(file.readlines()[1:]))
        else:
            print("WARNING: file", path, "is missing")
            # Keep exactly one entry per title: without this placeholder the list would be
            # shorter than the DataFrame and the column assignment below would raise.
            content.append(None)
    titles['content'] = content
    return titles

In [None]:
# Load the titles from the default CSV (titles_path) and show how many there are.
articles = load_titles()
len(articles)

35

In [None]:
# load_html adds a 'content' column to `articles` and returns that same DataFrame,
# so `content` and `articles` refer to the same object.
content = load_html(articles)
content

Unnamed: 0,title,content
0,1755_Lisbon_earthquake,\n1755 Lisbon earthquake\n\n2007 Schools Wikip...
1,1896_Summer_Olympics,\n1896 Summer Olympics\n\n2007 Schools Wikiped...
2,1997_Pacific_hurricane_season,\n1997 Pacific hurricane season\n\n2007 School...
3,Actinium,\nActinium\n\n2007 Schools Wikipedia Selection...
4,Barracuda,\nBarracuda\n\n2007 Schools Wikipedia Selectio...
5,Basketball,\nBasketball\n\n2007 Schools Wikipedia Selecti...
6,Bath_School_disaster,\nBath School disaster\n\n2007 Schools Wikiped...
7,Chicago,\nChicago\n\n2007 Schools Wikipedia Selection....
8,Chocolate,\nChocolate\n\n2007 Schools Wikipedia Selectio...
9,Diamond,\nDiamond\n\n2007 Schools Wikipedia Selection....


## Analysing the compound score

In [None]:
# Compute VADER's compound score for the text of every article.
scores = []
analyzer = SentimentIntensityAnalyzer()
# Iterate over the text column itself: it has a length, so tqdm can display the total,
# and the loop no longer depends on the index-probing fallback used when iterating `.iloc`.
for article_text in tqdm(content['content']):
    vs = analyzer.polarity_scores(article_text)
    scores.append(vs['compound'])


0it [00:00, ?it/s]

In [None]:
# Store the compound scores next to the articles (scores was filled in the row order of content).
content['score'] = scores
content

Unnamed: 0,title,content,score
0,1755_Lisbon_earthquake,\n1755 Lisbon earthquake\n\n2007 Schools Wikip...,-0.997
1,1896_Summer_Olympics,\n1896 Summer Olympics\n\n2007 Schools Wikiped...,0.9999
2,1997_Pacific_hurricane_season,\n1997 Pacific hurricane season\n\n2007 School...,-0.9998
3,Actinium,\nActinium\n\n2007 Schools Wikipedia Selection...,-0.8792
4,Barracuda,\nBarracuda\n\n2007 Schools Wikipedia Selectio...,0.9944
5,Basketball,\nBasketball\n\n2007 Schools Wikipedia Selecti...,0.9999
6,Bath_School_disaster,\nBath School disaster\n\n2007 Schools Wikiped...,-0.9998
7,Chicago,\nChicago\n\n2007 Schools Wikipedia Selection....,0.9999
8,Chocolate,\nChocolate\n\n2007 Schools Wikipedia Selectio...,0.9996
9,Diamond,\nDiamond\n\n2007 Schools Wikipedia Selection....,1.0


As can be inferred from the prediction, the model is good at correctly splitting the articles into two separate clusters, with a quite impressive squared error of 0.2. However, the obtained scores are quite extreme, making it impractical to infer the intensity of the negative or positive sense of each article. This could be attributed to the prevalence of neutral words, causing smaller differences between negative and positive counterparts.

In [None]:
# Write the remaining columns (everything except the article text) without header row or index.
content.drop(columns='content').to_csv('results.csv',index=False,header=False)

# 4. Roberta

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def preprocess(text):
    """
    Mask user mentions and links in `text`.

    The text is split on single spaces; a token starting with '@' (and longer than one
    character) becomes '@user', a token starting with 'http' becomes 'http'. The tokens are
    joined back with single spaces.
    """
    def _mask(token):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

def chunk_text(text, max_length):
    """Yield consecutive pieces of `text`, each made of at most `max_length` whitespace-separated words joined by single spaces."""
    tokens = text.split()
    yield from (
        ' '.join(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    )

# Hugging Face Hub identifier of the checkpoint used for scoring.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
max_length = 512  # used both as words-per-chunk and as the tokenizer's truncation length

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# The reader iterates over the in-memory list `html`, so it can be consumed after the response is closed.
labels = [row[1] for row in csvreader if len(row) > 1]

# NOTE(review): this rebinds `model`, shadowing the `model(articles_content, lexicon)` function
# defined earlier in the notebook; cells calling that function must not be re-run after this one.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

text = input("Enter the text you want to analyze: ")
chunks = chunk_text(preprocess(text), max_length)


# Assume labels are in the order of [negative, neutral, positive]
# Adjust the scores to be in the range of [-1, 1]
# For simplicity, this assumes three classes. If there are more classes, the mapping will need to be adjusted accordingly.
final_scores = np.zeros(len(labels))
for chunk in chunks:
    # NOTE(review): a chunk holds up to max_length *words*, while truncation caps the encoding at
    # max_length *tokens*; if a chunk tokenizes to more tokens, its tail is not scored — confirm acceptable.
    encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_length)
    output = model(**encoded_input)
    # Class probabilities of this chunk; every chunk contributes with the same weight.
    scores = softmax(output[0][0].detach().numpy())
    final_scores += scores

# Normalize the final_scores
# NOTE(review): with an empty input there are no chunks, final_scores stays all zeros and this divides 0 by 0.
final_scores /= final_scores.sum()

# Now, map the scores to the range of [-1, 1]
# This mapping assumes the labels are in the correct order.
if len(labels) == 3:  # Check if there are three sentiment categories
    # Mapping: Negative * -1 + Neutral * 0 + Positive * 1
    sentiment_score = final_scores[2] - final_scores[0]  # positive score minus negative score
else:
    raise ValueError("The number of sentiment labels is not equal to 3, and the scoring system needs to be adjusted.")

print(f"{sentiment_score:.2f}")



Enter the text you want to analyze:    #copyright  Drought  2007 Schools Wikipedia Selection. Related subjects: Climate and the Weather     Fields outside Benambra, Victoria suffering from drought conditions    Fields outside Benambra, Victoria suffering from drought conditions     A drought is an extended period of months or years when a region notes    a deficiency in its water supply. Generally, this occurs when a region    receives consistently below average precipitation. However, it can also    be worsened by man. It can have a substantial impact on the ecosystem    and agriculture of the affected region. Although droughts can be    long-lived, even a short, intense drought can cause significant damage    .     Man-made erosion can play a role, as it did in the North American Dust    Bowl in the 1930s, as can general climate change. Many speculate that    global warming will have a substantial impact on agriculture throughout    the world, and especially in developing nations. Fo

In [None]:

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


# Assuming the file is already extracted and available as 'SentiWordNet.txt'

sentiwordnet_dict = {}
file_path = '/content/SentiWordNet_3.0.0.txt'  # Update with your actual file path

# Read the file and parse the sentiment scores.
# NOTE(review): only the first term of each synset is stored, and a later synset starting with the
# same term overwrites the earlier entry — confirm this is intended.
with open(file_path, 'r', encoding='utf-8') as swndata:
    for line in swndata:
        # Skip comments and blank lines
        if line.startswith('#') or not line.strip():
            continue
        # Split the line by tab and only keep lines made of exactly 6 fields
        parts = line.strip().split('\t')
        if len(parts) != 6:
            continue
        # Underscore-prefixed names for the unused fields, so that the built-in `id`
        # is not rebound at notebook scope.
        _pos_tag, _synset_id, pos_score, neg_score, synset_terms, _gloss = parts
        pos_value, neg_value = float(pos_score), float(neg_score)
        # Entries without any sentiment carry no information for the score
        if pos_value == 0.0 and neg_value == 0.0:
            continue
        # Some synsets list several terms separated by a space, take the first one
        term = synset_terms.split(' ')[0].split('#')[0]  # Get the term without sense number
        if term:  # Check if the term is not empty
            # The dictionary key is the term and the values are the positive and negative scores
            sentiwordnet_dict[term] = {'PosScore': pos_value, 'NegScore': neg_value}

# After populating sentiwordnet_dict
print(list(sentiwordnet_dict.items())[:10])  # Print the first 10 entries



# Now you can use sentiwordnet_dict to look up sentiment scores


# Define the preprocess and sentiment analysis functions
def preprocess(text):
    """
    Tokenize `text` and return the lemmatized tokens that are worth scoring.

    A token is kept when it is purely alphabetic and its lowercase form is not an English
    stop word. Kept tokens are lemmatized with their original casing.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.isalpha() and token.lower() not in stop_words:
            kept_tokens.append(lemmatizer.lemmatize(token))
    return kept_tokens

def analyze_sentiment(words, sentiwordnet):
    """
    Average sentiment of `words` according to a SentiWordNet-style lookup table.

    Args:
        words: iterable of str, the words to score
        sentiwordnet: dict mapping a word to {'PosScore': float, 'NegScore': float}
    Returns:
        mean PosScore minus mean NegScore over the words found in `sentiwordnet`,
        clipped to [-1, 1]; 0 when none of the words is found.
    Side effects:
        prints one line per word found in the lookup table.
    """
    matched = [(word, sentiwordnet[word]) for word in words if word in sentiwordnet]
    for word, entry in matched:
        print(f"Word: {word}, PosScore: {entry['PosScore']}, NegScore: {entry['NegScore']}")

    if not matched:
        return 0  # Neutral if no words found in SentiWordNet

    count = len(matched)
    mean_positive = sum(entry['PosScore'] for _, entry in matched) / count
    mean_negative = sum(entry['NegScore'] for _, entry in matched) / count
    return np.clip(mean_positive - mean_negative, -1, 1)


# Input text (read interactively, so the result depends on what is typed in)
text = input("Enter the text for sentiment analysis: ")
# Despite its name, preprocessed_text is the list of tokens returned by preprocess
preprocessed_text = preprocess(text)

# Analyze sentiment
sentiment_score = analyze_sentiment(preprocessed_text, sentiwordnet_dict)
print(f"Sentiment score: {sentiment_score}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[('able', {'PosScore': 0.125, 'NegScore': 0.0}), ('unable', {'PosScore': 0.125, 'NegScore': 0.25}), ('dissilient', {'PosScore': 0.25, 'NegScore': 0.0}), ('parturient', {'PosScore': 0.25, 'NegScore': 0.0}), ('uncut', {'PosScore': 0.375, 'NegScore': 0.25}), ('absolute', {'PosScore': 0.0, 'NegScore': 0.25}), ('direct', {'PosScore': 0.125, 'NegScore': 0.0}), ('unquestioning', {'PosScore': 0.25, 'NegScore': 0.625}), ('infinite', {'PosScore': 0.125, 'NegScore': 0.5}), ('living', {'PosScore': 0.0, 'NegScore': 0.125})]
Enter the text for sentiment analysis:    #copyright  Drought  2007 Schools Wikipedia Selection. Related subjects: Climate and the Weather     Fields outside Benambra, Victoria suffering from drought conditions    Fields outside Benambra, Victoria suffering from drought conditions     A drought is an extended period of months or years when a region notes    a deficiency in its water supply. Generally, this occurs when a region    receives consistently below average precipitation

In [None]:
# File names of the articles to score; names that are not present in the extracted
# archive are filtered out when the list of paths is built further down.
articles_to_analyze = [
    '1755_Lisbon_earthquake.txt',
    '1896_Summer_Olympics.txt',
    '1997_Pacific_hurricane_season.txt',
    'Actinium.txt',
    'Barracuda.txt',
    'Basketball.txt',
    'Bath_School_disaster.txt',
    'Chicago.txt',
    'Chocolate.txt',
    'Diamond.txt',
    'Dice.txt',
    'Drinking_water.txt',
    'Duchenne_muscular_dystrophy.txt',
    'Geography_of_Ireland.txt',
    'George_S_Richardson_engineer.txt',
    'Giraffe.txt',
    'Gunpowder.txt',
    'Ordinal_number.txt',
    'Osama_bin_Laden.txt',
    'Palm_oil.txt',
    'Peace.txt',
    'Pellagra.txt',
    'Phishing.txt',
    'Plant.txt',
    'Plato.txt',
    'Pneumonia.txt',
    'Poison_gas_in_World_War_I.txt',
    'Politics.txt',
    'Pollution.txt',
    'Pompeii.txt',
    'Recycling.txt',
    'Red_Kite.txt',
    'Rice.txt',
    'Rio_de_Janeiro.txt',
    'Robert_K_Beck.txt',
    'Romeo_and_Juliet.txt',
    'Rugby_World_Cup.txt',
    'Rwandan_Genocide.txt',
    'Salt.txt',
    'Sand.txt',
    'Santa_Claus.txt',
    'Scooby-Doo.txt',
    'Seed.txt',
    'Sequoia.txt'
]



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Mask user mentions and links before the text is handed to the model
def preprocess(text):
    """
    Return `text` with mentions and links replaced by placeholders.

    The text is split on single spaces; a token starting with '@' (and longer than one
    character) becomes '@user', a token starting with 'http' becomes 'http'. The tokens are
    joined back with single spaces.
    """
    def _mask(token):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Split a text into pieces of bounded word count
def chunk_text(text, max_length):
    """Yield consecutive pieces of `text`, each made of at most `max_length` whitespace-separated words joined by single spaces."""
    tokens = text.split()
    yield from (
        ' '.join(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    )

# Shift a model score according to the balance of positive and negative lexicon words
def adjust_score_with_lexicon(text, score, positive_words, negative_words, max_adjustment=0.5):
    """
    Nudge `score` towards the sentiment suggested by word lists.

    Args:
        text: str, split on whitespace; each occurrence of a word is counted
        score: sentiment score to adjust
        positive_words: container of words counted as positive
        negative_words: container of words counted as negative
        max_adjustment: largest absolute shift applied to the score
    Returns:
        `score` unchanged when its absolute value is not at least 0.2; otherwise the score
        shifted by log(|difference| + 1) / log(max_adjustment + 1), signed like
        difference = (#positive - #negative) and limited to +/- max_adjustment, with the
        result clipped to [-1, 1]. With the default max_adjustment the shift is already at
        its limit for any non-zero difference (log(2) / log(1.5) > 0.5).
    """
    tokens = text.split()
    positive_count = sum(token in positive_words for token in tokens)
    negative_count = sum(token in negative_words for token in tokens)

    # Scores that are not clearly polarised are returned untouched.
    if not abs(score) >= 0.2:
        return score

    difference = positive_count - negative_count
    magnitude = np.log(abs(difference) + 1) / np.log(max_adjustment + 1)
    adjustment = np.clip(magnitude * np.sign(difference), -max_adjustment, max_adjustment)
    return np.clip(score + adjustment, -1, 1)

In [None]:
import os
import zipfile

zip_file_path = '/content/plaintext_articles.zip'
extract_dir = '/content/extracted_articles'
# Folder inside the extraction directory that holds the article files.
articles_dir = os.path.join(extract_dir, 'plaintext_articles')


def _read_word_set(path):
    """Return the set of lines of the word-list file at `path`; the file is closed afterwards."""
    with open(path) as word_file:
        return set(word_file.read().splitlines())


# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# After extracting, get the list of all files in the directory
extracted_files = os.listdir(extract_dir)
print("Extracted files:", extracted_files)
extracted_files = os.listdir(articles_dir)
print("Extracted files:", extracted_files)
# Ensure the files to analyze are in the extracted files list
available_files = set(extracted_files)
text_files = [os.path.join(articles_dir, file) for file in articles_to_analyze if file in available_files]
print(text_files)

# Everything below up to the loop is independent of the article being scored,
# so it is set up once instead of once per file.

# Define the task and model
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Load tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Set the max_length for tokenization
max_length = 512

# Download and load the label mapping
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]
# The score below is computed as final_scores[2] - final_scores[0], which is only meaningful
# for the three classes [negative, neutral, positive].
if len(labels) != 3:
    raise ValueError("The number of sentiment labels is not equal to 3, and the scoring system needs to be adjusted.")

# Load positive and negative words lists (assuming you have them as text files)
positive_words = _read_word_set('/content/Positive words.txt')
negative_words = _read_word_set('/content/Negative words.txt')

for file_path in text_files:
    # Articles are read as UTF-8, like in the other sections of the notebook.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    cleaned_text = preprocess(text)
    final_scores = np.zeros(len(labels))
    for chunk in chunk_text(cleaned_text, max_length):
        # NOTE(review): a chunk holds up to max_length *words*, while truncation caps the encoding
        # at max_length *tokens*; if a chunk tokenizes to more tokens, its tail is not scored.
        encoded_input = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=max_length)
        output = model(**encoded_input)
        scores = softmax(output[0][0].detach().numpy())
        final_scores += scores

    # Turn the summed per-chunk probabilities back into a distribution over the labels.
    final_scores /= final_scores.sum()

    sentiment_score = final_scores[2] - final_scores[0]  # Positive score minus negative score
    sentiment_score = adjust_score_with_lexicon(cleaned_text, sentiment_score, positive_words, negative_words)

    print(f"File: {os.path.basename(file_path)} - Sentiment score: {sentiment_score:.2f}")

Extracted files: ['plaintext_articles']
Extracted files: ['Palace_of_Westminster.txt', 'Romulus_Augustus.txt', 'The_Holocaust.txt', 'Wikimedia_Foundation.txt', 'Mickey_Mantle.txt', 'Suriname.txt', 'South_America.txt', 'Foie_gras.txt', 'Spacecraft_propulsion.txt', 'Augustan_literature.txt', 'Cubism.txt', 'Wikipedia_Text_of_the_GNU_Free_Documentation_License.txt', 'Northern_Mariana_Islands.txt', 'Roan_Antelope.txt', 'Trade_union.txt', 'Cologne.txt', 'Isambard_Kingdom_Brunel.txt', '47_Ursae_Majoris_b.txt', 'Gliese_876_b.txt', 'Tiktaalik.txt', 'Rubik%27s_Cube.txt', 'Invasion.txt', 'Cape_Lion.txt', 'Economy_of_Africa.txt', 'England.txt', 'Mark_Antony.txt', 'Italian_War_of_1521.txt', 'Spider.txt', 'Dada.txt', 'Ipswich.txt', 'Lisbon.txt', 'Stephen_of_England.txt', 'Ghost_Dance.txt', 'Comoros.txt', 'Crab_Nebula.txt', 'Zinc.txt', 'Santorini.txt', 'United_States.txt', 'Strait_of_Malacca.txt', 'Hugo_Wolf.txt', 'Lovage.txt', 'Goleta%2C_California.txt', 'James_I_of_England.txt', 'Okapi.txt', 'Medie

# 5. Pattern

In [None]:
# Titles of the articles to score (without the ".txt" extension, which is appended when the files are opened).
file_names = ['1755_Lisbon_earthquake','1896_Summer_Olympics','1997_Pacific_hurricane_season','Actinium','Barracuda','Basketball','Bath_School_disaster','Chicago','Chocolate','Diamond','Dice','Drinking_water','Duchenne_muscular_dystrophy','Geography_of_Ireland','Giraffe','Gunpowder','Osama_bin_Laden','Palm_oil','Peace','Pellagra','Phishing','Plant','Plato','Pneumonia','Poison_gas_in_World_War_I','Politics','Pollution','Red_Kite','Rice','Rio_de_Janeiro','Romeo_and_Juliet','Rugby_World_Cup','Rwandan_Genocide','Santa_Claus','Scooby-Doo']
len(file_names)

35

### MODEL DESCRIPTION
Pattern is a multipurpose library that is capable of handling NLP operations, data mining, machine learning etc. It also contains sentiment analysis functionality which is suitable for our task.
The `sentiment` function of the `pattern.text.en` module calculates the sentiment of a given text. It takes a sentence as input, which can be a string, Synset, word or document, and returns a (polarity, subjectivity) tuple with polarity between -1.0 and +1.0 and subjectivity between 0.0 and 1.0. Polarity describes the emotional leaning of the text (negative to positive), while subjectivity describes how opinionated, as opposed to factual, the text is.

In our usage, the input is a string containing the entire article loaded from the Wikispeedia dataset. In this case, it first tokenizes the text into words (punctuation, spaces and abbreviations are handled at this stage), then lowercases each word because the sentiment lookup is case-insensitive. Next, it calculates the sentiment of each word by consulting the predefined sentiment [dictionary](https://github.com/clips/pattern/blob/master/pattern/text/en/en-sentiment.xml) (modifiers and negations are also taken into account at this stage). Finally, it returns the average over all the words as the sentiment of the text.

Pattern is a classic, well-known, non-commercial library for sentiment analysis; the project has existed for over 10 years and has 8.6k stars on GitHub. It provides detailed results (polarity and subjectivity) for many mainstream languages, and its API is fast and easy to use, with other NLP preprocessing tools built in. More details can be found in the [official documentation](https://digiasset.org/html/pattern.html) and the [repository](https://github.com/clips/pattern).




### METHOD && RESULTS
Each selected article is loaded and fed into the `sentiment` function; the polarity is then printed. The results are shown below.

In [None]:
from pattern.text.en import sentiment

# Score every selected article with Pattern and print its polarity.
for file_name in file_names:
    article_path = 'data/plaintext_articles/' + file_name + '.txt'
    with open(article_path, 'r', encoding='utf-8') as file:
        data = file.read()
    # sentiment() yields a (polarity, subjectivity) pair; only the polarity is reported.
    polarity, subjectivity = sentiment(data)
    print(f"{file_name}: {polarity}")

1755_Lisbon_earthquake: 0.0816923282902664
1896_Summer_Olympics: 0.11173071331653418
1997_Pacific_hurricane_season: 0.051366249491249474
Actinium: 0.03542682926829269
Barracuda: 0.1132213321465658
Basketball: 0.08600218021995364
Bath_School_disaster: 0.010679336219336222
Chicago: 0.10564842500695483
Chocolate: 0.07674311830989919
Diamond: 0.12359236785162714
Dice: -0.0001423413188119043
Drinking_water: 0.12155415214866433
Duchenne_muscular_dystrophy: 0.051342562953478464
Geography_of_Ireland: 0.06101678376268537
Giraffe: 0.04892030793508625
Gunpowder: 0.01667841269841271
Osama_bin_Laden: 0.04482218734525007
Palm_oil: 0.10377811870669014
Peace: 0.07109719189365207
Pellagra: 0.013132859204287773
Phishing: 0.03130031080031078
Plant: 0.09383068133068131
Plato: 0.16051446416831025
Pneumonia: 0.05341062158293232
Poison_gas_in_World_War_I: 0.08464027042373903
Politics: 0.10395405509821987
Pollution: 0.08715013543960917
Red_Kite: 0.035212025919573085
Rice: 0.08478642004761416
Rio_de_Janeiro: 0

Compared with the human-labeled results (a ternary value in {-1, 0, 1}), the MSE is around **0.7**.

### DISCUSSION
The performance of this model is generally poor compared to the other models we tested. A possible reason is that it uses a fixed sentiment dictionary, which may be biased across different domains. We observe that the model tends to give a very small value in [0, 0.1] regardless of the article; a reason could be that Wikipedia articles are designed to be neutral and objective: they generally look at problems dialectically and rarely express strong emotions. However, within the scope of this project, we care more about the sentiment associated with the subject of the article (the word itself) than about the way the article is written. The model weighs all the words on the page evenly and may thus produce results that are not what we expect.

This is not the optimal model for our task; an ML-based approach is generally more flexible and powerful in this case.