The first pip install is super important, make sure it is installed and you restart the kernel. **Make sure you import in the text files for the articles into your working director.**

In [4]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl


Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy<3.8.0,>=3.7.2 (from en-core-web-trf==3.7.3)
  Downloading spacy-3.7.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.2->en-core-web-trf==3.7.3)
  Downloading thinc-8.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting langcodes<4.0.0,>=3.2.0 (from spacy<3.8.0,>=3.7.2->en-core-web-trf==3.7.3)
  Downloading langcodes-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting curated-transform

Below we begin by importing the text files, we suggest you do sentiment analysis for each topic separately and create a CSV file after each sentiment analysis. Later we will show you how we create the CSV.

In [1]:
# This makes it easy to grab everything titled (blah blah blah)whateveryouwant.txt
import glob
file_list = glob.glob('*Israel.txt')

In [2]:
# Double check that the files imported properly
print(file_list)

['Wall_Street_Journal_Israel.txt', 'NBC_Israel.txt', 'Forbes_Israel.txt', 'MSNBC_Israel.txt', 'USA_Today_Israel.txt', 'CBS_News_Israel.txt', 'Fox_News_Israel.txt', 'NY Post_Israel.txt', 'WP_Israel.txt', 'NYT_Israel.txt', 'Associated_Press_Israel.txt', 'BBC_Israel.txt', 'NPR_Israel.txt', 'CNN_Israel.txt', 'ABC_Israel.txt', 'PBS_Israel.txt']


In [4]:
# Function to load in the articles and read them
def load_article(path):
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


In [5]:
# This is our main thing we are using to split up the text and allow it to be
# used to create our model for sentiment analysis
import spacy
nlp = spacy.load("en_core_web_trf")

def segment_article(text):
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences


In [8]:
def chunk_sentences(sentences, chunk_size=5):
  """A function designed to split up the text into chunks to
  then input into the model"""
  chunks = []
  for i in range(0, len(sentences), chunk_size):
      chunk = " ".join(sentences[i:i+chunk_size])
      chunks.append(chunk)
  return chunks


In [9]:
# Very important, importing from the hugging face library a pre-trained model
# based off of twitter data that better understands context for sentiment
# analysis
from transformers import pipeline

sentiment_model = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    return_all_scores=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.bias       | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 
roberta.pooler.dense.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [11]:
def normalize_sentiment_output(output):
    if isinstance(output, list) and isinstance(output[0], dict):
        return output
    if isinstance(output, dict):
        return [output]
    raise ValueError(f"Unexpected sentiment output: {output}")


In [12]:
def discourse_weight(text):
  """Weighs some text slightly more compared to other words considering
  their impact in the text"""
  text = text.lower()
  if "in conclusion" in text or "overall" in text:
      return 1.5
  if "however" in text or "but" in text:
      return 1.3
  return 1.0


In [13]:
def analyze_chunks(chunks):
  """Analyzes the chunks individually to obtain a score for each chunk"""
  results = []

  for chunk in chunks:
      raw_output = sentiment_model(chunk)
      scores = normalize_sentiment_output(raw_output)

      weight = discourse_weight(chunk)

      weighted_scores = {}
      for s in scores:
          weighted_scores[s["label"]] = s["score"] * weight

      results.append(weighted_scores)

  return results


In [14]:
from collections import defaultdict

def aggregate_sentiment(chunk_results):
  """Function to aggregate all chunk sentiment values to get a total document
     wide understanding of the sentiment"""
  totals = defaultdict(float)

  for chunk in chunk_results:
      for label, score in chunk.items():
          totals[label] += score

  total = sum(totals.values())
  return {k: v / total for k, v in totals.items()}


Below is putting everything together in a for loop to go over each document and obtain a sentiment value for each document. We then append it to a list, and convert it into a dictionary for easy conversion into a CSV file.

In [16]:
positive = []
neutral = []
negative = []

In [17]:
for txt in file_list:
  article = load_article(txt)
  sentences = segment_article(article)
  chunks = chunk_sentences(sentences)
  chunk_results = analyze_chunks(chunks)
  doc_sentiment = aggregate_sentiment(chunk_results)
  positive.append(doc_sentiment.get('positive', 0.0))
  neutral.append(doc_sentiment.get('neutral', 0.0))
  negative.append(doc_sentiment.get('negative', 0.0))

# Checking that we have sentiment scores
print(positive)
print(neutral)
print(negative)

  with torch.cuda.amp.autocast(self._mixed_precision):


[0.0951187061794788, 1.0, 0.26819287797910557, 0.3869727598352495, 0.2803675762588341, 0.3086556493425071, 0.46914932297250295, 1.0, 0.0, 0.07645806273460051, 0.0, 0.11535468799154439, 0.12018378808432753, 0.1893670133569353, 0.6878327470587272, 0.0]
[0.5903005494006713, 0.0, 0.7318071220208945, 0.39635752623241455, 0.6460847987358737, 0.6913443506574928, 0.5308506770274971, 0.0, 0.6800553155410661, 0.4560140558516734, 0.6482989062682624, 0.2242632174064751, 0.5269122496119601, 0.7437079787781695, 0.31216725294127284, 0.8302281786370932]
[0.3145807444198498, 0.0, 0.0, 0.216669713932336, 0.07354762500529216, 0.0, 0.0, 0.0, 0.3199446844589339, 0.46752788141372603, 0.35170109373173764, 0.6603820946019805, 0.3529039623037123, 0.06692500786489528, 0.0, 0.16977182136290686]


In [None]:
dictionary_for_csv = {
    'File Name': file_list,
    'Positive': positive,
    'Neutral': neutral,
    'Negative': negative
}

In [None]:
import pandas as pd

# Converting into a CSV, rename to whatever data you are working with
df = pd.DataFrame(dictionary_for_csv)
df.to_csv('full_israel_sentiment.csv', index = False)