The first pip install is essential: run it, then restart the kernel before continuing. **Make sure the article text files are in your working directory.**

In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl


Below we begin by locating the text files. We suggest running the sentiment analysis for each topic separately and creating a CSV file after each run. Later we will show how we create the CSV.

In [None]:
# Collect every file in the working directory whose name ends in "Israel.txt"
# (i.e. <anything>Israel.txt). Change the pattern to select a different topic.
import glob
# glob returns matches in arbitrary order, so sort them to keep the rows of
# the final CSV in the same order on every run and every machine.
file_list = sorted(glob.glob('*Israel.txt'))

In [None]:
# Sanity check: print the paths matched by the glob pattern. The files are not
# read yet; an empty list means no matching file was found in the working
# directory.
print(file_list)

In [None]:
def load_article(path):
    """Read the article stored at ``path`` and return its full text.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(path, mode="r", encoding="utf-8") as article_file:
        contents = article_file.read()
    return contents


In [None]:
# Load the spaCy "en_core_web_trf" pipeline (the package installed by the pip
# command at the top of this notebook). `nlp` is used below to split each
# article into sentences, which are later grouped into chunks and passed to
# the sentiment model.
import spacy
nlp = spacy.load("en_core_web_trf")

def segment_article(text):
    """Split ``text`` into a list of sentence strings.

    The text is run through the module-level spaCy pipeline ``nlp`` and each
    sentence span from ``doc.sents`` is returned with surrounding whitespace
    removed.

    Spans that are empty after stripping are dropped, so that blank
    "sentences" do not take up slots when the sentences are later grouped
    into fixed-size chunks.
    """
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]


In [None]:
def chunk_sentences(sentences, chunk_size=5):
    """Group consecutive sentences into space-joined chunks for the model.

    Args:
        sentences: Sequence of sentence strings, in document order.
        chunk_size: Maximum number of sentences per chunk (must be >= 1).
            The last chunk may contain fewer sentences.

    Returns:
        A list of strings, each the sentences of one chunk joined by a single
        space. An empty ``sentences`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop the whole
    # article, so reject invalid sizes explicitly.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")
    return [
        " ".join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]


In [None]:
# Pre-trained sentiment classifier from the Hugging Face Hub. It is a RoBERTa
# model trained on Twitter data, chosen because it handles context better for
# sentiment analysis.
from transformers import pipeline

sentiment_model = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    # Return the score of every label, not just the best one. `top_k=None`
    # is the replacement for the deprecated `return_all_scores=True`.
    top_k=None,
    # Tokenizer options: cut any chunk that exceeds the model's input limit
    # instead of failing during inference.
    truncation=True,
    max_length=512,
)


In [None]:
def normalize_sentiment_output(output):
    """Return the model output as a flat list of label/score dicts.

    Accepted shapes:
      * a single dict                     -> wrapped in a one-element list
      * a non-empty list of dicts         -> returned unchanged
      * a list holding exactly one non-empty list of dicts (a batch of one,
        which is how the text-classification pipeline can wrap the result
        for a single input string) -> the inner list is returned

    Raises:
        ValueError: For any other shape, including an empty list.
    """
    if isinstance(output, dict):
        return [output]
    if isinstance(output, list) and output:
        first = output[0]
        if isinstance(first, dict):
            return output
        # Unwrap the outer "batch" list when it contains a single result.
        if (
            len(output) == 1
            and isinstance(first, list)
            and first
            and isinstance(first[0], dict)
        ):
            return first
    raise ValueError(f"Unexpected sentiment output: {output}")


In [None]:
import re

# Whole-word cue patterns. Word boundaries keep "but" from matching inside
# words such as "button" or "attribute", and "overall" from matching
# "overalls".
_SUMMARY_CUES = re.compile(r"\b(?:in\s+conclusion|overall)\b")
_CONTRAST_CUES = re.compile(r"\b(?:however|but)\b")


def discourse_weight(text):
    """Return a weight for ``text`` based on the discourse cues it contains.

    Matching is case-insensitive and on whole words only:
      * 1.5 if the text contains "in conclusion" or "overall"
      * 1.3 otherwise, if it contains "however" or "but"
      * 1.0 if no cue is present
    """
    lowered = text.lower()
    # Summary cues take priority over contrast cues.
    if _SUMMARY_CUES.search(lowered):
        return 1.5
    if _CONTRAST_CUES.search(lowered):
        return 1.3
    return 1.0


In [None]:
def analyze_chunks(chunks):
    """Score every chunk with the sentiment model.

    For each chunk the model output is normalised to a list of label/score
    entries, and every score is multiplied by the chunk's discourse weight.

    Returns:
        A list with one ``{label: weighted_score}`` dict per chunk, in the
        same order as ``chunks``.
    """
    def _score_chunk(chunk_text):
        label_scores = normalize_sentiment_output(sentiment_model(chunk_text))
        multiplier = discourse_weight(chunk_text)
        return {
            entry["label"]: entry["score"] * multiplier
            for entry in label_scores
        }

    return [_score_chunk(chunk_text) for chunk_text in chunks]


In [None]:
from collections import defaultdict

def aggregate_sentiment(chunk_results):
    """Combine per-chunk label scores into one document-level distribution.

    The scores of each label are summed over all chunks, then every sum is
    divided by the grand total, so the returned values add up to 1 (up to
    floating-point rounding).

    Returns:
        A ``{label: share}`` dict; empty when ``chunk_results`` holds no
        scores.

    Raises:
        ZeroDivisionError: If labels are present but all scores sum to 0.
    """
    label_sums = defaultdict(float)
    for per_chunk in chunk_results:
        for label in per_chunk:
            label_sums[label] += per_chunk[label]

    grand_total = sum(label_sums.values())

    shares = {}
    for label, summed in label_sums.items():
        shares[label] = summed / grand_total
    return shares


Below we put everything together in a for loop that goes over each document and obtains its sentiment values. We append each value to a list, then collect the lists into a dictionary for easy conversion into a CSV file.

In [None]:
# One list per sentiment label; each gets one entry per processed article.
positive, neutral, negative = [], [], []

In [None]:
# Start from empty lists every time this cell runs. Otherwise re-running the
# cell would append a second set of scores, and the lists would no longer
# line up one-to-one with file_list when the CSV is built.
positive, neutral, negative = [], [], []

# Full pipeline for each article: read -> split into sentences -> group into
# chunks -> score each chunk -> aggregate to one distribution per document.
for txt in file_list:
    article = load_article(txt)
    sentences = segment_article(article)
    chunks = chunk_sentences(sentences)
    chunk_results = analyze_chunks(chunks)
    doc_sentiment = aggregate_sentiment(chunk_results)
    # A label missing from the aggregated result is recorded as 0.0.
    positive.append(doc_sentiment.get('positive', 0.0))
    neutral.append(doc_sentiment.get('neutral', 0.0))
    negative.append(doc_sentiment.get('negative', 0.0))

# Checking that we have sentiment scores
print(positive)
print(neutral)
print(negative)

In [None]:
# Column name -> column values, in the column order wanted in the CSV.
dictionary_for_csv = dict(
    zip(
        ('File Name', 'Positive', 'Neutral', 'Negative'),
        (file_list, positive, neutral, negative),
    )
)

In [None]:
import pandas as pd

# Build a table from the collected columns and write it to disk without the
# index column. Rename the output file to match the topic you are working with.
df = pd.DataFrame(data=dictionary_for_csv)
df.to_csv(path_or_buf='full_israel_sentiment.csv', index=False)