In [None]:
# Implementation of the dataset speech-b.tsv as an enrichment of the dataset speech-a.tsv, containing propaganda detection information for various speeches.

In [1]:
# 1
# Addition of informations about the speech author.

# Importing the necessary libraries
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup

# Definition of functions
def extract_info(text, name, surname):
    
    info = {
        "Name": name,
        "Surname": surname,
        "Date of birth": None,
        "Place of birth": None,
        "Date of death": None,
        "Place of death": None,
        "Nationality": None
    }
    # text extraction with Regex
    match = re.search(r"born\s([^)]*)\)", text)
    if match:
        # date and place of birth patterns
        birth_match = match.group(1).split("—")[0]
        birth_list = birth_match.split(", ")
        
        info["Date of birth"] = ", ".join(birth_list[:2]) if len(birth_list) > 1 else birth_list[0]
        info["Place of birth"] = ", ".join(birth_list[2:4]) if len(birth_list) > 3 else birth_list[2] if len(birth_list) > 2 else None
        info["Nationality"] = birth_list[-1] if birth_list else None

        # date and place of death patterns
        d_match = re.search(r"died ([A-Za-z]+ \d{1,2}, \d{4})(?:, ([A-Za-z\s]+))?", match.group(1))
        if d_match:
            info["Date of death"] = d_match.group(1)
            info["Place of death"] = d_match.group(2) if d_match.group(2) else None
    
    return info

def get_author_info(name, surname):
    
    url = f"https://www.britannica.com/biography/{name}-{surname}"
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
       
        # status_code=200 means connection was successful 
        if response.status_code != 200:
            return {"Error": "Page not found", "Name": name, "Surname": surname}

        # response analysis with Beautiful Soup
        soup = BeautifulSoup(response.text, 'html.parser')
        intro_section = soup.find('p', class_='topic-paragraph')
        if not intro_section:
            return {"Error": "Introduction text not found", "Name": name, "Surname": surname}
        
        return extract_info(intro_section.get_text(), name, surname)
    except requests.exceptions.RequestException:
        return {"Error": "Request failed", "Name": name, "Surname": surname}

# Dataset loading
df = pd.read_csv("speech-a.tsv", sep="\t", header=None)

# Author's info saving
authors = {
    "Barack": "Obama",
    "Winston": "Churchill",
    "Donald": "Trump",
    "Joseph": "Goebbels"
}

# Enrichment author's info
author_info_list = [get_author_info(name, surname) for name, surname in authors.items()]

# Info saving
dfb = pd.DataFrame(author_info_list)
display(dfb)

Unnamed: 0,Name,Surname,Date of birth,Place of birth,Date of death,Place of death,Nationality
0,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.
1,Winston,Churchill,"November 30, 1874","Blenheim Palace, Oxfordshire","January 24, 1965",London,England
2,Donald,Trump,"June 14, 1946","New York, New York",,,U.S.
3,Joseph,Goebbels,"October 29, 1897","Rheydt, Germany","May 1, 1945",Berlin,Germany


In [2]:
# 2
# Addition of speech's metadata.

# Importing the necessary libraries
import spacy

# Spacy's NLP model loading
nlp = spacy.load("en_core_web_trf")

# Definition of functions
def clean_occasion(text):
    
    # time pattern definition 
    time_pattern = r"\b(\d{1,2}:\d{2}\s*[apAP][mM]|\d{1,2}\s*[apAP][mM])\b"
    
    # replaces time_pattern with "" in text and removes spaces 
    # at the beginning and at the end with strip
    text = re.sub(time_pattern, "", text).strip() 

    # removes some irrilevant occurances
    irrelevant_phrases = ["less than", "more than", "over", "under", "is very"]
    for phrase in irrelevant_phrases:
        if phrase in text:
            return ""  

    # removes "-" at the beginning and at the end
    text = re.sub(r"^[–\s]+|[–\s]+$", "", text)

    # cannot return a single word
    return text if len(text.split()) > 1 else ""  

def extract_speech_info(text):
    
    # speech's metadata 
    info = {
        "Date": None,
        "Place": None,
        "Occasion": None,
        "Speech": None
    }

    # text cleaning with nlp model: doc creates a sequence of token 
    text = text.strip()
    doc = nlp(text)

    # finds named entities and verifies if its label is date
    for ent in doc.ents:
        if ent.label_ == "DATE":
            # saves the named entity's textual representation in dict
            info["Date"] = ent.text
            break  

    # saves the named entity's textual representation in dict if its label is GPE
    place_candidates = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    if len(place_candidates) >= 2:
        info["Place"] = f"{place_candidates[0]}, {place_candidates[1]}"  
    elif place_candidates:
        info["Place"] = place_candidates[0]  

    # finds named entities and verifies if its label isn't date or GPE and not a single word
    occasion_candidates = [
        ent.text for ent in doc.ents 
        if ent.label_ not in ["DATE", "GPE"] and len(ent.text.split()) > 1  
    ]

    # occasion cleaning
    if occasion_candidates:
        occasion_text = clean_occasion(occasion_candidates[0])
    else:
        occasion_text = ""

    # if occasion isn't found and there's a date
    # identifies the text before date index and creates a new doc
    if not occasion_text and info["Date"]:
        date_index = text.find(info["Date"])
        text_before_date = text[:date_index].strip()
        doc_before_date = nlp(text_before_date)
        
        # divides the text into sentences and takes the no-short ones
        alternative_candidates = [
            sent.text.strip() for sent in doc_before_date.sents
            if len(sent.text.split()) > 1  
        ]

        # cleans the last found sentence before the date
        if alternative_candidates:
            occasion_text = clean_occasion(alternative_candidates[-1])  

    info["Occasion"] = occasion_text if occasion_text else None

    # speech extraction (it starts after date)
    if info["Date"]:
        speech_start = text.find(info["Date"]) + len(info["Date"])
        speech_text = text[speech_start:].strip()

        # speech cleaning from occasion or time
        if info["Occasion"]:
            speech_text = speech_text.replace(info["Occasion"], "").strip()
        speech_text = re.sub(r"\b(\d{1,2}:\d{2}([ap]m)?)\b", "", speech_text).strip()  

        # remove - at the beginning or at the end
        speech_text = re.sub(r"^[–\s]+|[–\s]+$", "", speech_text)

        info["Speech"] = speech_text

    return info

# Uses map() to call extract_speech_info() on the first 20 rows for CPU limitation
data_extracted = list(map(extract_speech_info, df.iloc[:20,2]))
df_extracted = pd.DataFrame(data_extracted)
df_extracted.insert(0, "Author", df.iloc[:20, 0].values)

display(df_extracted.head())

Unnamed: 0,Author,Date,Place,Occasion,Speech
0,Obama,20th January 2009,"America, America",Obama Inaugural Address,My fellow citizens: I stand here today humble...
1,Obama,"November 4, 2008","Grant Park, Illinois",Election Night,If there is anyone out there who still doubts ...
2,Obama,"November 3, 2008","Manassas, Prince William County",Night Before the Election,What a scene. What a crowd. Thank you for Virg...
3,Obama,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,To Chairman Dean and my great friend Dick Durb...
4,Obama,"June 3, 2008","St. Paul, Minnesota",Primary Night,"Tonight, after fifty-four hard-fought contests..."


In [3]:
# 3
# Addition of features text-based.

# Importing the necessary libraries
import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from spacy.lang.en.stop_words import STOP_WORDS
from transformers import RobertaTokenizer
from transformers import pipeline
from collections import Counter

# Spacy NLP model loading optimized
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])

# Models loading
sentiment_analyzer = SentimentIntensityAnalyzer()
emotion_analyzer = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base")
tokenizer = RobertaTokenizer.from_pretrained("distilroberta-base")

# Definition of functions

# Split text into smaller parts for Transformer model limitations
def split_text(text, max_tokens=512):

    # text is converted into a list of tokens
    tokens = tokenizer.encode(text, add_special_tokens=False)
    # creating a token collection with a maximum size of 512 
    for i in range(0, len(tokens), max_tokens):
        # tokens are decoded to get back the original text
        yield tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True)

def analyze_text_features(text):

    # all lowercase
    text = str(text).lower()
    features = {}
    # uses textstat library to get a text readability grade
    features["Readability"] = textstat.flesch_kincaid_grade(text)
    
    sentiment_scores, emotions = [], []
    for segment in split_text(text):
        # polarity_scores provides a float indicating the strength of the text's sentiment
        # compound is an aggregate score (between pos and neg)
        sentiment_scores.append(sentiment_analyzer.polarity_scores(segment)["compound"])
        try:
            # analyzes the first group to avoid errors and returns the label indicating the emotion
            emotions.append(emotion_analyzer(segment[:512])[0]["label"])
        except:
            continue
    # calculates an average of the various scores obtained
    features["Sentiment"] = sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0
    features["Emotion"] = max(set(emotions), key=emotions.count) if emotions else "neutral"

    # nlp model identifies the most frequent words excluding stop words and not alphabetic words
    doc = nlp(text)
    words = (token.text for token in doc if token.text not in STOP_WORDS and token.is_alpha)
    features["Frequent Words"] = Counter(words).most_common(10)
    
    return features

# Uses map() to call analyze_text_features on Speech column
df_features = list(map(analyze_text_features, df_extracted["Speech"]))
df_features = pd.DataFrame(df_features)
df_final = pd.concat([df_extracted, df_features], axis=1)

display(df_final.head())

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (2808 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,Author,Date,Place,Occasion,Speech,Readability,Sentiment,Emotion,Frequent Words
0,Obama,20th January 2009,"America, America",Obama Inaugural Address,My fellow citizens: I stand here today humble...,10.7,0.641,neutral,"[(nation, 12), (new, 11), (america, 10), (peop..."
1,Obama,"November 4, 2008","Grant Park, Illinois",Election Night,If there is anyone out there who still doubts ...,10.4,0.97854,neutral,"[(america, 14), (tonight, 13), (people, 12), (..."
2,Obama,"November 3, 2008","Manassas, Prince William County",Night Before the Election,What a scene. What a crowd. Thank you for Virg...,6.0,0.1242,neutral,"[(change, 14), (fired, 14), (ready, 13), (virg..."
3,Obama,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,To Chairman Dean and my great friend Dick Durb...,10.1,0.914136,neutral,"[(promise, 31), (america, 26), (mccain, 21), (..."
4,Obama,"June 3, 2008","St. Paul, Minnesota",Primary Night,"Tonight, after fifty-four hard-fought contests...",12.1,0.673517,joy,"[(change, 16), (time, 14), (new, 10), (country..."


In [8]:
# 4
# Adding more text features 

# Importing the necessary libraries
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer
from keybert import KeyBERT
import nltk

# Download of punkt module if not present 
nltk.download('punkt')

# Models loading 
keybert_model = KeyBERT()
vectorizer = CountVectorizer(stop_words="english", max_features=5000)
lda = LatentDirichletAllocation(n_components=5, random_state=42)

# Definition of functions 

# sentences=3 takes only the first 3 most representative sentences
def generate_summary(text, sentences=3):

    # divides the text into sentences, creating an object with its structure inside 
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    # Summarizer() loading
    summarizer = LsaSummarizer()
    # generates a summary of the most representative sentences, joining the selected ones
    return " ".join(str(sentence) for sentence in summarizer(parser.document, sentences))

def extract_keywords_keybert(text, num_keywords=5):

    # BERT model identifies the text's keywords, considering single or pairs of words, excluding stopwords
    keywords = keybert_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english')
    # takes and joins the first 5 keywords, excluding associated scores (kw is a tuple, where kw[1] is score)
    return ", ".join(kw[0] for kw in keywords[:num_keywords])

# Extracts keywords for each topic
def get_topic_words(topic, n_words=3):

    # returns topics with the highest weight (last 3 because the weight order is ascending)
    # reason why the order is then reversed [::-1]
    return ", ".join(words[i] for i in topic.argsort()[-n_words:][::-1])

def get_total_topics(probs, threshold=0.1):

    # selects topics with probability greater than 0.1
    topics = [idx for idx, prob in enumerate(probs) if prob > threshold]

    # returns the total of the most probable topics and the associated keywords, using idx in topic_words dict 
    return f"{len(topics)}:" + ", ".join(topic_words[idx] for idx in topics)

# Uses map() to call functions on Speech column
df_final["Abstract"] = list(map(generate_summary, df_final["Speech"]))
df_final["Keywords"] = list(map(extract_keywords_keybert, df_final["Speech"]))

# Speechs vectorization for working with LDA (it doesn't recognize raw text)
X = vectorizer.fit_transform(df_final["Speech"])
# Identifies different topics 
lda.fit(X)

# Calculates topics probability identified within the speech 
topic_probs = lda.transform(X)

# df has row=speechs and column=topics, a couple is the probability 
topic_probs_df = pd.DataFrame(topic_probs)

# Extract keywords from the text
words = vectorizer.get_feature_names_out()

# Creates a dict having key=topic index and value=most significant words associated 
# lda.components_ is a matrix, with row=topics and column=words associated (as numeric weighted values)
# get_topic_words() performs a numeric to string conversion on the words
topic_words = {idx: get_topic_words(topic) for idx, topic in enumerate(lda.components_)}

# applies get_topic_names() on every row of the df (on probabilities)
df_final["Topics"] = topic_probs_df.apply(lambda row: get_total_topics(row, threshold=0.1), axis=1)

display(df_final.head())

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/davidebottiglieri98/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,Author,Date,Place,Occasion,Speech,Readability,Sentiment,Emotion,Frequent Words,Abstract,Keywords,Topics
0,Obama,20th January 2009,"America, America",Obama Inaugural Address,My fellow citizens: I stand here today humble...,10.7,0.641,neutral,"[(nation, 12), (new, 11), (america, 10), (peop...","Our economy is badly weakened, a consequence o...","presidential oath, oath taken, oath words, oat...","1:america, change, people"
1,Obama,"November 4, 2008","Grant Park, Illinois",Election Night,If there is anyone out there who still doubts ...,10.4,0.97854,neutral,"[(america, 14), (tonight, 13), (people, 12), (...","To my campaign manager David Plouffe, my chief...","doubts america, democracy tonight, wondered am...","1:america, change, people"
2,Obama,"November 3, 2008","Manassas, Prince William County",Night Before the Election,What a scene. What a crowd. Thank you for Virg...,6.0,0.1242,neutral,"[(change, 14), (fired, 14), (ready, 13), (virg...","And that is what this campaign has been about,...","virginia rally, thank virginia, fulfill virgin...","1:change, fired, ready"
3,Obama,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,To Chairman Dean and my great friend Dick Durb...,10.1,0.914136,neutral,"[(promise, 31), (america, 26), (mccain, 21), (...","Because next week, in Minnesota, the same part...","presidency united, hillary rodham, rodham clin...","1:america, change, people"
4,Obama,"June 3, 2008","St. Paul, Minnesota",Primary Night,"Tonight, after fifty-four hard-fought contests...",12.1,0.673517,joy,"[(change, 16), (time, 14), (new, 10), (country...",And for all those who dream of that future ton...,"fellow candidates, campaign presidency, presid...","1:america, change, people"


In [9]:
# 5
# Propaganda classification

# Importing the necessary libraries
import numpy as np

# Defines propaganda indicators 
propaganda_flags = {
    'emotional_words': ['enemy', 'evil', 'destroy', 'traitor', 'patriot', 
                       'threat', 'danger', 'crisis', 'attack'],
    'persuasive_phrases': ['we must', 'act now', 'urgent action', 
                          'our very survival', 'fight for'],
    'fallacies': ['everyone knows', 'true patriots believe', 
                 'no decent person would'],
    'extreme_terms': ['worst ever', 'absolute disaster', 
                     'complete failure', 'greatest in history']
}

# Definition of functions

def is_propaganda(text):
    
    text = str(text).lower()
    
    # returns the sum of true (1) and false (0) values
    # if at least one of the propaganda_flags values is in the text: true; if not present: false
    matches = sum(any(phrase in text for phrase in phrases) for phrases in propaganda_flags.values())
      
    # only takes words consisting at least 4 characters
    words = re.findall(r'\w{4,}', text)
    # counts its frequencies
    word_counts = Counter(words)
    # only takes those that appear more than 3 times among the 5 most frequent 
    repeats = sum(1 for _, count in word_counts.most_common(5) if count >= 3)
    
    # creates sentences from text using (.!?) as separators and filtering out blank spaces
    sentences = [s.strip() for s in re.split(r'[.!?]', text) if s.strip()]
    # avoid warnings (senteces=0)
    length_variation = 0
    
    if sentences:
        s_lengths = [len(s.split()) for s in sentences]
        # calculates the standard deviation on the sentences length and normalizes it with the mean
        # a, high value means high variation
        length_variation = np.std(s_lengths) / np.mean(s_lengths)
    
    # normalization of values to avoid the predominance of one of them
    # different weights are assigned to the various scores
    score = (
        # greater weight of the match as it's a strong indicator
        0.4 * min(matches / 2, 1) +  
        # medium weight because the repetition isn't necessarily manipulative
        0.2 * min(repeats / 2, 1) +   
        # low weight because the variation is not a reliable parameter
        0.1 * min(length_variation, 1)     
    )
    
    return 1 if score >= 0.5 else 0

# Uses map() to call is_propaganda on Speech column
df_final['propaganda_target'] = list(map(is_propaganda, df_final['Speech']))

display(df_final.head())

Unnamed: 0,Author,Date,Place,Occasion,Speech,Readability,Sentiment,Emotion,Frequent Words,Abstract,Keywords,Topics,propaganda_target
0,Obama,20th January 2009,"America, America",Obama Inaugural Address,My fellow citizens: I stand here today humble...,10.7,0.641,neutral,"[(nation, 12), (new, 11), (america, 10), (peop...","Our economy is badly weakened, a consequence o...","presidential oath, oath taken, oath words, oat...","1:america, change, people",1
1,Obama,"November 4, 2008","Grant Park, Illinois",Election Night,If there is anyone out there who still doubts ...,10.4,0.97854,neutral,"[(america, 14), (tonight, 13), (people, 12), (...","To my campaign manager David Plouffe, my chief...","doubts america, democracy tonight, wondered am...","1:america, change, people",0
2,Obama,"November 3, 2008","Manassas, Prince William County",Night Before the Election,What a scene. What a crowd. Thank you for Virg...,6.0,0.1242,neutral,"[(change, 14), (fired, 14), (ready, 13), (virg...","And that is what this campaign has been about,...","virginia rally, thank virginia, fulfill virgin...","1:change, fired, ready",0
3,Obama,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,To Chairman Dean and my great friend Dick Durb...,10.1,0.914136,neutral,"[(promise, 31), (america, 26), (mccain, 21), (...","Because next week, in Minnesota, the same part...","presidency united, hillary rodham, rodham clin...","1:america, change, people",1
4,Obama,"June 3, 2008","St. Paul, Minnesota",Primary Night,"Tonight, after fifty-four hard-fought contests...",12.1,0.673517,joy,"[(change, 16), (time, 14), (new, 10), (country...",And for all those who dream of that future ton...,"fellow candidates, campaign presidency, presid...","1:america, change, people",1


In [10]:
# 6
# Propaganda classification

# Definition of functions
def get_propaganda_segments(text, is_propaganda):

    # isinstance is true if text is a str 
    if not is_propaganda or not isinstance(text, str):
        return None

    text_lower = text.lower()
    # takes every words of the sentence
    words = text.split()
    # uses set to avoid duplicates
    segments = set()

    # reads all propaganda_flags and searches for matches with the sentences of the text
    for phrase_list in propaganda_flags.values():
        for phrase in phrase_list:
            
            # finditer match a pattern (re.escape avoid special chars) with a text
            for match in re.finditer(re.escape(phrase), text_lower):
                
                # if match it takes the index of the phrase (where it starts)
                idx = match.start()
                
                # splits all words before the matched sentence
                word_pos = len(text_lower[:idx].split())
                
                #consider a block of words: 5 words before and after the match
                start = max(0, word_pos - 5)
                end = min(len(words), word_pos + len(phrase.split()) + 5)
                
                # saves the phrase in segments 
                segments.add(' '.join(words[start:end]))

    # If nothing matched, splits phrases with special chars (.!?)
    if not segments:
        sentences = re.split(r'(?<=[.!?])\s+', text)
        
        if sentences:

            # finds sentences with more than 7 chars
            # each sentence s is splitted in words, then verifies if word>7chars
            # saves sentences with highest score calculates with key func
            suspicious = max(sentences, key=lambda s: sum(len(w) > 7 for w in s.split()))

            # return at least 200 chars, adding "..."
            return [suspicious[:200] + "..."] if suspicious else None

    return list(segments) if segments else None

# Uses map() to call get_propaganda_segments() on Speech column
df_final['propaganda_segments'] = list(map(lambda r: get_propaganda_segments(r[0], r[1]),
    zip(df_final['Speech'], df_final['propaganda_target']))
                                      )
display(df_final.head())

Unnamed: 0,Author,Date,Place,Occasion,Speech,Readability,Sentiment,Emotion,Frequent Words,Abstract,Keywords,Topics,propaganda_target,propaganda_segments
0,Obama,20th January 2009,"America, America",Obama Inaugural Address,My fellow citizens: I stand here today humble...,10.7,0.641,neutral,"[(nation, 12), (new, 11), (america, 10), (peop...","Our economy is badly weakened, a consequence o...","presidential oath, oath taken, oath words, oat...","1:america, change, people",1,"[These are the indicators of crisis, subject t..."
1,Obama,"November 4, 2008","Grant Park, Illinois",Election Night,If there is anyone out there who still doubts ...,10.4,0.97854,neutral,"[(america, 14), (tonight, 13), (people, 12), (...","To my campaign manager David Plouffe, my chief...","doubts america, democracy tonight, wondered am...","1:america, change, people",0,
2,Obama,"November 3, 2008","Manassas, Prince William County",Night Before the Election,What a scene. What a crowd. Thank you for Virg...,6.0,0.1242,neutral,"[(change, 14), (fired, 14), (ready, 13), (virg...","And that is what this campaign has been about,...","virginia rally, thank virginia, fulfill virgin...","1:change, fired, ready",0,
3,Obama,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,To Chairman Dean and my great friend Dick Durb...,10.1,0.914136,neutral,"[(promise, 31), (america, 26), (mccain, 21), (...","Because next week, in Minnesota, the same part...","presidency united, hillary rodham, rodham clin...","1:america, change, people",1,"[9/11, and made clear that we must take out Os..."
4,Obama,"June 3, 2008","St. Paul, Minnesota",Primary Night,"Tonight, after fifty-four hard-fought contests...",12.1,0.673517,joy,"[(change, 16), (time, 14), (new, 10), (country...",And for all those who dream of that future ton...,"fellow candidates, campaign presidency, presid...","1:america, change, people",1,[the world against the common threats of the 2...


In [11]:
# 7
# Narrative scheme identification

# Define narrative scheme patterns
narrative_scheme = {
    'us_vs_them': [
        'enemy', 'threat', 'other side', 'they want', 'destroy us',
        'our country', 'real americans', 'patriots vs'
    ],
    'appeal_to_fear': [
        'danger', 'crisis', 'under attack', 'at risk', 'threat to our',
        'if we don\'t act', 'urgent', 'catastrophe'
    ],
    'glorification': [
        'great again', 'strongest', 'triumph', 'victory', 'best ever',
        'historic', 'unprecedented', 'miracle'
    ],
    'victimhood': [
        'persecuted', 'unfair', 'against us', 'they attack', 'victims',
        'silent majority', 'they laugh at us'
    ],
    'simplification': [
        'simple truth', 'everyone knows', 'only one solution',
        'no alternative', 'this is why we'
    ]
}

# Definition of functions
def detect_narrative_scheme(text):
    
    if not isinstance(text, str):
        return None
    
    text_lower = text.lower()

    # counter loading
    scheme_scores = Counter()

    # check if text contains schemes keywords. The score will be the sum of occurrences
    for scheme, keywords in narrative_scheme.items():
        scheme_scores[scheme] = sum(1 for kw in keywords if kw in text_lower)

    # sorts scores in descending order and takes the first 2 most common
    top_schemes = [scheme for scheme, _ in scheme_scores.most_common(2)]

    # if the first score is at least 2
    if scheme_scores[top_schemes[0]] >= 2:

        # if also the second scheme has at least 2 occurrences, both are reported
        if len(top_schemes) > 1 and scheme_scores[top_schemes[1]] >= 2:
            return f"{top_schemes[0]} + {top_schemes[1]}"
            
        return top_schemes[0]
        
    return "no_scheme_detected"

# Uses map() to call get_propaganda_segments() on Speech column
df_final['narrative_scheme'] = list(map(lambda r: detect_narrative_scheme(r),
    df_final['Speech'])
                                   )
display(df_final.head())

Unnamed: 0,Author,Date,Place,Occasion,Speech,Readability,Sentiment,Emotion,Frequent Words,Abstract,Keywords,Topics,propaganda_target,propaganda_segments,narrative_scheme
0,Obama,20th January 2009,"America, America",Obama Inaugural Address,My fellow citizens: I stand here today humble...,10.7,0.641,neutral,"[(nation, 12), (new, 11), (america, 10), (peop...","Our economy is badly weakened, a consequence o...","presidential oath, oath taken, oath words, oat...","1:america, change, people",1,"[These are the indicators of crisis, subject t...",us_vs_them + appeal_to_fear
1,Obama,"November 4, 2008","Grant Park, Illinois",Election Night,If there is anyone out there who still doubts ...,10.4,0.97854,neutral,"[(america, 14), (tonight, 13), (people, 12), (...","To my campaign manager David Plouffe, my chief...","doubts america, democracy tonight, wondered am...","1:america, change, people",0,,no_scheme_detected
2,Obama,"November 3, 2008","Manassas, Prince William County",Night Before the Election,What a scene. What a crowd. Thank you for Virg...,6.0,0.1242,neutral,"[(change, 14), (fired, 14), (ready, 13), (virg...","And that is what this campaign has been about,...","virginia rally, thank virginia, fulfill virgin...","1:change, fired, ready",0,,no_scheme_detected
3,Obama,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,To Chairman Dean and my great friend Dick Durb...,10.1,0.914136,neutral,"[(promise, 31), (america, 26), (mccain, 21), (...","Because next week, in Minnesota, the same part...","presidency united, hillary rodham, rodham clin...","1:america, change, people",1,"[9/11, and made clear that we must take out Os...",us_vs_them
4,Obama,"June 3, 2008","St. Paul, Minnesota",Primary Night,"Tonight, after fifty-four hard-fought contests...",12.1,0.673517,joy,"[(change, 16), (time, 14), (new, 10), (country...",And for all those who dream of that future ton...,"fellow candidates, campaign presidency, presid...","1:america, change, people",1,[the world against the common threats of the 2...,us_vs_them + glorification


In [12]:
# Saving speech_b.tsv

# Merges df_final with dfb
speech_b = pd.merge(dfb, df_final, left_on='Surname', right_on='Author', how='left')
speech_b.drop(columns=['Author'], inplace=True)
display(speech_b)

# saving new file
speech_b.to_csv('speech_b.tsv', sep='\t', index=False)

Unnamed: 0,Name,Surname,Date of birth,Place of birth,Date of death,Place of death,Nationality,Date,Place,Occasion,...,Readability,Sentiment,Emotion,Frequent Words,Abstract,Keywords,Topics,propaganda_target,propaganda_segments,narrative_scheme
0,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,20th January 2009,"America, America",Obama Inaugural Address,...,10.7,0.641,neutral,"[(nation, 12), (new, 11), (america, 10), (peop...","Our economy is badly weakened, a consequence o...","presidential oath, oath taken, oath words, oat...","1:america, change, people",1.0,"[These are the indicators of crisis, subject t...",us_vs_them + appeal_to_fear
1,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"November 4, 2008","Grant Park, Illinois",Election Night,...,10.4,0.97854,neutral,"[(america, 14), (tonight, 13), (people, 12), (...","To my campaign manager David Plouffe, my chief...","doubts america, democracy tonight, wondered am...","1:america, change, people",0.0,,no_scheme_detected
2,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"November 3, 2008","Manassas, Prince William County",Night Before the Election,...,6.0,0.1242,neutral,"[(change, 14), (fired, 14), (ready, 13), (virg...","And that is what this campaign has been about,...","virginia rally, thank virginia, fulfill virgin...","1:change, fired, ready",0.0,,no_scheme_detected
3,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"August 28, 2008","Denver, Colorado",the Democratic Convention Mile High Stadium,...,10.1,0.914136,neutral,"[(promise, 31), (america, 26), (mccain, 21), (...","Because next week, in Minnesota, the same part...","presidency united, hillary rodham, rodham clin...","1:america, change, people",1.0,"[9/11, and made clear that we must take out Os...",us_vs_them
4,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"June 3, 2008","St. Paul, Minnesota",Primary Night,...,12.1,0.673517,joy,"[(change, 16), (time, 14), (new, 10), (country...",And for all those who dream of that future ton...,"fellow candidates, campaign presidency, presid...","1:america, change, people",1.0,[the world against the common threats of the 2...,us_vs_them + glorification
5,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"May 6, 2008","North Carolina, Raleigh","North Carolina Primary Night Raleigh, NC",...,10.8,0.97485,fear,"[(country, 15), (american, 13), (people, 10), ...",She can't afford four more years of tax breaks...,"changing washington, win democratic, clinton v...","2:health, care, iraq, america, change, people",0.0,,us_vs_them
6,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"April 22, 2008","Evansville, Indiana",Pennsylvania Primary Night,...,9.9,0.20656,neutral,"[(time, 13), (election, 12), (change, 12), (co...",We're here because of the young man I met in Y...,"make election, campaign today, election plan, ...","1:america, change, people",1.0,"[as a tactic, and the threat of terrorism to s...",us_vs_them
7,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"April 14, 2008","Washington, DC",Good afternoon,...,11.3,0.016233,neutral,"[(people, 18), (years, 15), (believe, 10), (jo...",It's a philosophy that says there's no role fo...,"speech faith, speech obama, power faith, bitte...","1:america, change, people",1.0,[the victims of this housing crisis; that we h...,us_vs_them
8,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,"March 18, 2008","Philadelphia, PA",The Race Speech,...,12.2,0.181233,anger,"[(black, 34), (white, 27), (american, 20), (ti...",A lack of economic opportunity among black men...,"1787 document, answer slavery, slavery questio...","1:iraq, american, time",0.0,,appeal_to_fear
9,Barack,Obama,"August 4, 1961","Honolulu, Hawaii",,,U.S.,March 4th,"Texas, Ohio",this morning,...,10.2,0.85422,neutral,"[(know, 13), (world, 10), (believe, 10), (cour...",It's a course that further divides Wall Street...,"won state, nomination election, winning nomina...","2:iraq, american, time, america, change, people",0.0,,us_vs_them


OSError: Cannot save file into a non-existent directory: '/Users/davidebottiglieri/Desktop/Esame-1'