In [1]:
import skimpy
import pandas as pd
import sqlite3

In [10]:
# Load the full tweet table from the local SQLite database into a DataFrame.
con = sqlite3.connect("propaganda.db")
query = "SELECT * FROM df_tweets_HiQualProp"
try:
    propaganda = pd.read_sql_query(query, con)
finally:
    # Always release the database handle, even if the query itself fails.
    con.close()

# Summary statistics for every column.
skimpy.skim(propaganda)

# add min and max separately
print("\nMinimum values:")
print(propaganda.min(numeric_only=True))

print("\nMaximum values:")
print(propaganda.max(numeric_only=True))

# For each feature below, keep the cleaned text of the rows whose feature
# value is exactly 1, so the extreme examples can be inspected by eye
# (e.g. print(punctuation_1_texts)).
punctuation_1_texts = propaganda.loc[propaganda["punctuation"] == 1, "cleaned_text"]

repetition_1_texts = propaganda.loc[propaganda["repetition"] == 1, "cleaned_text"]

subjectivity_1_texts = propaganda.loc[propaganda["subjectivity"] == 1, "cleaned_text"]

capitalization_1_texts = propaganda.loc[
    propaganda["capitalization"] == 1, "cleaned_text"
]


Minimum values:
id                208.000000
labels              0.000000
author_id          14.000000
toxicity            0.000481
subjectivity        0.000000
profanity           0.000110
punctuation         0.000000
capitalization      0.000000
repetition          0.000000
dtype: float64

Maximum values:
id                1.575986e+18
labels            1.000000e+00
author_id         1.575234e+18
toxicity          9.991755e-01
subjectivity      1.000000e+00
profanity         1.000000e+00
punctuation       1.000000e+00
capitalization    1.000000e+00
repetition        1.000000e+00
dtype: float64
5        Russia sees potential cooperation with Taliban...
14       For weeks , the Biden administration said that...
36       Confirmed ! Bodyguard of Ethiopian president a...
125                         Ye say it to cnn , fake news .
139      Russia had to fight the terrorists that NATO w...
                               ...                        
29433    nice to see you using a slogan in

In [4]:
import string
import re
import math


def punctuation_proportion(text: str) -> float:
    """Compute the share of characters in ``text`` that are punctuation.

    A character counts as punctuation when it appears in
    ``string.punctuation``. Empty (falsy) input yields ``0.0``.
    """
    if text:
        hits = 0
        for ch in text:
            if ch in string.punctuation:
                hits += 1
        return hits / len(text)
    return 0.0


def exclam_proportion(text: str) -> float:
    """Compute the fraction of punctuation characters that are ``!``.

    Only characters found in ``string.punctuation`` are considered. When the
    text contains no punctuation at all the result is ``0.0``.
    """
    total_punct = 0
    exclams = 0
    for ch in text:
        if ch in string.punctuation:
            total_punct += 1
            if ch == "!":
                exclams += 1

    if total_punct == 0:
        return 0.0
    return exclams / total_punct


def capital_letter_proportion(text: str) -> float:
    """Returns the proportion of capital letters in the text.

    The denominator is the total number of characters (not just letters).
    Empty text returns 0.0 instead of dividing by zero, matching
    ``punctuation_proportion``.
    """
    # TODO: perhaps the ratio of capital letters to number of words is a
    # better signal than the per-character ratio.
    if not text:
        return 0.0
    capital_count = sum(1 for char in text if char.isupper())
    return capital_count / len(text)


def count_sentences(text: str) -> int:
    """
    Count the sentences in ``text`` using a punctuation heuristic.

    The text is stripped and then cut at every run of whitespace that
    directly follows ``.``, ``!`` or ``?``; the number of non-blank pieces
    is the sentence count. Text without any such boundary counts as a single
    sentence, and blank text counts as zero.
    """
    stripped = text.strip()
    if stripped == "":
        return 0

    # Lookbehind keeps the closing punctuation attached to its sentence.
    pieces = re.split(r"(?<=[.!?])\s+", stripped)

    # Tally only the pieces that still contain something after stripping.
    total = 0
    for piece in pieces:
        if piece.strip():
            total += 1
    return total


def immaturity_score(text: str) -> dict:
    """
    Returns a dict with an immaturity score and a qualitative label.
    Higher scores indicate more 'immature' or exaggerated
    punctuation/capitalization use.

    The raw score is a weighted sum of: all-caps words, exclamation marks per
    sentence, question marks per sentence, interrobangs ("?!" / "!?"),
    ellipses, parentheses and quote characters. It is then divided by
    ``log2(len(text) + 10) / 5`` so longer texts are damped only mildly.

    Returns:
        ``{"score": float rounded to 2 decimals, "label": "Mature" | "Immature"}``
        where the label is "Mature" for scores below 4. Empty or
        whitespace-only text returns ``{"score": 0.0, "label": "Mature"}``.
    """
    if not text:
        return {"score": 0.0, "label": "Mature"}

    num_sentences = count_sentences(text)
    if num_sentences == 0:
        # Whitespace-only text has no sentences; without this guard the
        # per-sentence rates below would divide by zero.
        return {"score": 0.0, "label": "Mature"}

    # Per-sentence rates (an alternative would be counting streaks such as
    # "!!" or "??" with re.findall(r"!{2,}", text)).
    exclam_streaks = text.count("!") / num_sentences
    question_streaks = text.count("?") / num_sentences

    # Pattern-based counts
    interrobangs = len(re.findall(r"\?\!|\!\?", text))
    ellipses = len(re.findall(r"\.\.\.+", text))  # three or more dots
    quotes = text.count('"') + text.count("'")
    parens = text.count("(") + text.count(")")

    # Words made of 2 or more consecutive capital letters
    all_caps_words = len(re.findall(r"\b[A-Z]{2,}\b", text))

    # Weights for each type
    weights = {
        "all_caps_words": 3.0,
        "exclam_streaks": 3.0,
        "question_streaks": 3.0,
        "interrobangs": 2.5,
        "ellipses": 2.0,
        "parens": 1.0,
        "quotes": 0.3,
    }

    raw_score = (
        weights["all_caps_words"] * all_caps_words
        + weights["exclam_streaks"] * exclam_streaks
        + weights["question_streaks"] * question_streaks
        + weights["interrobangs"] * interrobangs
        + weights["ellipses"] * ellipses
        + weights["parens"] * parens
        + weights["quotes"] * quotes
    )

    # Logarithmic length normalization: avoids a harsh drop for longer texts.
    char_count = len(text)
    smoothing_factor = math.log2(char_count + 10) / 5
    score = raw_score / smoothing_factor

    # Rough qualitative label based on score ranges
    label = "Mature" if score < 4 else "Immature"

    return {"score": round(score, 2), "label": label}


# Sample inputs for trying out the scoring functions by hand.
# `tweet1`..`tweet5` are tweet-style texts; `normal1`..`normal3` are plainer
# reference texts to compare the scores against.
tweet1 = "So interesting to see 'Progressive' Democrat Congresswomen, who originally came from countries whose governments are a complete and total catastrophe, the worst, most corrupt and inept anywhere in the world (if they even have a functioning government at all), now loudly....and viciously telling the people of the United States, the greatest and most powerful Nation on earth, how our government is to be run. Why don't they go back and help fix the totally broken and crime infested places from which they came. Then come back and show us how....it is done. These places need your help badly, you can't leave fast enough. I'm sure that Nancy Pelosi would be very happy to quickly work out free travel arrangements!"

tweet2 = "To Iranian President Rouhani: NEVER, EVER THREATEN THE UNITED STATES AGAIN OR YOU WILL SUFFER CONSEQUENCES THE LIKES OF WHICH FEW THROUGHOUT HISTORY HAVE EVER SUFFERED BEFORE. WE ARE NO LONGER A COUNTRY THAT WILL STAND FOR YOUR DEMENTED WORDS OF VIOLENCE & DEATH. BE CAUTIOUS!"

tweet3 = 'Who can figure out the true meaning of "covfefe" ???  Enjoy!'

tweet4 = "Lowest rated Oscars in HISTORY. Problem is, we don’t have Stars anymore - except your President (just kidding, of course)!"

tweet5 = 'Not just ""weird"": actively white supremacist, fascist, nazi imagery being used to sow disinformation, falsely claiming far-right Western figures are ""Ukrainian heroes.""  Like i said the other day, the forces of this dis- &amp; misinformation war are EVERYWHERE right now. Be careful.'

normal1 = "Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together, they own about 14 percent of its publicly listed shares and control 56 percent of its stockholder voting power through super-voting stock."

normal2 = "I am excited! But I am also normal!"

normal3 = 'The director of the NOAA then said, "The U.S. is pulling out of NATO!"'  # shows the limitation of the Mature/Immature label; the raw score may be a better signal

# Example calls (uncomment to inspect the score dict for a sample):
# print(immaturity_score(normal2))
# print(immaturity_score(tweet5))

In [2]:
from empath import Empath

lexicon = Empath()

# Each call below builds a custom Empath category from one seed term using
# the named model ("nytimes", "reddit" or "fiction"); the generated term list
# shows up in the cell output.

# lexicon.create_category("politics",['Progressive', 'Democrat', 'Congresswomen'])

# lexicon.create_category("colors",["red","blue","green"])

lexicon.create_category("elections", ["elections"], model="nytimes")

lexicon.create_category("politicians", ["politicians"], model="nytimes")

lexicon.create_category("world_leaders", ["world_leaders"], model="reddit")

lexicon.create_category("patriotism", ["patriotism"], model="reddit")

lexicon.create_category("war", ["war"], model="reddit")

lexicon.create_category("violence", ["violence"], model="fiction")

# Seed the "great" category with its own term; it was previously seeded with
# "violence" (copy-paste of the line above), which just duplicated the
# violence category under a different name.
lexicon.create_category("great", ["great"], model="fiction")

["elections", "election", "national_elections", "presidential_elections", "local_elections", "parliamentary_elections", "presidential_election", "general_elections", "municipal_elections", "national_election", "legislative_elections", "opposition_parties", "voting", "new_elections", "free_elections", "election_results", "parliamentary_election", "ruling_party", "new_Parliament", "state_elections", "new_government", "general_election", "plebiscite", "PRI", "referendum", "opposition_candidates", "electoral_process", "next_election", "interim_government", "balloting", "governing_party", "next_elections", "Socialists", "coming_elections", "election_day", "political_parties", "nationwide_elections", "election_process", "new_Government", "democratic_elections", "election_campaign", "international_observers", "early_elections", "Christian_Democrats", "party_leaders", "congress", "party_members", "direct_elections", "free_and_fair_elections", "opposition_party", "Liberal_Democrats", "major_pol

In [7]:
# Generate the "hate" category once per model so the two term lists can be
# compared side by side in the cell output. Both calls use the same category
# name, so presumably the later model's result is the one the lexicon keeps
# (TODO confirm against Empath's behavior).
for hate_model in ("nytimes", "fiction"):
    lexicon.create_category("hate", ["hate"], model=hate_model)

["hate", "love", "like", "stupid", "loathe", "hates", "despise", "hated", "afraid", "like", "same_things", "crazy", "dislike", "bad_things", "know", "wrong_thing", "curse", "anymore", "ashamed", "mad", "want", "bothers", "O.K.", "scared", "detest", "sorry", "hating", "think", "insult", "suppose", "guess", "adore", "fools", "bad_people", "dumb", "hey", "laugh", "liking", "anyway", "certain_things", "good_people", "Personally", "Sure", "so_many_things", "pretend", "anyhow", "whine", "O.K.", "ca", "yell", "okay", "gonna", "damn_thing", "scream", "hell", "really", "white_people", "damn", "certain_people", "nice_things", "gotta", "absolutely_nothing", "me", "jerks", "damned"]
["hate", "hates", "despise", "dislike", "hated", "love", "hating", "like", "mean", "care", "annoys", "loathe", "Because", "bugs", "loves", "honestly", "pity", "HATE", "understand", "bothers", "mean", "likes", "adore", "Mainly", "horrible", "stupid", "loved", "disgusts", "why", "annoy", "blame", "annoying", "bully", "de

In [5]:
# Word tokenizer shared by the keyword list and the input text, so that both
# sides are split and lower-cased in exactly the same way.
_KEYWORD_TOKEN_RE = re.compile(r"\b\w+\b")

# Raw keyword/phrase list, grouped roughly by theme. Entries may be
# multi-word and mixed-case; they are normalized once by
# _build_keyword_index below.
_KEYWORD_PHRASES = (
    # --- leaders, officials and political actors ---
    "world leaders", "other world leaders", "political leaders",
    "political figures", "World leaders", "US politicians",
    "American politicians", "leaders", "diplomats", "western leaders",
    "military leaders", "jihadis", "world leader", "European leaders",
    "religious fanatics", "government officials", "US president",
    "US leaders", "jihadists", "presidents", "mullahs", "world powers",
    "powerful people", "American presidents", "western governments",
    "terrorists", "political stage", "American leaders",
    "American government", "Middle Eastern countries", "Jihadists",
    "Islamic terrorists", "intelligence services", "war criminals",
    "prime ministers", "Islamists", "top officials", "Western governments",
    "national leaders", "U.N.", "foreign leaders", "politicians",
    "Islamic extremists", "North Koreans", "most powerful people",
    "CIA agents", "religious extremists", "Osama Bin Laden",
    "opposition leaders", "Israeli politicians", "terrorist groups",
    "secret police", "major world powers", "military officials",
    "Saudi royals", "political activists", "defectors", "ordinary people",
    "Iranian government", "genocidal maniacs", "Western leaders",
    "American public", "terror attacks", "whole nation", "US President",
    "US officials", "dictators", "free world", "Islamic radicals", "Putin",
    "Iranian regime", "militant groups", "American people",
    "Russian leaders", "dissidents", "current leaders", "US presidents",
    "hardliners", "intelligence agencies", "American foreign policy",
    "terrorist organizations", "Muslim terrorists", "own governments",
    "military officers", "false flag attacks", "extremist groups",
    "peace keepers", "rebel leaders", "radical islamists",
    "North Korean government", "own people", "top leadership",
    # --- patriotism, nationalism and ideology ---
    "patriotism", "blind patriotism", "jingoism", "Patriotism",
    "nationalism", "exceptionalism", "national pride",
    "American exceptionalism", "blind nationalism", "American values",
    "militarism", "nationalistic pride", "Nationalism",
    "religious fanaticism", "fanaticism", "xenophobia", "patriotic",
    "military worship", "glorification", "American nationalism",
    "hero worship", "zealotry", "propaganda", "cowardice", "fascism",
    "American freedom", "radicalism", "flag waving", "American patriotism",
    "American ideals", "red scare", "facism", "political correctness",
    "imperialism", "fellow countrymen", "propoganda", "national unity",
    "individualism", "democracy", "isolationism", "multiculturalism",
    "national identity", "liberal values", "extreme nationalism",
    "war propaganda", "ethnocentrism", "moral superiority",
    "humanitarianism", "religious fundamentalism", "chauvinism",
    "unpatriotic", "nationalistic", "secularism", "religious ideology",
    "radical Islam", "fellow Americans", "US imperialism",
    "American imperialism", "ideology", "state propaganda", "backwardness",
    "pacifists", "opportunism", "willful ignorance", "American way",
    "democratic values", "religious extremism", "bigotry",
    "narrow mindedness", "cultural superiority", "rhetoric", "tribalism",
    "intellectualism", "western democracy", "patriotic duty", "Nazism",
    "realpolitik", "brainwashing", "fundamentalism", "ultranationalism",
    "religious fanatics", "warmongering", "authoritarianism",
    "imperialists", "sectarianism", "populism", "conservatism",
    "American Exceptionalism", "religious zealotry", "appeasement",
    "liberalism", "military culture", "authoritarian government",
    # --- war and armed conflict ---
    "war", "war", "total war", "wars", "global war", "world war",
    "multiple wars", "waged", "uprising", "Afghanistan", "wage",
    "civil war", "illegal war", "brutal war", "endless war", "bloody war",
    "more wars", "revolution", "military force", "declared war",
    "armed conflict", "world war II", "World War Two", "waging",
    "military conflict", "more war", "bullshit war", "surrender",
    "military conflicts", "great war", "large scale war", "new wars",
    "soviets", "endless wars", "actual war", "gaddafi", "insurrection",
    "cold war", "invasion", "pointless wars", "ww1", "bombing campaign",
    "whole war", "military intervention", "wartime", "war-", "war time",
    "military action", "allied powers", "open war", "real war",
    "pointless war", "stupid war", "Iraqi people", "conscription",
    "foreign wars", "conventional warfare", "unwinnable war", "proxy war",
    "military aggression", "hostilities", "perpetual war",
    "bloody civil war", "Nazis", "war mongers", "proxy wars",
    "armed rebellion", "&gt;Russia", "USSR", "armed conflicts",
    "global conflict", "entire war", "military operation", "USSR.",
    "casus belli", "liberate", "open warfare", "military actions",
    "Ukraine", "full scale war", "Russia", "ISIS", "open conflict",
    "regime change", "huge war", "massive war", "belligerents",
    "foreign intervention", "World War III", "uprisings",
    "communist revolution",
    # --- loaded adjectives ---
    "glorious", "heroic", "evil", "corrupt", "treasonous", "traitorous",
    "supreme", "monstrous", "disgusting", "vile", "decadent", "brilliant",
    "immortal", "divine", "pathetic", "unpatriotic", "magnificent",
    "cowardly", "savage", "merciless", "tyrannical", "righteous", "unholy",
    "depraved", "barbaric", "legendary", "catastrophic", "disgraceful",
    "shameful", "godless", "fearless", "relentless", "explosive",
    # --- religious / apocalyptic language ---
    "damned", "sinful", "blasphemous", "heretic", "evil", "demonic",
    "satanic", "antichrist", "judgment", "hellfire", "doomed", "fallen",
    "corrupt", "wicked", "unclean", "end times", "tribulation", "wrath",
    "curse", "punishment",
    # --- absolutes and certainty markers ---
    "always", "never", "forever", "completely", "totally", "entirely",
    "absolutely", "no one", "everyone", "all", "none", "every", "nothing",
    "everything", "without exception", "without question", "beyond doubt",
    "no doubt", "no way", "without a doubt", "in every case",
    "in all circumstances", "guaranteed", "100%", "at all times",
    "under no circumstances", "irrefutable", "indisputable", "undeniably",
    "permanent", "unchanging", "infinite", "non-negotiable",
)


def _tokenize_for_keywords(text):
    """Lower-case ``text`` and split it into ``\\w+`` word tokens."""
    return _KEYWORD_TOKEN_RE.findall(text.lower())


def _build_keyword_index(phrases):
    """Split phrases into a set of single tokens and per-length phrase sets."""
    single_tokens = set()
    phrases_by_length = {}
    for phrase in phrases:
        tokens = tuple(_tokenize_for_keywords(phrase))
        if not tokens:
            continue
        if len(tokens) == 1:
            single_tokens.add(tokens[0])
        else:
            phrases_by_length.setdefault(len(tokens), set()).add(tokens)
    return single_tokens, phrases_by_length


# Built once at definition time instead of on every call.
_KEYWORD_SINGLES, _KEYWORD_PHRASES_BY_LEN = _build_keyword_index(_KEYWORD_PHRASES)


def keyword_ratio(text: str) -> float:
    """
    Returns ratio of found keywords to total words in text

    The text is lower-cased and split into word tokens. A token counts as
    "found" when it is a single-word keyword or when it is part of a
    multi-word keyword phrase that appears as consecutive tokens. Each token
    is counted at most once, so the result is always between 0 and 1.

    Keywords are lower-cased and tokenized the same way as the text.
    Previously the lower-cased text tokens were compared against the raw
    list, so capitalized entries (e.g. "Putin") and every multi-word phrase
    (e.g. "civil war") could never match.

    Returns 0.0 for empty text or when nothing matches.
    """
    if not text:
        return 0.0

    words = _tokenize_for_keywords(text)
    if not words:
        return 0.0

    # Positions of tokens that are single-word keywords.
    matched = {i for i, word in enumerate(words) if word in _KEYWORD_SINGLES}

    # Slide an n-token window over the text for each phrase length in use.
    for n, phrases in _KEYWORD_PHRASES_BY_LEN.items():
        for start in range(len(words) - n + 1):
            if tuple(words[start:start + n]) in phrases:
                matched.update(range(start, start + n))

    if not matched:
        return 0.0
    return len(matched) / len(words)

In [6]:
# Spot-check: keyword ratio of the sample text `tweet5`.
keyword_ratio(tweet5)

0.023255813953488372

In [9]:
from keywords import KEYWORDS, STOPWORDS
from nltk.stem import PorterStemmer
from rapidfuzz import fuzz

stemmer = PorterStemmer()


def normalize(word):
    """Lower-case ``word`` and return its stem from the shared ``stemmer``."""
    lowered = word.lower()
    return stemmer.stem(lowered)

# One-time keyword preprocessing: every entry of KEYWORDS is lower-cased,
# split on whitespace and normalized token by token. Entries that come out as
# exactly one token go into SINGLE_KEYWORDS (a set); all others are appended
# to MULTI_KEYWORDS as token tuples.
SINGLE_KEYWORDS = set()
MULTI_KEYWORDS = []

for keyword in KEYWORDS:
    tokens = keyword.lower().split()
    norm_tokens = tuple(map(normalize, tokens))
    if len(norm_tokens) != 1:
        MULTI_KEYWORDS.append(norm_tokens)
    else:
        SINGLE_KEYWORDS.add(norm_tokens[0])

def keywords(text, fuzzy_threshold=85, enable_fuzzy=True):
    """Return the fraction of non-stopword tokens in ``text`` that hit a keyword.

    The text is lower-cased and split on whitespace (punctuation stays
    attached to its token), tokens found in STOPWORDS are dropped, and the
    rest are passed through ``normalize``.

    A token position is counted when:
      * its normalized form is in SINGLE_KEYWORDS, or
      * ``enable_fuzzy`` is true and ``fuzz.ratio`` against some single
        keyword is at least ``fuzzy_threshold``, or
      * it lies inside a window equal to one of the MULTI_KEYWORDS phrases,
        provided no position of that window was already counted.

    Returns ``0`` when no tokens remain after stopword removal.
    """
    stems = [normalize(raw) for raw in text.lower().split() if raw not in STOPWORDS]
    hit_positions = set()

    # Single-word pass: exact membership first, fuzzy comparison as fallback.
    for pos, stem in enumerate(stems):
        if stem in SINGLE_KEYWORDS or (
            enable_fuzzy
            and any(fuzz.ratio(stem, kw) >= fuzzy_threshold for kw in SINGLE_KEYWORDS)
        ):
            hit_positions.add(pos)

    # Phrase pass: slide a window the size of each phrase over the tokens.
    for phrase in MULTI_KEYWORDS:
        width = len(phrase)
        target = list(phrase)
        for start in range(len(stems) - width + 1):
            window = range(start, start + width)
            # Windows touching an already counted position are skipped.
            if not hit_positions.isdisjoint(window):
                continue
            if stems[start:start + width] == target:
                hit_positions.update(window)

    if not stems:
        return 0
    return len(hit_positions) / len(stems)

In [10]:
# Spot-check: stemmed/fuzzy keyword share of the sample text `tweet5`.
keywords(tweet5)

0.034482758620689655

In [16]:
from nrclex import NRCLex

# Use one of the sample texts defined in this notebook. The bare name
# `tweet` is not assigned in any visible cell, so it would raise NameError on
# a fresh kernel.
# If this raises MissingCorpusError, download the TextBlob corpora once with:
#     python -m textblob.download_corpora
text_object = NRCLex(tweet5)

MissingCorpusError: 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
