In [1]:
from clean_new import clean_nazi
import re
from collections import Counter
import pandas as pd
from nltk.corpus import stopwords
import nltk


nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))


def most_common_words(df, column, top_n=1000):
    """
    Returns the top N most common words in a specified text column of a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The column name containing text data.
    top_n (int): Number of top common words to return (default is 10).

    Returns:
    List of tuples (word, count).
    """
    # Join all text entries in the column into one string
    text = " ".join(df[column].dropna().astype(str))

    # Use regex to find words (alphanumeric), convert to lowercase
    words = re.findall(r"\b\w+\b", text.lower())

    filtered_words = [word for word in words if word not in stop_words]

    # Count word frequencies
    word_counts = Counter(filtered_words)

    # Return the most common words
    return word_counts.most_common(top_n)

In [9]:
df_nazi = pd.read_csv("nazi_cleaned.csv")
df_normal = pd.read_csv("normal_cleaned.csv")

In [11]:
print(most_common_words(df_nazi, "text"))
print(most_common_words(df_normal, "text"))

[('rt', 39498), ('white', 7164), ('people', 5685), ('like', 3972), ('amp', 3699), ('realdonaldtrump', 3100), ('one', 3075), ('de', 2905), ('us', 2784), ('trump', 2706), ('get', 2594), ('via', 2453), ('would', 2330), ('know', 2204), ('right', 2083), ('good', 2061), ('new', 1996), ('want', 1974), ('time', 1931), ('youtube', 1844), ('think', 1810), ('la', 1793), ('world', 1745), ('see', 1727), ('jews', 1706), ('man', 1700), ('never', 1669), ('need', 1623), ('black', 1599), ('back', 1589), ('whites', 1571), ('race', 1510), ('go', 1508), ('great', 1429), ('make', 1415), ('2', 1393), ('america', 1365), ('today', 1364), ('say', 1354), ('country', 1351), ('video', 1351), ('en', 1333), ('lol', 1317), ('must', 1287), ('hate', 1276), ('u', 1269), ('well', 1265), ('day', 1262), ('going', 1253), ('even', 1248), ('1', 1248), ('take', 1239), ('love', 1217), ('women', 1194), ('let', 1191), ('stop', 1190), ('war', 1173), ('que', 1172), ('whitegenocide', 1150), ('still', 1146), ('really', 1117), ('way',

In [None]:
import pandas as pd
from collections import Counter
import re
from nltk.corpus import stopwords
import nltk

nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

# You can extend this list as needed
extra_non_salient = {
    "also",
    "would",
    "could",
    "one",
    "two",
    "get",
    "go",
    "like",
    "even",
}


def top_word_proportions(df, column, top_n=50):
    """
    Returns the top N most salient words and their proportions in a text column.

    Parameters:
    df (pd.DataFrame): DataFrame with text data.
    column (str): Name of the column.
    top_n (int): Number of top words to return.

    Returns:
    List of tuples (word, proportion)
    """
    text = " ".join(df[column].dropna().astype(str))
    words = re.findall(r"\b\w+\b", text.lower())

    # Filter out stopwords, extra non-salient terms, short words, numbers
    filtered = [
        word
        for word in words
        if word not in stop_words
        and word not in extra_non_salient
        and not word.isdigit()
    ]

    total_words = len(filtered)
    word_counts = Counter(filtered)

    # Compute proportions
    word_proportions = {
        word: count / total_words for word, count in word_counts.items()
    }

    # Return top N sorted by proportion
    return sorted(word_proportions.items(), key=lambda x: x[1], reverse=True)[:top_n]

In [19]:
print(top_word_proportions(df_nazi, "text"))
print(top_word_proportions(df_normal, "text"))

[('rt', 0.04), ('people', 0.01), ('white', 0.01), ('alxndrblack', 0.0), ('ianmaegaoidh', 0.0), ('apurposefulwife', 0.0), ('westernsoldier', 0.0), ('omg', 0.0), ('haha', 0.0), ('wow', 0.0), ('ancestors', 0.0), ('whitehistory', 0.0), ('degenerate', 0.0), ('become', 0.0), ('itcanwait', 0.0), ('att', 0.0), ('husband', 0.0), ('laughing', 0.0), ('fake', 0.0), ('least', 0.0), ('shown', 0.0), ('something', 0.0), ('actually', 0.0), ('happened', 0.0), ('instead', 0.0), ('fantasy', 0.0), ('land', 0.0), ('moviebuffchick1', 0.0), ('aeroape51', 0.0), ('lindsaymbksm', 0.0), ('benshapiro', 0.0), ('obviously', 0.0), ('never', 0.0), ('raped', 0.0), ('take', 0.0), ('protection', 0.0), ('ya', 0.0), ('got', 0.0), ('disgusting', 0.0), ('antihuman', 0.0), ('promotes', 0.0), ('chattelism', 0.0), ('unbecoming', 0.0), ('female', 0.0), ('exactly', 0.0), ('planned', 0.0), ('since', 0.0), ('past', 0.0), ('gates', 0.0), ('vienna', 0.0)]
[('today', 0.01), ('lol', 0.01), ('got', 0.01), ('love', 0.01), ('day', 0.01), 

In [4]:
from better_profanity import profanity

profanity.load_censor_words()

# Convert VaryingString objects to plain strings and sort
words = sorted(str(word) for word in profanity.CENSOR_WORDSET)

# Write to file
with open("profanity_list.txt", "w") as f:
    for word in words:
        f.write(word + "\n")

In [3]:
import re
from keywords import STOPWORDS
from collections import Counter

def excess_capitalization(text):
    """
    Calculates the ratio of fully capitalized words to total words in
    the input text exceeds.

    A fully capitalized word is defined as a word with at least two consecutive
    uppercase letters and no lowercase letters (e.g., 'WARNING', 'HELP').

    Parameters:
    text (str): The input string to analyze.
    """
    # Count words
    words = len(re.findall(r"\b\w+\b", text))
    if words == 0:
        return 0

    # Count all uppercase words
    uppercase_words = len(re.findall(r"\b[A-Z]{2,}\b", text))

    return uppercase_words / words


def excess_repetition(text):
    """
    Analyzes the frequency of non-stop words in a text and calculates the ratio of
    the total number of repeated occurrences of non-stop words to the total number
    of non-stop words in the text.

    Parameters:
    text (str): The input string to analyze, typically a sentence or passage of text.

    Returns:
    float: The ratio of the total occurrences of repeated non-stop words to the total
           number of non-stop words in the text. If no non-stop words are present,
           the function returns 0.
    """

    # Extract words and normalize to lowercase
    words = re.findall(r"\b\w+\b", text.lower())

    # Filter out stop words
    non_stop_words = [word for word in words if word not in STOPWORDS]

    if not non_stop_words:
        return 0

    # Count word frequencies
    word_counts = Counter(non_stop_words)
    repeated_words = sum(word_counts.values())

    # Calculate repetition ratio
    ratio = repeated_words / len(non_stop_words)

    return ratio

In [5]:
str = "ITS THE LAST DAY OF SCHOOL!!4 ME AND I AM SO HAPPY. AND SAD CUZ I WANT GET GEt GET GET GET get get get get TO C ALL MY FRIENDS"

excess_repetition(str)

1.0