In [1]:
import math
from collections import Counter
import pandas as pd
import re

def tokenize(text):
    # Lowercase and extract words
    return re.findall(r"\b[a-zA-Z]+\b", text.lower())

def load_and_tokenize(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read()
    return tokenize(text)

def log_likelihood(a, b, c, d):
    """
    Computes log-likelihood for:
    a = freq in text1
    b = freq in text2
    c = size of text1 (total tokens)
    d = size of text2 (total tokens)
    Based on Dunning (1993)
    """
    E1 = c * (a + b) / (c + d)
    E2 = d * (a + b) / (c + d)

    # Avoid log(0)
    if a == 0 or b == 0:
        # still allowed but must avoid log(0)
        a = a if a != 0 else 0.000001
        b = b if b != 0 else 0.000001

    LL = 2 * (
        a * math.log(a / E1) +
        b * math.log(b / E2)
    )
    return LL

def compute_keyness(file1, file2):
    # Load and tokenize
    tokens1 = load_and_tokenize(file1)
    tokens2 = load_and_tokenize(file2)

    # Count words
    freq1 = Counter(tokens1)
    freq2 = Counter(tokens2)

    total1 = sum(freq1.values())
    total2 = sum(freq2.values())

    # All unique words
    vocab = set(freq1.keys()) | set(freq2.keys())

    rows = []
    for word in vocab:
        a = freq1[word]
        b = freq2[word]
        LL = log_likelihood(a, b, total1, total2)

        # Direction of keyness
        if a/total1 > b/total2:
            key_in = "Text1"
        else:
            key_in = "Text2"

        rows.append([word, a, b, LL, key_in])

    df = pd.DataFrame(rows, columns=["word", "freq_text1", "freq_text2", "log_likelihood", "key_in"])
    df = df.sort_values("log_likelihood", ascending=False)
    return df


In [2]:

# ------- RUN THE ANALYSIS --------
file1 = "A&SE_dialogue.txt"
file2 = "ALL_non-Asian high gross film.txt"

df = compute_keyness(file1, file2)
print(df.head(50))
df.to_csv("keyness_results.csv", index=False)

                word  freq_text1  freq_text2  log_likelihood key_in
9092           peter          20         178       44.737150  Text1
7754          cradle           6           3       39.365993  Text1
10278        chirrut           5           1       36.901902  Text1
6714            hong           6           8       31.848721  Text1
3442            kong           6           9       30.809178  Text1
3028    regeneration           3           0       25.367524  Text1
5057            dude           8          42       24.913781  Text1
703             wang           4           3       24.350832  Text1
126          tracker           4           5       21.604997  Text1
1327           force           9          73       21.502563  Text1
7275            babe           5          14       20.789879  Text1
2309     codebreaker           4           6       20.539452  Text1
7596         monster           6          27       20.235172  Text1
6967            finn           8          62    