In [None]:
import pandas as pd
import nltk
import string
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
# Register `progress_apply` on pandas objects so long-running applies show a progress bar.
tqdm.pandas()

# Download the NLTK resources needed for tokenizing, POS tagging and lemmatizing.
# NLTK >= 3.9 looks up the tokenizer and tagger data under new identifiers
# ("punkt_tab", "averaged_perceptron_tagger_eng"); older releases use the
# original ones. Requesting both sets keeps the notebook runnable on either
# version -- an extra download is harmless.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
):
    nltk.download(resource)

In [None]:
# Find words that are most associated with multilingual clusters.
df = pd.read_csv("../Data/df_with_clusters_local_translated_with_clusters.csv")

# A cluster (column "cluster_0.875") is multilingual when its rows span more than one language.
# Only the "language" column is handed to the lambda: applying over the whole grouped
# frame (grouping column included) is deprecated in recent pandas releases.
multi_cluster = df.groupby("cluster_0.875")["language"].apply(lambda langs: len(set(langs)) > 1)
multilingual_clusters = multi_cluster[multi_cluster].index
df["is_multilingual"] = df["cluster_0.875"].isin(multilingual_clusters)

# Rows carrying cluster id 0 are flagged as singletons.
# NOTE(review): presumably 0 is the "no cluster" label written by the upstream clustering -- confirm.
df["is_singleton"] = df["cluster_0.875"] == 0

# Keep only rows that have a translated claim. `.copy()` makes the result an independent
# frame, so the column assignments performed later do not write into a slice of the
# original (SettingWithCopyWarning).
df = df[~df.tranlated_claimReviewed.isna()].copy()

# Map a Treebank POS tag onto the WordNet POS constant accepted by WordNetLemmatizer
def get_wordnet_pos(treebank_tag):
    """Return ``wordnet.NOUN`` for tags beginning with 'N'; ``None`` for any other tag."""
    is_noun_tag = treebank_tag.startswith('N')
    return wordnet.NOUN if is_noun_tag else None

# Initialize WordNetLemmatizer once at module level; process_text() below
# reads this shared instance instead of building a new one per call.
lemmatizer = WordNetLemmatizer()

# Function to process text
def process_text(text):
    try:
        # Step 1: Cast to lower
        text = text.lower()
        
        # Step 2: Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Step 3: Remove non-noun words
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
        nouns = [word for word, pos in pos_tags if pos in ['NN', 'NNS', 'NNP', 'NNPS']]
        
        # Step 4: Lemmatize
        #lemmatized_nouns = [lemmatizer.lemmatize(noun, get_wordnet_pos('N')) for noun in nouns]
        lemmatized_nouns = [lemmatizer.lemmatize(noun) for noun in nouns]
        
        # remove words with length < 3
        lemmatized_nouns = [word for word in lemmatized_nouns if len(word) > 2]
        

        return ' '.join(lemmatized_nouns)
    except:
        return None

# Apply the function to the 'tranlated_claimReviewed' column
df['Processed_Translation'] = df['tranlated_claimReviewed'].progress_apply(process_text)

# One token set per row. `split()` without a separator returns [] for an empty
# string, so a text with no surviving nouns yields an empty set; `split(" ")`
# would yield {""} and that empty "token" would then be counted like a real word.
df["tokens"] = df["Processed_Translation"].apply(lambda x: set(x.split()) if isinstance(x, str) else set())

In [None]:
# Vocabulary: every token that occurs in at least one row.
# `set().union(...)` also works when `df` has no rows, unlike `set.union(*[])`.
all_tokens = set().union(*df["tokens"])

# Tokens are kept when their smoothed count exceeds this value,
# i.e. when they occur in at least 50 rows of the group.
MIN_SMOOTHED_COUNT = 50

# Number of rows containing each token, split by singleton status
# (each row's "tokens" is a set, so a token counts once per row).
raw_counts = {True: {}, False: {}}
for is_single, row_tokens in tqdm(zip(df["is_singleton"], df["tokens"]), total=len(df)):
    bucket = raw_counts[bool(is_single)]
    for token in row_tokens:
        bucket[token] = bucket.get(token, 0) + 1

# Add-one smoothing: every vocabulary token gets one pseudo-count in BOTH groups.
# Applying it uniformly keeps the counts independent of row order. The normalising
# total is computed once per group rather than once per token.
singleton_tokens = {}
for flag in (True, False):
    smoothed = {token: raw_counts[flag].get(token, 0) + 1 for token in all_tokens}
    total = sum(smoothed.values())
    singleton_tokens[flag] = {
        token: count / total
        for token, count in smoothed.items()
        if count > MIN_SMOOTHED_COUNT
    }

# Ratio of a token's share among singleton rows to its share among clustered rows,
# for tokens frequent enough in both groups.
relative = {
    token: share / singleton_tokens[False][token]
    for token, share in singleton_tokens[True].items()
    if token in singleton_tokens[False]
}

most_single, least_single = [], []
# NOTE(review): the table entries below store the inverse ratio (1/v) while the
# printout shows v; the other rankings in this notebook store v -- confirm intended.
# Print top 10 most singleton
print("TOP 10 MOST SINGLTON")
for i, (k, v) in enumerate(sorted(relative.items(), key=lambda x: x[1], reverse=True)[:10]):
    print(i, ": ", k, f"{v:.2f}")
    most_single.append(f"{k} ({1/v:.2f})")

# Print top 10 least singleton
print("TOP 10 LEAST SINGLTON")
for i, (k, v) in enumerate(sorted(relative.items(), key=lambda x: x[1], reverse=False)[:10]):
    print(i, ": ", k, f"{v:.2f}")
    least_single.append(f"{k} ({1/v:.2f})")

In [None]:
# Compare token usage between multilingual and monolingual clusters.
# Singleton rows are excluded: only rows that belong to a cluster take part.
in_cluster = df.loc[~df["is_singleton"]]

# Tokens are kept when their smoothed count exceeds this value,
# i.e. when they occur in at least 50 rows of the group.
MIN_SMOOTHED_COUNT = 50

# Number of rows containing each token, split by whether the row's cluster is multilingual.
raw_counts = {True: {}, False: {}}
for is_multi, row_tokens in tqdm(zip(in_cluster["is_multilingual"], in_cluster["tokens"]), total=len(in_cluster)):
    bucket = raw_counts[bool(is_multi)]
    for token in row_tokens:
        bucket[token] = bucket.get(token, 0) + 1

# Add-one smoothing: every token seen in either group gets one pseudo-count in BOTH
# groups. Applying it uniformly keeps the counts independent of row order. The
# normalising total is computed once per group rather than once per token.
vocabulary = set(raw_counts[True]) | set(raw_counts[False])
multilingual_tokens = {}
for flag in (True, False):
    smoothed = {token: raw_counts[flag].get(token, 0) + 1 for token in vocabulary}
    total = sum(smoothed.values())
    multilingual_tokens[flag] = {
        token: count / total
        for token, count in smoothed.items()
        if count > MIN_SMOOTHED_COUNT
    }
# This cell historically stored its result under `singleton_tokens`; keep that name bound.
singleton_tokens = multilingual_tokens

# Ratio of a token's share in multilingual clusters to its share in monolingual
# clusters, for tokens frequent enough in both groups.
relative = {
    token: share / multilingual_tokens[False][token]
    for token, share in multilingual_tokens[True].items()
    if token in multilingual_tokens[False]
}

most_multilingual, least_multilingual = [], []
# Print top 10 most multilingual
print("TOP 10 MOST MULTILINGUAL")
for i, (k, v) in enumerate(sorted(relative.items(), key=lambda x: x[1], reverse=True)[:10]):
    print(i, ": ", k, f"{v:.2f}")
    most_multilingual.append(f"{k} ({v:.2f})")

# Print top 10 least multilingual
print("TOP 10 LEAST MULTILINGUAL")
for i, (k, v) in enumerate(sorted(relative.items(), key=lambda x: x[1], reverse=False)[:10]):
    print(i, ": ", k, f"{v:.2f}")
    least_multilingual.append(f"{k} ({v:.2f})")

In [None]:
# Compare token usage between long-lasting and short-lived clusters.
# Work on clustered (non-singleton) rows that have a publication date. `.copy()` makes
# `clustered` an independent frame so the assignments below do not write into a slice of `df`.
clustered = df[~(df["is_singleton"]) & ~(df["datePublished"].isna())].copy()
clustered["datePublished"] = pd.to_datetime(clustered["datePublished"], utc=True, errors="coerce")

# A cluster is long-lasting when its first and last publication dates are more than
# 30 whole days apart. Dates that failed to parse are NaT and are skipped by max/min.
dates_by_cluster = clustered.groupby("cluster_0.875")["datePublished"]
islonglasting = (dates_by_cluster.max() - dates_by_cluster.min()).dt.days > 30
longlasting = islonglasting[islonglasting].index
clustered["is_longlasting"] = clustered["cluster_0.875"].isin(longlasting)

# `clustered` inherits "Processed_Translation" and "tokens" from `df`, where they were
# computed from the same text with the same function, so the NLTK pass is not repeated.

# Tokens are kept when their smoothed count exceeds this value,
# i.e. when they occur in at least 50 rows of the group.
MIN_SMOOTHED_COUNT = 50

# Number of rows containing each token, split by whether the row's cluster is long-lasting.
raw_counts = {True: {}, False: {}}
for is_long, row_tokens in tqdm(zip(clustered["is_longlasting"], clustered["tokens"]), total=len(clustered)):
    bucket = raw_counts[bool(is_long)]
    for token in row_tokens:
        bucket[token] = bucket.get(token, 0) + 1

# Add-one smoothing: every token seen in either group gets one pseudo-count in BOTH
# groups. Applying it uniformly keeps the counts independent of row order. The
# normalising total is computed once per group rather than once per token.
vocabulary = set(raw_counts[True]) | set(raw_counts[False])
longlasting_tokens = {}
for flag in (True, False):
    smoothed = {token: raw_counts[flag].get(token, 0) + 1 for token in vocabulary}
    total = sum(smoothed.values())
    longlasting_tokens[flag] = {
        token: count / total
        for token, count in smoothed.items()
        if count > MIN_SMOOTHED_COUNT
    }

# Ratio of a token's share in long-lasting clusters to its share in short-lived
# clusters, for tokens frequent enough in both groups.
relative = {
    token: share / longlasting_tokens[False][token]
    for token, share in longlasting_tokens[True].items()
    if token in longlasting_tokens[False]
}

most_longlasting, least_longlasting = [], []
# Print top 10 most long-lasting
print("TOP 10 MOST LONG LASTING")
for i, (k, v) in enumerate(sorted(relative.items(), key=lambda x: x[1], reverse=True)[:10]):
    print(i, ": ", k, f"{v:.2f}")
    most_longlasting.append(f"{k} ({v:.2f})")

# Print top 10 least long-lasting
print("TOP 10 LEAST LONG LASTING")
for i, (k, v) in enumerate(sorted(relative.items(), key=lambda x: x[1], reverse=False)[:10]):
    print(i, ": ", k, f"{v:.2f}")
    least_longlasting.append(f"{k} ({v:.2f})")

In [None]:
# Summary table: one column per ranking, one row per rank.
# Each list is wrapped in a Series so that a ranking with fewer than 10 entries is
# padded with NaN; passing lists of unequal length straight to DataFrame raises ValueError.
summary_columns = {
    "most_single": most_single,
    "least_single": least_single,
    "most_multilingual": most_multilingual,
    "least_multilingual": least_multilingual,
    "most_longlasting": most_longlasting,
    "least_longlasting": least_longlasting,
}
pd.DataFrame({name: pd.Series(entries, dtype="object") for name, entries in summary_columns.items()})