In [3]:
#All necessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Using for cleaning and Pre-Processing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag
import spacy
en = spacy.load('en_core_web_sm')
# Loading transformers library
import torch
from transformers import AutoTokenizer, BertModel,AutoModel
tokenizer = AutoTokenizer.from_pretrained('Dhanush66/AntismetisimLargedata-finetuned-MLM-NEW')
model = BertModel.from_pretrained('Dhanush66/AntismetisimLargedata-finetuned-MLM-NEW')
# To generate embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS,Phraser
from gensim.models import Word2Vec, KeyedVectors #To load the model
from cleantext import clean
#Visualisations
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Calibri"
import matplotlib.gridspec as gridspec
import os
from tqdm import tqdm
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from preprocesss import preprocess_batch
#To check for performance
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from keybert import KeyBERT
from pygtrie import CharTrie
from collections import Counter
stopword=list(stopwords.words('english'))

In [14]:
data=pd.read_csv("Unmasking Antisemitism SRI Data Set - Reporting Layer.csv")
data=data[['Term or Phrase','Post Text']]
data['Term or Phrase'].unique()

#Get the number of rows
len(data)

667

In [15]:
#Drop the duplicates
data=data.drop_duplicates()
len(data)

644

In [16]:
glossary=['cabal','cosmopolitan elite','cultural marxism','deicide','holocough','jewish capitalist','the goyim know',
           'jewish communist','jewish lobby','new world order','rothschild', 'soros','zionist',
         'zionist occupied government','jew down','not the real jews'] 
emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                                "]+", flags=re.UNICODE)

In [17]:
data=data[-data["Post Text"].isnull()]


In [31]:
raw_text=data['Post Text']

In [18]:
lematizer=WordNetLemmatizer()
def lematize(text):
    text=text.split()
    lema=[]
    for i in text:
        lema.append(lematizer.lemmatize(i))
    return (" ".join(lema))

In [19]:
data['clean']=data['Post Text'].apply(lambda x:x.lower())
#removing the links:
data['clean']=data['clean'].apply(lambda x:re.sub(r"http\S+","",str(x)))
#Getting the lematised text to get the original form of the word
data['lematize']=data['clean'].apply(lambda x:lematize(x))
#data['clean']=data['clean'].apply(lambda x:x.translate(str.maketrans("","",string.punctuation)))
uncleaned_texts=list(data["clean"])
uncleaned_lematised_text=list(data['lematize'])

In [12]:
def preprocess(text,postag,stopword,lematizer):
    pos_removal=[tag[0] for tag in pos_tag(text.split()) if tag[1] in postag]
    return(" ".join([lematizer.lemmatize(i) for i in pos_removal if i not in stopword]))

In [20]:
postag=["JJ",'NN','NNS','NNP','VBP']
stopword=list(stopwords.words('english'))
def preprocess_parallel(sentences, postag, stopword, num_workers):
    chunk_size = len(sentences) // num_workers
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences)+1, chunk_size)]
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = executor.map(preprocess_batch, chunks, [postag] * num_workers, [stopword] * num_workers)
    processed_sentences = [item for sublist in results for item in sublist]
    return processed_sentences

In [21]:
processed_sentences_parallel = preprocess_parallel(sentences=uncleaned_texts, 
                                                   postag=postag, stopword=stopword, num_workers=1)

In [22]:
data["clean"]=processed_sentences_parallel

In [23]:
data['lematize']=data['clean'].apply(lambda x:lematize(x))

In [24]:
post_text=list(data['Post Text'])

lematised_texts=list(data['lematize'])

## Finding Important terms

In [17]:
def emerging(terms):

    check=True
    while(tqdm(check)):
        #For each time step
        important_terms={}
        text=' '.join(data.clean)
        #Generate TF-IDF Matrix
        tf_idfvectorizer=TfidfVectorizer(ngram_range=(2,3))
        tfidf=tf_idfvectorizer.fit_transform(data.clean)
        features=tf_idfvectorizer.get_feature_names_out()
        
        #Exporting all Bigrams
        #pd.DataFrame(features).to_csv("Bigrams "+str(z)+".csv",header=['All Bigrams'])
        
        #Finding Emerging terms/Initializing glossary
        emerging_terms=list(set(features).difference(glossary))
        emerging_terms_trie = CharTrie()
        for term in emerging_terms:
            emerging_terms_trie[term] = True

                #glossary.append(i)
        print("started")
        #Finding the index of emerging terms
        findex=[i for i, word in enumerate(features) if word in emerging_terms_trie]
        
        tfidf_highest=tfidf.max(axis=0)
        #Finding the highest tf-idf value for all the feature names across all documents
        tfidf_values={}
        for i in tqdm(findex):
            tfidf_values[features[i]]=tfidf_highest.toarray()[0][i]
        
        tfidf_values=sorted(tfidf_values.items(),key=lambda x:x[1],reverse=True)
        print("Ended")
        #Exporting Top Tf-idf values with bigrams
        
        #pd.DataFrame(tfidf_values).to_csv("SubtractionBigram-Tfidf "+str(z)+".csv",header  = ['Bigrams','TF-IDF Values'])
        #print(tfidf_values[:20])
    
        #Extracting bigrams after TF-IDF threshold cut off
        a=sum(j for i, j in tfidf_values)
        print("Threshold ",a/len(tfidf_values))
        th=a/len(tfidf_values)
        final_tfidf={}
        for i in tfidf_values:
            if i[1]>th and len(i[0])<25:
                final_tfidf[i[0]]=i[1]
        #Exporting Tf-idf values above threshold
        #pd.DataFrame.from_dict(data=final_tfidf, orient='index').to_csv("Bigram_threshold "+str(z)+".csv",index_label='Bigrams',header  = ['TF-IDF Values'])
        
        #Frequency of words in that window size
        words_frequency={}
        text1=text.split(" ")
        #pairs=list(zip(text1[:-1],text1[1:]))
        #trigram_pairs=list(zip(text1[:-2],text1[1:-1],text1[2:]))
        #bigrams=[' '.join(i) for i in pairs]
        #trigrams=[" ".join(i) for i in trigram_pairs]
        #bigram_frequency=Counter(bigrams)
        #trigram_frequency=Counter(trigrams)
        unigram_frequency=Counter(text1)
        for word in (final_tfidf.keys()):
            words_frequency[word]=unigram_frequency[word]
                
        words_frequency=sorted(words_frequency.items(),key=lambda x:x[1],reverse=True)
        
        #Appending terms in Imp terms list
        for i in range(terms):
            important_terms[words_frequency[i][0]] = words_frequency[i][1]
        
        
        # Exporting Bigrams with frequency
        #pd.DataFrame(words_frequency).to_csv("Bigram_frequency "+str(z)+".csv",header  = ['Bigrams','Frequency'])
        
        #Exporting Important terms that is coming from  window
        #pd.DataFrame.from_dict(data=important_terms, orient='index').to_csv("Important_terms "+str(z)+".csv",index_label='Bigrams',header  = ['Frequency'])

        return important_terms


In [20]:
imp_terms=emerging(200)

0it [00:00, ?it/s]


started


100%|██████████| 61750/61750 [00:03<00:00, 15488.51it/s]


Ended
Threshold  0.07511801034902785


In [21]:
imp_terms

{'new world': 68,
 'world order': 42,
 'zionist occupied': 32,
 'united state': 26,
 'goy know': 25,
 'george soros': 22,
 'blood type': 22,
 'vatican ii': 22,
 'nostra aetate': 21,
 'occupied government': 17,
 'human right': 16,
 'jesus christ': 15,
 'deep state': 13,
 'zionist reptilian': 12,
 'far right': 11,
 'white people': 11,
 'jewish community': 11,
 'fascist white': 11,
 'new york': 10,
 'including zionist': 10,
 'church always': 10,
 'un special': 9,
 'late 20th': 9,
 'space time': 9,
 'space time information': 9,
 'time information': 9,
 'white power': 9,
 'african identity': 8,
 'zionist jewish': 8,
 'moon landing': 8,
 'middle east': 8,
 'critical race': 8,
 'church teaching': 8,
 'people color': 8,
 'civil right': 8,
 'world war': 7,
 'old school': 7,
 'zionist time': 7,
 'special rapporteur': 7,
 'un special rapporteur': 7,
 'federal reserve': 7,
 'white men': 7,
 'interfere behalf': 7,
 'say jew': 7,
 'throw jew': 6,
 'get rid': 6,
 'need get': 6,
 'zionist jew': 6,
 'k

In [22]:
important_terms=[]
for key in imp_terms:
    important_terms.append(key)

In [176]:
important_terms

['new world',
 'world order',
 'zionist occupied',
 'united state',
 'goy know',
 'george soros',
 'blood type',
 'vatican ii',
 'nostra aetate',
 'occupied government',
 'human right',
 'jesus christ',
 'deep state',
 'zionist reptilian',
 'far right',
 'white people',
 'jewish community',
 'fascist white',
 'new york',
 'including zionist',
 'church always',
 'un special',
 'late 20th',
 'space time',
 'space time information',
 'time information',
 'white power',
 'african identity',
 'zionist jewish',
 'moon landing',
 'middle east',
 'critical race',
 'church teaching',
 'people color',
 'civil right',
 'world war',
 'old school',
 'zionist time',
 'special rapporteur',
 'un special rapporteur',
 'federal reserve',
 'white men',
 'interfere behalf',
 'say jew',
 'throw jew',
 'get rid',
 'need get',
 'zionist jew',
 'klaus schwab',
 'khazarian satanist',
 'mel gibson',
 'central bank',
 'sound like',
 'social justice',
 'american jew',
 'jewish lobby group',
 'lobby group',
 'publ

In [23]:
merged_text=" ".join(uncleaned_lematised_text)


In [24]:
new_terms=[]
for term in important_terms:
    match=re.search(r'\b'+re.escape(term)+r'\b',merged_text)
    if match:
        new_terms.append(match.group(0))
        

In [25]:
len(new_terms)

163

### Lets first remove the appearence terms to make it emerging terms

In [26]:
new_glossary=[]
for term in glossary:
    if len(term.split(" "))==1:
        new_glossary.append(term)
    elif len(term.split(" "))==2:
        new_glossary.append(term.split()[0])
        new_glossary.append(term.split()[1])
    else:
        terms=list(zip(term.split()[:-1],term.split()[1:]))
        pairs=[' '.join(i)  for i in terms]
        pairs.append(' '.join([term.split()[0],term.split()[-1]]))
        new_glossary+=pairs
    

In [27]:
for g in new_glossary:
    for t in new_terms:
        if g in t:
            new_terms.remove(t)

In [541]:
if 'cabal' in 'statecabalone':
    print('hi')

hi


In [28]:
new_terms

['united state',
 'goy know',
 'blood type',
 'vatican ii',
 'nostra aetate',
 'human right',
 'jesus christ',
 'deep state',
 'far right',
 'white people',
 'fascist white',
 'new york',
 'church always',
 'un special',
 'late 20th',
 'space time',
 'space time information',
 'time information',
 'white power',
 'african identity',
 'moon landing',
 'middle east',
 'critical race',
 'church teaching',
 'civil right',
 'world war',
 'old school',
 'special rapporteur',
 'un special rapporteur',
 'federal reserve',
 'white men',
 'get rid',
 'klaus schwab',
 'khazarian satanist',
 'mel gibson',
 'central bank',
 'sound like',
 'social justice',
 'always taught',
 'church always taught',
 'ruling class',
 'go back',
 'khazarian mafia',
 'white house',
 'national socialist',
 'destroying nation',
 'destroying entire',
 'full disclosure',
 'military operation',
 'year ago',
 'air traffic',
 'public service',
 'white american',
 'german people',
 '20th century',
 'white race',
 'roy cohn',


### Removing obvious jewish relevent terms to make it coded

In [29]:
jewish_topics=['kike','jew','zionist','nazi']
for j in jewish_topics:
    for t in new_terms:
        if j in t:
            new_terms.remove(t)

In [30]:
len(new_terms)

127

### Get the original form of the term

In [31]:
trigrams=[]
for term in new_terms:
    if len(term.split())==3:
        trigrams.append(term)


In [32]:
#Removing bigrams if they are already in trigrams
for exp in trigrams:
    terms=list(zip(exp.split()[:-1],exp.split()[1:]))
    pairs=[' '.join(i)  for i in terms]
    pairs.append(' '.join([exp.split()[0],exp.split()[-1]]))
    if pairs[0] in new_terms:
        new_terms.remove(pairs[0])
    if pairs[1] in new_terms:
        new_terms.remove(pairs[1])
    if pairs[2] in new_terms:
        new_terms.remove(pairs[2])
    

In [33]:
len(new_terms)

103

In [227]:
new_terms

['united state',
 'goy know',
 'blood type',
 'vatican ii',
 'nostra aetate',
 'human right',
 'jesus christ',
 'far right',
 'white people',
 'fascist white',
 'late 20th',
 'space time information',
 'white power',
 'moon landing',
 'middle east',
 'critical race',
 'church teaching',
 'civil right',
 'world war',
 'old school',
 'un special rapporteur',
 'federal reserve',
 'white men',
 'get rid',
 'klaus schwab',
 'khazarian satanist',
 'mel gibson',
 'central bank',
 'sound like',
 'social justice',
 'church always taught',
 'ruling class',
 'go back',
 'khazarian mafia',
 'white house',
 'national socialist',
 'destroying nation',
 'full disclosure',
 'year ago',
 'public service',
 'white american',
 'german people',
 '20th century',
 'white race',
 'roy cohn',
 'hollywood elite',
 'world economic',
 'ethnic foids',
 'manhattan new york',
 'new york city',
 'mayer amschel',
 'political correctness',
 'interest group',
 'african identity black',
 'frankfurt school',
 'coup attem

In [143]:
'''
for t in glossary:
    s=[]
    if len(t.split())==2:
        s.append(t.split()[0])
        s.append(t.split()[1])
        for term in new_terms:
            if t in term or s[0] in term or s[1] in term:
                new_terms.remove(term)
    else:
        for term in new_terms:
            if t in term:
                new_terms.remove(term)
'''

'\nfor t in glossary:\n    s=[]\n    if len(t.split())==2:\n        s.append(t.split()[0])\n        s.append(t.split()[1])\n        for term in new_terms:\n            if t in term or s[0] in term or s[1] in term:\n                new_terms.remove(term)\n    else:\n        for term in new_terms:\n            if t in term:\n                new_terms.remove(term)\n'

In [316]:
['zionist occupied government'.split()[0],'zionist occupied government'.split()[-1]]

['zionist', 'government']

In [56]:
important_terms.remove('zog zionist')

In [34]:
glossary=['cabal','cultural marxism','deicide','holocough','jewish capitalist','the goyim know',
           'jewish communist','jewish lobby','new world order','rothschild', 'soros','zionist',
         'zionist occupied government','not the real jews','jew down','cosmopolitan elite'] 

In [61]:
expressions=glossary+new_terms


In [59]:
list(set(glossary).difference(new_terms))

['holocough',
 'not the real jews',
 'cultural marxism',
 'jew down',
 'the goyim know',
 'deicide',
 'jewish capitalist',
 'soros',
 'cabal',
 'zionist',
 'zionist occupied government',
 'cosmopolitan elite',
 'jewish communist',
 'jewish lobby',
 'rothschild',
 'new world order']

In [1164]:

for text in uncleaned_lematised_text:
    if "fema camp" in text:
        print(text)
        print('++++++++++++++++++++++++++++++')
        

fema is not a good thing! fema camp are concentration camps. fema camp are the end game of the new world order
++++++++++++++++++++++++++++++
i am the deep state. i hacked into the irs and photoshoped this image under direction from the george soros initiative. later today i will connect to the 5g tower to let dark brandon know our mission is going a planned. our set up and partnership with the doj and jack smith will soon be complete. once we indict trump then we will censor magas and start putting them into fema camp for detention.
++++++++++++++++++++++++++++++


In [1138]:
tokenizer.tokenize("jesuit joe biden, globalist puppet. tool of the trillionaire, jacob rothschild and his khazarian mafia minions. the globalists are being exposed worldwide and the mass are rejecting their 'great reset. now they have intensified their dangerous and desperate tactic to fight back against the serf and slave they wish to control. a main goal is to depopulate the earth. it easier to control le of us. biden = bilderberg's idiot destroying entire nation.")

['jesuit',
 'joe',
 'bid',
 '##en',
 ',',
 'global',
 '##ist',
 'puppet',
 '.',
 'tool',
 'of',
 'the',
 'trillion',
 '##aire',
 ',',
 'jacob',
 'rothschild',
 'and',
 'his',
 'k',
 '##ha',
 '##zar',
 '##ian',
 'mafia',
 'minions',
 '.',
 'the',
 'global',
 '##ists',
 'are',
 'being',
 'exposed',
 'worldwide',
 'and',
 'the',
 'mass',
 'are',
 'rejecting',
 'their',
 "'",
 'great',
 'reset',
 '.',
 'now',
 'they',
 'have',
 'intensified',
 'their',
 'dangerous',
 'and',
 'desperate',
 'tactic',
 'to',
 'fight',
 'back',
 'against',
 'the',
 'ser',
 '##f',
 'and',
 'slave',
 'they',
 'wish',
 'to',
 'control',
 '.',
 'a',
 'main',
 'goal',
 'is',
 'to',
 'de',
 '##pop',
 '##ulate',
 'the',
 'earth',
 '.',
 'it',
 'easier',
 'to',
 'control',
 'le',
 'of',
 'us',
 '.',
 'bid',
 '##en',
 '=',
 'bi',
 '##lder',
 '##berg',
 "'",
 's',
 'idiot',
 'destroying',
 'entire',
 'nation',
 '.']

In [1058]:
Counter(great_reset.split())

Counter({'>>414172699': 1,
         'great': 3,
         'reset': 2,
         'next': 2,
         'week>jfk': 1,
         'princess': 2,
         'diana': 1,
         'back>trump': 1,
         'president>trump': 1,
         'biden': 10,
         'crime': 3,
         'did>the': 1,
         'cabal': 1,
         'exterminated>god': 1,
         'bless': 1,
         'youqtards': 1,
         'live': 1,
         'imaginary': 1,
         'world.article': 1,
         "putin's": 1,
         'role': 1,
         'reset/globalization': 1,
         'putin': 1,
         'xi': 1,
         'multipolar': 1,
         'eurasian': 1,
         'plan': 3,
         '(global': 1,
         'jewish-communist': 1,
         'slavery)': 1,
         'eastern': 1,
         'western': 7,
         'jew': 3,
         'lockstep': 1,
         'kissinger': 1,
         'dugin': 1,
         'jewish': 2,
         'collapse': 4,
         'west': 5,
         'full': 1,
         'swing': 1,
         'europe': 1,
         'destro

In [786]:
lematize("the goyim know")

'the goy know'

## Extracting semantic similarity

In [35]:
def extract_context_words_bigram(sentence, target_bigram, num_words_before, num_words_after):
    words = sentence.split()
    context_words = []
    if len(target_bigram.split())==4:
        for i in range(len(words)-3):
            fourgram=" ".join([words[i],words[i+1],words[i+2],words[i+3]])
            if fourgram==target_bigram:
                start_index = max(0, i - num_words_before)
                end_index = min(len(words), i + num_words_after + 4)
                context_words.append(words[start_index:end_index])            
    elif len(target_bigram.split())==3:
        for i in range(len(words) - 2):  # Loop through pairs of consecutive words
            trigram=" ".join([words[i],words[i+1],words[i+2]])
            if trigram == target_bigram: 
                start_index = max(0, i - num_words_before)
                end_index = min(len(words), i + num_words_after + 3)
                context_words.append(words[start_index:end_index])
    else:
        for i in range(len(words) - 1):  # Loop through pairs of consecutive words
            bigram = " ".join([words[i], words[i + 1]])
            if bigram == target_bigram or words[i]==target_bigram:
                start_index = max(0, i - num_words_before)
                end_index = min(len(words), i + num_words_after + 2)
                context_words.append(words[start_index:end_index])

    return [" ".join(i) for i in context_words]

In [167]:
t=tokenizer(["hey buddy whtaspapp","I am good man"],return_tensors='pt',padding=True)
m=model(**t)
m

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4085,  0.1897, -0.1800,  ..., -0.1144,  0.3407,  0.2681],
         [-0.5121,  0.7198,  1.0164,  ..., -0.2408,  0.2927,  0.4201],
         [ 0.0975,  0.1419,  0.9499,  ...,  0.0544,  0.1118,  0.5880],
         ...,
         [-0.1907,  0.2208,  0.6329,  ...,  0.2263, -0.1473, -0.2052],
         [-0.1924, -0.1282, -0.0755,  ...,  0.9328,  0.2452, -0.8285],
         [ 0.9104,  0.0798, -0.2145,  ...,  0.4522, -0.6772, -0.1402]],

        [[-0.0407,  0.5450, -0.0261,  ..., -0.0495,  0.3516,  0.2268],
         [ 0.4781,  0.0024, -0.1707,  ..., -0.4226,  0.6331, -0.0256],
         [ 0.0359, -0.0916,  0.1373,  ..., -0.3269,  0.5405, -0.0678],
         ...,
         [ 0.8295,  0.0728, -0.2119,  ...,  0.1286, -0.5100, -0.2414],
         [ 0.0700,  0.0281,  0.3272,  ...,  0.2974,  0.1447, -0.2613],
         [-0.1064, -0.0894,  0.3124,  ...,  0.4626,  0.1871, -0.3429]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou

In [1183]:
def extract_embeddings(Posts:list[str],Expression:list[str],rAnge:int):
    terms_not_intext=[]
    embed_dict={}
    sent_dict={i:[] for i in Expression}
    for term in Expression:
        for post in Posts:
            if term in post:
                sents=extract_context_words_bigram(sentence=post,target_bigram=term,num_words_before=rAnge,num_words_after=rAnge)
                if bool(sents):
                    for sent in sents:
                        sent_dict[term].append(sent)
    for key in sent_dict:
        if bool(sent_dict[key]):
            sent_tokenize=tokenizer(sent_dict[key],return_tensors='pt',padding=True,max_length=512,truncation=True)
            with torch.no_grad():
                model_output=model(**sent_tokenize)['pooler_output']
                embed_dict[key]=model_output
        else:
            terms_not_intext.append(key)
        
    return embed_dict,terms_not_intext

### This is to extracting embeddings as words

In [36]:
new_text_lematize=[]
for text in uncleaned_lematised_text:
    t="[CLS] "+text+" [SEP]"
    text_tokenize=tokenizer.tokenize(t)
    if len(text_tokenize)>512:
        chunks=len(text_tokenize)//512
        chunk_size=len(text.split())//(chunks+1)
        for i in range(chunks+1):
            extracted_text=" ".join(text.split(" ")[i*chunk_size:(i+1)*chunk_size])
            new_text_lematize.append(extracted_text)
    else:
        new_text_lematize.append(text)
        

Token indices sequence length is longer than the specified maximum sequence length for this model (3539 > 512). Running this sequence through the model will result in indexing errors


In [37]:
len(new_text_lematize)

714

In [38]:
new_text_unclean=[]
for text in uncleaned_texts:
    t="[CLS] "+text+" [SEP]"
    text_tokenize=tokenizer.tokenize(t)
    if len(text_tokenize)>512:
        chunks=len(text_tokenize)//512
        chunk_size=len(text.split())//(chunks+1)
        for i in range(chunks+1):
            extracted_text=" ".join(text.split(" ")[i*chunk_size:(i+1)*chunk_size])
            new_text_unclean.append(extracted_text)
    else:
        new_text_unclean.append(text)

In [39]:
# Generate embeddings considering complete sentence and use surrounding words for embeddings
def extract_embeddings(Posts:list[str],Expression:list[str],Range:int):
    embed_dict={i:[] for i in Expression}
    for post in Posts:
        t="[CLS] "+post+" [SEP]"
        text_tokenize=tokenizer.tokenize(t)
        if len(text_tokenize)<512:
            tensor_input_ids=torch.tensor([tokenizer.convert_tokens_to_ids(text_tokenize)])
            tensor_segment_ids= torch.tensor([[1]*len(text_tokenize)])
        
            with torch.no_grad():
                outputs=model(tensor_input_ids,tensor_segment_ids)
            token_embeddings = torch.squeeze(outputs[0])
            for term in Expression:
                if term in post:
                    sent=extract_context_words_bigram(sentence=post,target_bigram=term,num_words_before=Range,num_words_after=Range)
                    for t in sent:
                        sent_tokenize=tokenizer.tokenize(t)
                        token_index=[index for index,token in enumerate(text_tokenize) if token in sent_tokenize]
                        sent_embed=[]
                        for index in token_index:
                            sent_embed.append(token_embeddings[index])
                        if sent_embed:
                            embed=torch.mean(torch.stack(sent_embed),dim=0)
                            embed_dict[term].append(embed)
    return embed_dict

In [154]:
t="[CLS] "+uncleaned_texts[1]+ " [SEP]"
text_tokenize=tokenizer.tokenize(t)
tensor_input_ids=torch.tensor([tokenizer.convert_tokens_to_ids(text_tokenize)])
tensor_segment_ids= torch.tensor([[1]*len(text_tokenize)])
        
with torch.no_grad():
    outputs=model(tensor_input_ids,tensor_segment_ids)

In [40]:
impembeddings=extract_embeddings(new_text_lematize,new_terms,Range=0)

In [41]:
no_embed_terms=[]
for i in impembeddings:
    if impembeddings[i]==[]:
        no_embed_terms.append(i)

In [255]:
no_embed_terms

[]

In [119]:
torch.stack((impembeddings['cabal']))

KeyError: 'cabal'

#### To get word embed

In [42]:
impterm_embeddings={}
for term in new_terms:
    if term not in no_embed_terms:
        impterm_embeddings[term]=torch.mean(torch.stack(impembeddings[term]),dim=0)

In [257]:
impterm_embeddings.keys()

dict_keys(['united state', 'goy know', 'blood type', 'vatican ii', 'nostra aetate', 'human right', 'jesus christ', 'far right', 'white people', 'fascist white', 'late 20th', 'space time information', 'white power', 'moon landing', 'middle east', 'critical race', 'church teaching', 'civil right', 'world war', 'old school', 'un special rapporteur', 'federal reserve', 'white men', 'get rid', 'klaus schwab', 'khazarian satanist', 'mel gibson', 'central bank', 'sound like', 'social justice', 'church always taught', 'ruling class', 'go back', 'khazarian mafia', 'white house', 'national socialist', 'destroying nation', 'full disclosure', 'year ago', 'public service', 'white american', 'german people', '20th century', 'white race', 'roy cohn', 'hollywood elite', 'world economic', 'ethnic foids', 'manhattan new york', 'new york city', 'mayer amschel', 'political correctness', 'interest group', 'african identity black', 'frankfurt school', 'coup attempt', 'western civilization', 'catholic church

In [43]:
g_embeddings=extract_embeddings(new_text_unclean,glossary,Range=0)

In [240]:
lematised_glossary

['cabal',
 'cosmopolitan elite',
 'cultural marxism',
 'deicide',
 'holocough',
 'jewish capitalist',
 'the goyim know',
 'jewish communist',
 'jewish lobby',
 'new world order',
 'rothschild',
 'soros',
 'zionist',
 'zionist occupied government',
 'jew down',
 'not the real jews']

In [73]:
torch.stack((g_embeddings['cabal']))

tensor([[ 0.1325,  0.3495,  0.7495,  ..., -0.2590, -0.0175,  0.0388],
        [ 0.2912, -0.1633,  0.5889,  ..., -0.4233,  0.1252,  0.2580],
        [ 0.3700,  0.2295,  0.5064,  ..., -0.2986,  0.2346, -0.3927],
        ...,
        [ 0.6371,  0.2989,  0.6596,  ..., -0.1963, -0.0051,  0.0954],
        [ 0.8584,  0.1012,  0.1747,  ..., -0.5123,  0.1665,  0.2528],
        [ 0.0639, -0.5284,  0.2625,  ..., -0.5178, -0.2199,  0.1731]])

In [44]:
no_embed_terms=[]
for i in g_embeddings:
    if g_embeddings[i]==[]:
        no_embed_terms.append(i)

In [260]:
no_embed_terms

[]

In [45]:
glossary_embeddings={}
for term in glossary:
    if term not in no_embed_terms:
        glossary_embeddings[term]=torch.mean(torch.stack(g_embeddings[term]),dim=0)

#### To get sentence embed

In [None]:
impembeddings,no_embed_terms=extract_embeddings(new_text_lematize,new_terms,rAnge=5)

In [937]:
impterm_embeddings={}
for term in new_terms:
    if term not in no_embed_terms:
        impterm_embeddings[term]=torch.mean(impembeddings[term],dim=0)

In [1186]:
g_embeddings,no_embed_terms=extract_embeddings(new_text_unclean,glossary,rAnge=5)

In [1187]:
glossary_embeddings={}
for term in glossary:
    glossary_embeddings[term]=torch.mean(g_embeddings[term],dim=0)

#### similar for both the embeddings

In [46]:
column=["Term"]+glossary+["Average"]
bert_similarity_df=pd.DataFrame(columns=column)

In [47]:
for term in impterm_embeddings:
    score=0
    sim_score=[term]
    for seed_word in glossary :
        s=np.array(torch.cosine_similarity(impterm_embeddings[term].reshape(1,-1),glossary_embeddings[seed_word].reshape(1,-1)))[0]
        sim_score.append(s)
        score=score+s
    sim_score.append(score/len(glossary))
    bert_similarity_df.loc[len(bert_similarity_df)]=sim_score

In [48]:
threshold=bert_similarity_df['Average'].quantile(0.50)
bert_similarity_df['predicted']=bert_similarity_df['Average'].apply(lambda x:1 if x>threshold else 0)

In [49]:
threshold

0.48371520452201366

In [50]:
pd.DataFrame(bert_similarity_df).to_csv('/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/NEWRESULTS_EXTRACTINGTERMS/bertbase_range0.csv')

## Finetuning the range parameter to check the best performance metrics

In [57]:
actual_labels=imp_terms

In [98]:
len(actual_labels)

125

### Fine tuning for word embed

In [1149]:
predicted_labels={}
column=["Term"]+glossary+["Average"]
for r in range(5,15):
    print("Range = "+str(r))
    impembeddings=extract_embeddings(new_text_lematize,new_terms,Range=r)
    g_embeddings=extract_embeddings(new_text_unclean,glossary,Range=r)
    impterm_embeddings={}
    for term in new_terms:
        impterm_embeddings[term]=torch.mean(torch.stack(impembeddings[term]),dim=0)
    glossary_embeddings={}
    for term in glossary:
        glossary_embeddings[term]=torch.mean(torch.stack(g_embeddings[term]),dim=0)
    bert_similarity_df=pd.DataFrame(columns=column)
    for term in impterm_embeddings:
        score=0
        sim_score=[term]
        for seed_word in glossary :
            s=np.array(torch.cosine_similarity(impterm_embeddings[term].reshape(1,-1),glossary_embeddings[seed_word].reshape(1,-1)))[0]
            sim_score.append(s)
            score=score+s
        sim_score.append(score/len(glossary))
        bert_similarity_df.loc[len(bert_similarity_df)]=sim_score
    threshold=bert_similarity_df['Average'].quantile(0.50)
    bert_similarity_df['predicted']=bert_similarity_df['Average'].apply(lambda x:1 if x>threshold else 0)
    predicted_labels[r]=bert_similarity_df['predicted']

Range = 5


Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors


Range = 6
Range = 7
Range = 8
Range = 9
Range = 10
Range = 11
Range = 12
Range = 13
Range = 14


### Fine tuning for sentence embed

In [965]:
predicted_labels={}
column=["Term"]+glossary+["Average"]
for r in range(5,15):
    print("Range = "+str(r))
    impembeddings,no_term_embed=extract_embeddings(new_text_lematize,new_terms,rAnge=r)
    g_embeddings,no_term_embed=extract_embeddings(new_text_unclean,glossary,rAnge=r)
    impterm_embeddings={}
    for term in new_terms:
        impterm_embeddings[term]=torch.mean(impembeddings[term],dim=0)
    glossary_embeddings={}
    for term in glossary:
        glossary_embeddings[term]=torch.mean(g_embeddings[term],dim=0)
    bert_similarity_df=pd.DataFrame(columns=column)
    for term in impterm_embeddings:
        score=0
        sim_score=[term]
        for seed_word in glossary :
            s=np.array(torch.cosine_similarity(impterm_embeddings[term].reshape(1,-1),glossary_embeddings[seed_word].reshape(1,-1)))[0]
            sim_score.append(s)
            score=score+s
        sim_score.append(score/len(glossary))
        bert_similarity_df.loc[len(bert_similarity_df)]=sim_score
    threshold=bert_similarity_df['Average'].quantile(0.50)
    bert_similarity_df['predicted']=bert_similarity_df['Average'].apply(lambda x:1 if x>threshold else 0)
    predicted_labels[r]=bert_similarity_df['predicted']

Range = 5
Range = 6
Range = 7
Range = 8
Range = 9
Range = 10
Range = 11
Range = 12
Range = 13
Range = 14


In [1150]:
predicted_labels

{5: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    1
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 6: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 7: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 8: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 9: 0     1
 1     0
 2     0
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 10: 0     1
 1     0
 2     0
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 11: 0     1
 1     0
 2     0
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Le

In [413]:
for r in range(3,13):
    print("Performance matrix with range of words :  "+ str(r))
    print("Accuracy : ",accuracy_score(predicted_labels[r],actual_labels))
    print("Precision : ",precision_score(predicted_labels[r],actual_labels))
    print("Recall  : ",recall_score(predicted_labels[r],actual_labels))
    print("F1 Score  : ",f1_score(predicted_labels[r],actual_labels))

Performance matrix with range of words :  3
Accuracy :  0.5873015873015873
Precision :  0.5675675675675675
Recall  :  0.6774193548387096
F1 Score  :  0.6176470588235294
Performance matrix with range of words :  4
Accuracy :  0.4603174603174603
Precision :  0.4594594594594595
Recall  :  0.5483870967741935
F1 Score  :  0.5
Performance matrix with range of words :  5
Accuracy :  0.5873015873015873
Precision :  0.5675675675675675
Recall  :  0.6774193548387096
F1 Score  :  0.6176470588235294
Performance matrix with range of words :  6
Accuracy :  0.6825396825396826
Precision :  0.6486486486486487
Recall  :  0.7741935483870968
F1 Score  :  0.7058823529411764
Performance matrix with range of words :  7
Accuracy :  0.6190476190476191
Precision :  0.5945945945945946
Recall  :  0.7096774193548387
F1 Score  :  0.6470588235294118
Performance matrix with range of words :  8
Accuracy :  0.6825396825396826
Precision :  0.6486486486486487
Recall  :  0.7741935483870968
F1 Score  :  0.7058823529411764
P

In [1151]:
results=pd.DataFrame.from_dict(predicted_labels)
results['avg']=results.mean(axis=1)


In [1152]:
results['Predicted']=results['avg'].apply(lambda x:1 if x>0.8 else 0)

In [969]:
results.Predicted

0     1
1     1
2     1
3     1
4     1
     ..
89    0
90    0
91    1
92    0
93    0
Name: Predicted, Length: 94, dtype: int64

In [1153]:
pd.DataFrame(results).to_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/Final fine tune bert word embed/finetune_range_r_parameter_lematize5_15.csv")

### Checking with antisemetic terms definations

In [163]:
terms_definition_df=pd.read_csv("Antisemetism_term_definition.csv")
terms_definition_df.head()

Unnamed: 0,Term or Phrase,Regular Definition,Antisemitic Definition,Related Terms,Direct or Indirect,Antisemitic Trope,Emerging Term (Y/N),Emerging Trope (Y/N),Extremism Rank,Notes,Means of Discovery,Discovery Date/Time (Emerging Only),Sources,Source Link,Social Media Site
0,Blood libel,perpetuated accusation that Jews have murdered...,The blood libel charge—also known as the ritua...,"Blood thirsty,",Indirect,,No,,,"A Canadian metal group is called ""Blood Libel""...",,,American Jewish Committee,Page 4: https://www.ajc.org/sites/default/file...,
1,Cabal,"a small, powerful group that seeks to establis...",Jews have long been accused of being part of a...,,Indirect,,Undetermined,,,,,,American Jewish Committee,Page 4: https://www.ajc.org/sites/default/file...,
2,Clannish,of or relating to a clan; tending to associate...,Referring to Jews as clannish is an antisemiti...,,Indirect,,No,,,,,,American Jewish Committee,Page 5: https://www.ajc.org/sites/default/file...,
3,Conspiracy theory,a belief that some covert but influential orga...,"From medieval times until the present day, con...",,Direct,,Yes,,,,,,American Jewish Committee,Page 5 and 6: https://www.ajc.org/sites/defaul...,
4,Control,power or authority to guide or manage,False reports that claim Jews control the medi...,,Indirect,,Undetermined,,,,,,American Jewish Committee,Page 6: https://www.ajc.org/sites/default/file...,


In [164]:
terms_definition_df=terms_definition_df[terms_definition_df["Term or Phrase"].apply(lambda x:True if x.lower() in glossary else False)==True]


In [165]:
terms_definition_df['Term or Phrase']

1                                 Cabal
5                    Cosmopolitan elite
8                      Cultural Marxism
10                              Deicide
15                     “The Goyim Know”
17                            Holocough
20                    Jewish capitalist
21                     Jewish communist
26                         Jewish lobby
30                      New World Order
31                  "not the real Jews"
35                           Rothschild
41                                Soros
43                      Zionist / “Zio”
44    Zionist Occupied Government (ZOG)
Name: Term or Phrase, dtype: object

In [166]:
antisemetic_definition={}
for i in range(len(terms_definition_df)):
    antisemetic_definition[terms_definition_df['Term or Phrase'].iloc[i]]=terms_definition_df['Antisemitic Definition'].iloc[i]

In [167]:
antisemetic_definition_embeddings={}
for term in antisemetic_definition:
    tokens=tokenizer(antisemetic_definition[term],return_tensors='pt')
    with torch.no_grad():
        output=model(**tokens)
    antisemetic_definition_embeddings[term.lower()]=output['pooler_output']

In [168]:
column=["Term"]+glossary+["Average"]
bert_similarity_df=pd.DataFrame(columns=column)

In [169]:
for term in impterm_embeddings:
    score=0
    sim_score=[term]
    for seed_word in glossary :
        s=np.array(torch.cosine_similarity(impterm_embeddings[term].reshape(1,-1),antisemetic_definition_embeddings[seed_word].reshape(1,-1)))[0]
        sim_score.append(s)
        score=score+s
    sim_score.append(score/len(glossary))
    bert_similarity_df.loc[len(bert_similarity_df)]=sim_score

In [170]:
threshold=sum(bert_similarity_df.Average)/len(bert_similarity_df)
bert_similarity_df['predicted']=bert_similarity_df['Average'].apply(lambda x:1 if x>threshold else 0)

In [172]:
threshold

0.8104986549226525

In [171]:
pd.DataFrame(bert_similarity_df).to_csv('Raza_bigrams_prediction.csv')

In [32]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(bert_similarity_df)

                          Term     cabal  cosmopolitan elite  \
0             frankfurt school  0.989069            0.989927   
1              critical theory  0.981373            0.989109   
2                  world order  0.998177            0.987921   
3            conspiracy theory  0.990634            0.991050   
4                    new world  0.998318            0.986168   
5                 united state  0.997652            0.980121   
6             cultural marxist  0.979573            0.988594   
7                critical race  0.980110            0.988045   
8                  race theory  0.985699            0.989968   
9         critical race theory  0.981106            0.989057   
10       political correctness  0.980166            0.988972   
11              thierry baudet  0.980898            0.981127   
12              social justice  0.983762            0.989501   
13                  deep state  0.998774            0.986023   
14            marxism cultural  0.943070

## Classification of 4 Approaches

##### Finetune bert+ word embed

In [9]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/Final fine tune bert word embed/finetune_range_r_parameter_lematize1_10.csv")


In [1158]:
data_file.Actual

0     0
1     1
2     1
3     0
4     0
     ..
89    0
90    0
91    0
92    0
93    0
Name: Actual, Length: 94, dtype: int64

In [10]:
### 1 using Bert model to extract embedding using tokens to compare similarity between imp terms and glossary
#data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/1.csv",encoding='latin-1')
print("Results for : Finetune bert+ word embed")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Actual'],data_file['Predicted']))
print("Precision : ",precision_score(data_file['Actual'],data_file['Predicted']))
print("Recall  : ",recall_score(data_file['Actual'],data_file['Predicted']))
print("F1 Score  : ",f1_score(data_file['Actual'],data_file['Predicted']))

Results for : Finetune bert+ word embed
-----------------------------------------
Accuracy :  0.7978723404255319
Precision :  0.631578947368421
Recall  :  0.8275862068965517
F1 Score  :  0.716417910447761


In [1178]:
confusion_matrix(data_file['Actual'],data_file['Predicted'])

array([[51, 14],
       [ 5, 24]])

##### Bert model + word embed

In [6]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/Final fine tune bert sentence embed/finetune_range_r_parameter_lematize5_15.csv")


In [983]:
print("Results for : bert model + word embed")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Predicted'],data_file['Actual']))
print("Precision : ",precision_score(data_file['Predicted'],data_file['Actual']))
print("Recall  : ",recall_score(data_file['Predicted'],data_file['Actual']))
print("F1 Score  : ",f1_score(data_file['Predicted'],data_file['Actual']))

Results for : bert model + word embed
-----------------------------------------
Accuracy :  0.6914893617021277
Precision :  0.6896551724137931
Recall  :  0.5
F1 Score  :  0.5797101449275363


In [1171]:
confusion_matrix(data_file['Actual'],data_file['Predicted'])

array([[47, 18],
       [13, 16]])

##### Fine tune bert model + sentence embed

In [8]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/DSAA 2024/Solution 2-1.csv")


In [9]:
print("Results for : Fine tune bert model + sentence embed")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Actual'],data_file['Predicted']))
print("Precision : ",precision_score(data_file['Actual'],data_file['Predicted']))
print("Recall  : ",recall_score(data_file['Actual'],data_file['Predicted']))
print("F1 Score  : ",f1_score(data_file['Actual'],data_file['Predicted']))

Results for : Fine tune bert model + sentence embed
-----------------------------------------
Accuracy :  0.6808510638297872
Precision :  0.4857142857142857
Recall  :  0.5862068965517241
F1 Score  :  0.53125


##### Bert model + Sentence embed

In [17]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/baseline_sentence_embed.csv")

In [18]:
print("Results for : Fine tune bert model + sentence embed")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Actual'],data_file['Predicted']))
print("Precision : ",precision_score(data_file['Actual'],data_file['Predicted']))
print("Recall  : ",recall_score(data_file['Actual'],data_file['Predicted']))
print("F1 Score  : ",f1_score(data_file['Actual'],data_file['Predicted']))

Results for : Fine tune bert model + sentence embed
-----------------------------------------
Accuracy :  0.8076923076923077
Precision :  0.38461538461538464
Recall  :  0.7142857142857143
F1 Score  :  0.5
