In [1]:
#All necessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Using for cleaning and Pre-Processing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag
import spacy
en = spacy.load('en_core_web_sm')
# Loading transformers library
import torch
from transformers import AutoTokenizer, BertModel,AutoModel
tokenizer = AutoTokenizer.from_pretrained('Dhanush66/AntismetisimLargedata-finetuned-MLM-NEW')
model = BertModel.from_pretrained('Dhanush66/AntismetisimLargedata-finetuned-MLM-NEW')
# To generate embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS,Phraser
from gensim.models import Word2Vec, KeyedVectors #To load the model
from cleantext import clean
#Visualisations
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Calibri"
import matplotlib.gridspec as gridspec
import os
from tqdm import tqdm
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from preprocesss import preprocess_batch
#To check for performance
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from keybert import KeyBERT
from pygtrie import CharTrie
from collections import Counter
stopword=list(stopwords.words('english'))

In [2]:
data=pd.read_csv("Unmasking Antisemitism SRI Data Set - Reporting Layer.csv")
data=data[['Term or Phrase','Post Text']]
data['Term or Phrase'].unique()

#Get the number of rows
len(data)

667

In [3]:
#Drop the duplicates
data=data.drop_duplicates()
len(data)

644

In [4]:
glossary=['cabal','cosmopolitan elite','cultural marxism','deicide','holocough','jewish capitalist','the goyim know',
           'jewish communist','jewish lobby','new world order','rothschild', 'soros','zionist',
         'zionist occupied government','jew down','not the real jews'] 
emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                                "]+", flags=re.UNICODE)

In [5]:
data=data[-data["Post Text"].isnull()]


In [6]:
lematizer=WordNetLemmatizer()
def lematize(text):
    text=text.split()
    lema=[]
    for i in text:
        lema.append(lematizer.lemmatize(i))
    return (" ".join(lema))

In [7]:
data['clean']=data['Post Text'].apply(lambda x:x.lower())
#removing the links:
data['clean']=data['clean'].apply(lambda x:re.sub(r"http\S+","",str(x)))
#Getting the lematised text to get the original form of the word
data['lematize']=data['clean'].apply(lambda x:lematize(x))
#data['clean']=data['clean'].apply(lambda x:x.translate(str.maketrans("","",string.punctuation)))
uncleaned_texts=list(data["clean"])
uncleaned_lematised_text=list(data['lematize'])

In [8]:
def preprocess(text,postag,stopword,lematizer):
    pos_removal=[tag[0] for tag in pos_tag(text.split()) if tag[1] in postag]
    return(" ".join([lematizer.lemmatize(i) for i in pos_removal if i not in stopword]))

In [9]:
postag=["JJ",'NN','NNS','NNP','VBP']
stopword=list(stopwords.words('english'))
def preprocess_parallel(sentences, postag, stopword, num_workers):
    chunk_size = len(sentences) // num_workers
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences)+1, chunk_size)]
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = executor.map(preprocess_batch, chunks, [postag] * num_workers, [stopword] * num_workers)
    processed_sentences = [item for sublist in results for item in sublist]
    return processed_sentences

In [10]:
processed_sentences_parallel = preprocess_parallel(sentences=uncleaned_texts, 
                                                   postag=postag, stopword=stopword, num_workers=1)

In [11]:
data["clean"]=processed_sentences_parallel

In [12]:
data['lematize']=data['clean'].apply(lambda x:lematize(x))

In [13]:
post_text=list(data['Post Text'])

lematised_texts=list(data['lematize'])

## Finding Important terms

In [14]:
def emerging(terms):

    check=True
    while(tqdm(check)):
        #For each time step
        important_terms={}
        text=' '.join(data.lematize)
        #Generate TF-IDF Matrix
        tf_idfvectorizer=TfidfVectorizer(ngram_range=(2,3))
        tfidf=tf_idfvectorizer.fit_transform(data.lematize)
        features=tf_idfvectorizer.get_feature_names_out()
        
        #Exporting all Bigrams
        #pd.DataFrame(features).to_csv("Bigrams "+str(z)+".csv",header=['All Bigrams'])
        
        #Finding Emerging terms/Initializing glossary
        emerging_terms=list(set(features).difference(glossary))

        emerging_terms_trie = CharTrie()
        for term in emerging_terms:
            emerging_terms_trie[term] = True

                #glossary.append(i)
        print("started")
        #Finding the index of emerging terms
        findex=[i for i, word in enumerate(features) if word in emerging_terms_trie]
        
        tfidf_highest=tfidf.max(axis=0)
        #Finding the highest tf-idf value for all the feature names across all documents
        tfidf_values={}
        for i in tqdm(findex):
            tfidf_values[features[i]]=tfidf_highest.toarray()[0][i]
        
        tfidf_values=sorted(tfidf_values.items(),key=lambda x:x[1],reverse=True)
        print("Ended")
        #Exporting Top Tf-idf values with bigrams
        
        #pd.DataFrame(tfidf_values).to_csv("SubtractionBigram-Tfidf "+str(z)+".csv",header  = ['Bigrams','TF-IDF Values'])
        #print(tfidf_values[:20])
    
        #Extracting bigrams after TF-IDF threshold cut off
        a=[j for i, j in tfidf_values]
        th=np.mean(a)
        print("Threshold ",th)
        final_tfidf={}
        for i in tfidf_values:
            if i[1]>th and len(i[0].split()[0])<20 and len(i[0].split()[1])<20:
                final_tfidf[i[0]]=i[1]
        #Exporting Tf-idf values above threshold
        #pd.DataFrame.from_dict(data=final_tfidf, orient='index').to_csv("Bigram_threshold "+str(z)+".csv",index_label='Bigrams',header  = ['TF-IDF Values'])
        
        #Frequency of words in that window size


        words_frequency={}
        text1=text.split(" ")
        pairs=list(zip(text1[:-1],text1[1:]))
        trigram_pairs=list(zip(text1[:-2],text1[1:-1],text1[2:]))
        bigrams=[' '.join(i) for i in pairs]
        trigrams=[" ".join(i) for i in trigram_pairs]
        bigram_frequency=Counter(bigrams)
        trigram_frequency=Counter(trigrams)
        for word in (final_tfidf.keys()):
            if len(word.split())==2:
                words_frequency[word]=bigram_frequency[word]
            else:
                words_frequency[word]=trigram_frequency[word]
                
        words_frequency=sorted(words_frequency.items(),key=lambda x:x[1],reverse=True)
        
        #Appending terms in Imp terms list
        for i in range(terms):
            important_terms[words_frequency[i][0]] = words_frequency[i][1]
        
        
        # Exporting Bigrams with frequency
        #pd.DataFrame(words_frequency).to_csv("Bigram_frequency "+str(z)+".csv",header  = ['Bigrams','Frequency'])
        
        #Exporting Important terms that is coming from  window
        #pd.DataFrame.from_dict(data=important_terms, orient='index').to_csv("Important_terms "+str(z)+".csv",index_label='Bigrams',header  = ['Frequency'])

        return important_terms


In [15]:
imp_terms=emerging(200)

0it [00:00, ?it/s]


started


100%|██████████| 61752/61752 [00:04<00:00, 12621.60it/s]


Ended
Threshold  0.07511590097790932


In [16]:
imp_terms

{'new world': 68,
 'world order': 42,
 'zionist occupied': 32,
 'united state': 26,
 'goy know': 25,
 'george soros': 22,
 'blood type': 22,
 'vatican ii': 22,
 'nostra aetate': 21,
 'occupied government': 17,
 'human right': 16,
 'jesus christ': 15,
 'deep state': 13,
 'zionist reptilian': 12,
 'far right': 11,
 'white people': 11,
 'jewish community': 11,
 'fascist white': 11,
 'new york': 10,
 'including zionist': 10,
 'church always': 10,
 'un special': 9,
 'late 20th': 9,
 'space time': 9,
 'space time information': 9,
 'time information': 9,
 'white power': 9,
 'african identity': 8,
 'zionist jewish': 8,
 'moon landing': 8,
 'middle east': 8,
 'critical race': 8,
 'church teaching': 8,
 'people color': 8,
 'civil right': 8,
 'world war': 7,
 'old school': 7,
 'zionist time': 7,
 'special rapporteur': 7,
 'un special rapporteur': 7,
 'federal reserve': 7,
 'white men': 7,
 'interfere behalf': 7,
 'say jew': 7,
 'throw jew': 6,
 'get rid': 6,
 'need get': 6,
 'zionist jew': 6,
 'k

In [17]:
important_terms=[]
for key in imp_terms:
    important_terms.append(key)

In [31]:
important_terms

['new world',
 'world order',
 'zionist occupied',
 'united state',
 'goy know',
 'george soros',
 'blood type',
 'vatican ii',
 'nostra aetate',
 'occupied government',
 'human right',
 'jesus christ',
 'deep state',
 'zionist reptilian',
 'far right',
 'white people',
 'jewish community',
 'fascist white',
 'new york',
 'including zionist',
 'church always',
 'un special',
 'late 20th',
 'space time',
 'space time information',
 'time information',
 'white power',
 'african identity',
 'zionist jewish',
 'moon landing',
 'middle east',
 'critical race',
 'church teaching',
 'people color',
 'civil right',
 'world war',
 'old school',
 'zionist time',
 'special rapporteur',
 'un special rapporteur',
 'federal reserve',
 'white men',
 'interfere behalf',
 'say jew',
 'throw jew',
 'get rid',
 'need get',
 'zionist jew',
 'klaus schwab',
 'khazarian satanist',
 'mel gibson',
 'central bank',
 'sound like',
 'social justice',
 'american jew',
 'jewish lobby group',
 'lobby group',
 'publ

In [64]:
#Only consider original form of words.
new_terms=[]

for t in important_terms:
    for text in uncleaned_texts:
        if t in text:
            new_terms.append(t)
            break

In [65]:
len(new_terms)

155

In [66]:
### Lets first remove the appearence terms to make it emerging terms
new_glossary=[]
for term in glossary:
    if len(term.split(" "))==1:
        new_glossary.append(term)
    elif len(term.split(" "))==2:
        new_glossary.append(term.split()[0])
        new_glossary.append(term.split()[1])
    else:
        terms=list(zip(term.split()[:-1],term.split()[1:]))
        pairs=[' '.join(i)  for i in terms]
        pairs.append(' '.join([term.split()[0],term.split()[-1]]))
        new_glossary+=pairs
    

In [67]:
for g in new_glossary:
    for t in new_terms:
        if g in t:
            new_terms.remove(t)

In [68]:
len(new_terms)

121

### Removing obvious jewish relevent terms to make it coded

In [69]:
jewish_topics=['kike','jew','zionist','nazi']
for j in jewish_topics:
    for t in new_terms:
        if j in t:
            new_terms.remove(t)

In [70]:
len(new_terms)

119

### Get the original form of the term

In [59]:
trigrams=[]
for term in new_terms:
    if len(term.split())==3:
        trigrams.append(term)


In [60]:
#Removing bigrams if they are already in trigrams
for exp in trigrams:
    terms=list(zip(exp.split()[:-1],exp.split()[1:]))
    pairs=[' '.join(i)  for i in terms]
    pairs.append(' '.join([exp.split()[0],exp.split()[-1]]))
    if pairs[0] in new_terms:
        new_terms.remove(pairs[0])
    if pairs[1] in new_terms:
        new_terms.remove(pairs[1])
    if pairs[2] in new_terms:
        new_terms.remove(pairs[2])
    

In [61]:
len(new_terms)

97

In [62]:
new_terms

['united state',
 'blood type',
 'vatican ii',
 'nostra aetate',
 'human right',
 'jesus christ',
 'far right',
 'white people',
 'fascist white',
 'late 20th',
 'space time information',
 'white power',
 'moon landing',
 'middle east',
 'critical race',
 'church teaching',
 'civil right',
 'world war',
 'old school',
 'un special rapporteur',
 'federal reserve',
 'white men',
 'get rid',
 'klaus schwab',
 'khazarian satanist',
 'mel gibson',
 'central bank',
 'sound like',
 'social justice',
 'church always taught',
 'ruling class',
 'go back',
 'khazarian mafia',
 'white house',
 'national socialist',
 'destroying nation',
 'full disclosure',
 'public service',
 'white american',
 'german people',
 '20th century',
 'white race',
 'roy cohn',
 'world economic',
 'ethnic foids',
 'manhattan new york',
 'new york city',
 'mayer amschel',
 'political correctness',
 'interest group',
 'african identity black',
 'frankfurt school',
 'coup attempt',
 'western civilization',
 'catholic churc

In [71]:
glossary=['cabal','cultural marxism','deicide','holocough','jewish capitalist','the goyim know',
           'jewish communist','jewish lobby','new world order','rothschild', 'soros','zionist',
         'zionist occupied government','not the real jews','jew down','cosmopolitan elite'] 

In [72]:
expressions=glossary+new_terms


## Extracting semantic similarity

In [73]:
def extract_context_words_bigram(sentence, target_bigram, num_words_before, num_words_after):
    words = sentence.split()
    context_words = []
    if len(target_bigram.split())==4:
        for i in range(len(words)-3):
            fourgram=" ".join([words[i],words[i+1],words[i+2],words[i+3]])
            if fourgram==target_bigram:
                start_index = max(0, i - num_words_before)
                end_index = min(len(words), i + num_words_after + 4)
                context_words.append(words[start_index:end_index])            
    elif len(target_bigram.split())==3:
        for i in range(len(words) - 2):  # Loop through pairs of consecutive words
            trigram=" ".join([words[i],words[i+1],words[i+2]])
            if trigram == target_bigram: 
                start_index = max(0, i - num_words_before)
                end_index = min(len(words), i + num_words_after + 3)
                context_words.append(words[start_index:end_index])
    else:
        for i in range(len(words) - 1):  # Loop through pairs of consecutive words
            bigram = " ".join([words[i], words[i + 1]])
            if bigram == target_bigram or words[i]==target_bigram:
                start_index = max(0, i - num_words_before)
                end_index = min(len(words), i + num_words_after + 2)
                context_words.append(words[start_index:end_index])

    return [" ".join(i) for i in context_words]

### This is to extracting embeddings as words

In [75]:
new_text_lematize=[]
for text in uncleaned_lematised_text:
    t="[CLS] "+text+" [SEP]"
    text_tokenize=tokenizer.tokenize(t)
    if len(text_tokenize)>512:
        chunks=len(text_tokenize)//512
        chunk_size=len(text.split())//(chunks+1)
        for i in range(chunks+1):
            extracted_text=" ".join(text.split(" ")[i*chunk_size:(i+1)*chunk_size])
            new_text_lematize.append(extracted_text)
    else:
        new_text_lematize.append(text)
        

Token indices sequence length is longer than the specified maximum sequence length for this model (3539 > 512). Running this sequence through the model will result in indexing errors


In [76]:
len(new_text_lematize)

714

In [38]:
new_text_unclean=[]
for text in uncleaned_texts:
    t="[CLS] "+text+" [SEP]"
    text_tokenize=tokenizer.tokenize(t)
    if len(text_tokenize)>512:
        chunks=len(text_tokenize)//512
        chunk_size=len(text.split())//(chunks+1)
        for i in range(chunks+1):
            extracted_text=" ".join(text.split(" ")[i*chunk_size:(i+1)*chunk_size])
            new_text_unclean.append(extracted_text)
    else:
        new_text_unclean.append(text)

In [78]:
# Generate embeddings considering complete sentence and use surrounding words for embeddings
def extract_embeddings(Posts:list[str],Expression:list[str],Range:int):
    embed_dict={i:[] for i in Expression}
    for post in Posts:
        t="[CLS] "+post+" [SEP]"
        text_tokenize=tokenizer.tokenize(t)
        if len(text_tokenize)<512:
            tensor_input_ids=torch.tensor([tokenizer.convert_tokens_to_ids(text_tokenize)])
            tensor_segment_ids= torch.tensor([[1]*len(text_tokenize)])
        
            with torch.no_grad():
                outputs=model(tensor_input_ids,tensor_segment_ids)
            token_embeddings = torch.squeeze(outputs[0])
            for term in Expression:
                if term in post:
                    sent=extract_context_words_bigram(sentence=post,target_bigram=term,num_words_before=Range,num_words_after=Range)
                    for t in sent:
                        sent_tokenize=tokenizer.tokenize(t)
                        token_index=[index for index,token in enumerate(text_tokenize) if token in sent_tokenize]
                        sent_embed=[]
                        for index in token_index:
                            sent_embed.append(token_embeddings[index])
                        if sent_embed:
                            embed=torch.mean(torch.stack(sent_embed),dim=0)
                            embed_dict[term].append(embed)
    return embed_dict

In [79]:
impembeddings=extract_embeddings(new_text_lematize,new_terms,Range=0)

In [41]:
no_embed_terms=[]
for i in impembeddings:
    if impembeddings[i]==[]:
        no_embed_terms.append(i)

In [42]:
impterm_embeddings={}
for term in new_terms:
    if term not in no_embed_terms:
        impterm_embeddings[term]=torch.mean(torch.stack(impembeddings[term]),dim=0)

In [43]:
g_embeddings=extract_embeddings(new_text_unclean,glossary,Range=0)

In [44]:
no_embed_terms=[]
for i in g_embeddings:
    if g_embeddings[i]==[]:
        no_embed_terms.append(i)

In [45]:
glossary_embeddings={}
for term in glossary:
    if term not in no_embed_terms:
        glossary_embeddings[term]=torch.mean(torch.stack(g_embeddings[term]),dim=0)

#### To get sentence embed

In [74]:
def extract_embeddings(Posts:list[str],Expression:list[str],rAnge:int):
    terms_not_intext=[]
    embed_dict={}
    sent_dict={i:[] for i in Expression}
    for term in Expression:
        for post in Posts:
            if term in post:
                sents=extract_context_words_bigram(sentence=post,target_bigram=term,num_words_before=rAnge,num_words_after=rAnge)
                if bool(sents):
                    for sent in sents:
                        sent_dict[term].append(sent)
    for key in sent_dict:
        if bool(sent_dict[key]):
            sent_tokenize=tokenizer(sent_dict[key],return_tensors='pt',padding=True,max_length=512,truncation=True)
            with torch.no_grad():
                model_output=model(**sent_tokenize)['pooler_output']
                embed_dict[key]=model_output
        else:
            terms_not_intext.append(key)
        
    return embed_dict,terms_not_intext

In [None]:
impembeddings,no_embed_terms=extract_embeddings(new_text_lematize,new_terms,rAnge=5)

In [937]:
impterm_embeddings={}
for term in new_terms:
    if term not in no_embed_terms:
        impterm_embeddings[term]=torch.mean(impembeddings[term],dim=0)

In [1186]:
g_embeddings,no_embed_terms=extract_embeddings(new_text_unclean,glossary,rAnge=5)

In [1187]:
glossary_embeddings={}
for term in glossary:
    glossary_embeddings[term]=torch.mean(g_embeddings[term],dim=0)

### Fine tuning for word embed

In [1149]:
predicted_labels={}
column=["Term"]+glossary+["Average"]
for r in range(5,15):
    print("Range = "+str(r))
    impembeddings=extract_embeddings(new_text_lematize,new_terms,Range=r)
    g_embeddings=extract_embeddings(new_text_unclean,glossary,Range=r)
    impterm_embeddings={}
    for term in new_terms:
        impterm_embeddings[term]=torch.mean(torch.stack(impembeddings[term]),dim=0)
    glossary_embeddings={}
    for term in glossary:
        glossary_embeddings[term]=torch.mean(torch.stack(g_embeddings[term]),dim=0)
    bert_similarity_df=pd.DataFrame(columns=column)
    for term in impterm_embeddings:
        score=0
        sim_score=[term]
        for seed_word in glossary :
            s=np.array(torch.cosine_similarity(impterm_embeddings[term].reshape(1,-1),glossary_embeddings[seed_word].reshape(1,-1)))[0]
            sim_score.append(s)
            score=score+s
        sim_score.append(score/len(glossary))
        bert_similarity_df.loc[len(bert_similarity_df)]=sim_score
    threshold=bert_similarity_df['Average'].quantile(0.50)
    bert_similarity_df['predicted']=bert_similarity_df['Average'].apply(lambda x:1 if x>threshold else 0)
    predicted_labels[r]=bert_similarity_df['predicted']

Range = 5


Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors


Range = 6
Range = 7
Range = 8
Range = 9
Range = 10
Range = 11
Range = 12
Range = 13
Range = 14


### Fine tuning for sentence embed

In [965]:
predicted_labels={}
column=["Term"]+glossary+["Average"]
for r in range(5,15):
    print("Range = "+str(r))
    impembeddings,no_term_embed=extract_embeddings(new_text_lematize,new_terms,rAnge=r)
    g_embeddings,no_term_embed=extract_embeddings(new_text_unclean,glossary,rAnge=r)
    impterm_embeddings={}
    for term in new_terms:
        impterm_embeddings[term]=torch.mean(impembeddings[term],dim=0)
    glossary_embeddings={}
    for term in glossary:
        glossary_embeddings[term]=torch.mean(g_embeddings[term],dim=0)
    bert_similarity_df=pd.DataFrame(columns=column)
    for term in impterm_embeddings:
        score=0
        sim_score=[term]
        for seed_word in glossary :
            s=np.array(torch.cosine_similarity(impterm_embeddings[term].reshape(1,-1),glossary_embeddings[seed_word].reshape(1,-1)))[0]
            sim_score.append(s)
            score=score+s
        sim_score.append(score/len(glossary))
        bert_similarity_df.loc[len(bert_similarity_df)]=sim_score
    threshold=bert_similarity_df['Average'].quantile(0.50)
    bert_similarity_df['predicted']=bert_similarity_df['Average'].apply(lambda x:1 if x>threshold else 0)
    predicted_labels[r]=bert_similarity_df['predicted']

Range = 5
Range = 6
Range = 7
Range = 8
Range = 9
Range = 10
Range = 11
Range = 12
Range = 13
Range = 14


In [1150]:
predicted_labels

{5: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    1
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 6: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 7: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 8: 0     1
 1     0
 2     1
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 9: 0     1
 1     0
 2     0
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 10: 0     1
 1     0
 2     0
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Length: 94, dtype: int64,
 11: 0     1
 1     0
 2     0
 3     0
 4     0
      ..
 89    0
 90    0
 91    1
 92    0
 93    1
 Name: predicted, Le

In [1151]:
results=pd.DataFrame.from_dict(predicted_labels)
results['avg']=results.mean(axis=1)


In [1152]:
results['Predicted']=results['avg'].apply(lambda x:1 if x>0.8 else 0)

In [969]:
results.Predicted

0     1
1     1
2     1
3     1
4     1
     ..
89    0
90    0
91    1
92    0
93    0
Name: Predicted, Length: 94, dtype: int64

In [1153]:
pd.DataFrame(results).to_csv("Solution2-2.csv")

## Classification of 4 Approaches

##### Solution 1-1

In [1]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import pandas as pd

In [2]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/DSAA 2024/Solution 1-1.csv")


In [3]:
### 1 using Bert model to extract embedding using tokens to compare similarity between imp terms and glossary
#data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/Scripts/1.csv",encoding='latin-1')
print("Results for : Solution 1-1")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Actual'],data_file['Predicted']))
print("Precision : ",precision_score(data_file['Actual'],data_file['Predicted']))
print("Recall  : ",recall_score(data_file['Actual'],data_file['Predicted']))
print("F1 Score  : ",f1_score(data_file['Actual'],data_file['Predicted']))

Results for : Solution 1-1
-----------------------------------------
Accuracy :  0.7884615384615384
Precision :  0.3888888888888889
Recall  :  1.0
F1 Score  :  0.56


##### Solution 1-2

In [4]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/DSAA 2024/Solution 1-2.csv")


In [5]:
print("Results for : Solution 1-2")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Predicted'],data_file['Actual']))
print("Precision : ",precision_score(data_file['Predicted'],data_file['Actual']))
print("Recall  : ",recall_score(data_file['Predicted'],data_file['Actual']))
print("F1 Score  : ",f1_score(data_file['Predicted'],data_file['Actual']))

Results for : Solution 1-2
-----------------------------------------
Accuracy :  0.7692307692307693
Precision :  1.0
Recall  :  0.3684210526315789
F1 Score  :  0.5384615384615384


##### Solution 2-1

In [7]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/DSAA 2024/Solution 2-1.csv")


In [8]:
print("Results for : Solution 2-1")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Actual'],data_file['Predicted']))
print("Precision : ",precision_score(data_file['Actual'],data_file['Predicted']))
print("Recall  : ",recall_score(data_file['Actual'],data_file['Predicted']))
print("F1 Score  : ",f1_score(data_file['Actual'],data_file['Predicted']))

Results for : Solution 2-1
-----------------------------------------
Accuracy :  0.6808510638297872
Precision :  0.4857142857142857
Recall  :  0.5862068965517241
F1 Score  :  0.53125


##### solution 2-2

In [9]:
data_file=pd.read_csv("/Users/dhanushkikkisetti/Documents/Research Assistant/DSAA 2024/Solution 2-2.csv")

In [10]:
print("Results for : Solution 2-2")
print("-----------------------------------------")
print("Accuracy : ",accuracy_score(data_file['Actual'],data_file['Predicted']))
print("Precision : ",precision_score(data_file['Actual'],data_file['Predicted']))
print("Recall  : ",recall_score(data_file['Actual'],data_file['Predicted']))
print("F1 Score  : ",f1_score(data_file['Actual'],data_file['Predicted']))

Results for : Solution 2-2
-----------------------------------------
Accuracy :  0.7978723404255319
Precision :  0.631578947368421
Recall  :  0.8275862068965517
F1 Score  :  0.716417910447761
