In [1]:
import re
import unicodedata
import nltk
import random
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from collections import Counter
import csv
random.seed(42)

In [2]:
# Read text files from the local "dataset" folder

# YORUBA
with open('dataset/bot_data_yoruba.txt', 'r', encoding='utf-8') as f:
    bot_data_yoruba = f.read()
with open('dataset/human_data_yoruba.txt', 'r', encoding='utf-8') as f:
    human_data_yoruba = f.read()
    
# KASHMIRI    
with open('dataset/bot_data_kashmiri.txt', 'r', encoding='utf-8') as f:
     bot_data_kashmiri = f.read()
with open('dataset/human_data_kashmiri.txt', 'r', encoding='utf-8') as f:
     human_data_kashmiri = f.read()

In [3]:
# print(human_data_yoruba)

In [4]:
# print(bot_data_yoruba)

In [5]:
# print(human_data_kashmiri)

In [6]:
# print(bot_data_kashmiri)

In [7]:
def summarize_text(text):
  """
  Summarizes the given text by printing the number of words and display first 20 words.
  """
  # Count the number of words and characters
  num_words = len(text.split())

  # Print the summary information
  print(f"Number of words: {num_words}")

  # Print the first 20 words
  print(f"First 20 words: {text.split()[50:70]}")

# Summarize the bot-generated Yoruba text
summarize_text(bot_data_yoruba)

# Summarize the human-written Yoruba text
summarize_text(human_data_yoruba)

# Summarize the bot-generated Kashmiri text
summarize_text(bot_data_kashmiri)

# Summarize the human-written Kashmiri text
summarize_text(human_data_kashmiri)

Number of words: 22096
First 20 words: ['gbogbo', 'àgbègbè', 'tó', 'lórí', 'ìtàn', 'YòrùbáÓyó', 'Ìbàdàn', 'Ìjbú', 'Èkìtì', 'Ègbáàwn', 'aáájúòèlú', 'atij', 'àti', 'onímìtàn', 'ti', 'da', 'àfihàn', 'ìtàn', 'àwn', 'ìjba']
Number of words: 949580
First 20 words: ['Taani', 'o', 'so', 'oro', 'yii', '"Mu', 'ohun', 'ikowe', 're,', 'beresi', 'iko', 'itan', 'ti', 'emi', 'o', 'so', 'wonyi', 'sile,', 'mase', 'fi']
Number of words: 20146
First 20 words: ['بابا،', 'بہن', 'تہ', 'برار،', 'اَمُن', 'یار', 'مِتر', 'تہ', 'ہمسایہ', 'چھُہ', 'گامِچ', 'دِل', 'ہند', 'شِکار', 'تہ', 'نَوکھ', 'سَلطَنَت', 'ہندس', 'رِکھیوَل۔', 'اماں']
Number of words: 120842
First 20 words: ['گٹہٕ', 'کٲژا', 'چھس', 'وۅہوَن', 'منہٕ', 'آنگن', 'منز', 'راتھ', 'وُچھم', 'کُن', 'زۆن', 'شُر', 'اکھ', 'زوٗنہ', 'گندن', 'گۅڈہ', 'یێتہ', 'گاہ', 'پٮ۪و', 'افتابُک']


In [9]:
import re
from collections import Counter

# language → regex to strip out everything except letters + whitespace
PATTERNS = {
    'en':      r'[^a-z\s]',
    'yoruba':  r'[^a-zàáâãèéêìíîòóôõùúûẹọṣń\s]',
    'kashmiri':r'[^\u0600-\u06FF\s]'
}

def preprocess_text(text: str, language: str = 'en') -> list[str]:
    """
    1. Lowercase (no‑op for Arabic script).
    2. Remove any char not in the target alphabet.
    3. Split on whitespace.
    """
    text = text.lower()
    pattern = PATTERNS.get(language, PATTERNS['en'])
    text = re.sub(pattern, '', text)
    return text.split()

def count_frequencies(tokens: list[str]) -> Counter:
    return Counter(tokens)

def get_most_common(corpus_text: str, language: str = 'en', top_n: int = 50):
    tokens = preprocess_text(corpus_text, language)
    freq   = count_frequencies(tokens)
    return freq.most_common(top_n)

if __name__ == '__main__':
    # YORUBA
    with open('dataset/human_data_yoruba.txt', encoding='utf-8') as f:
        human_data_yoruba = f.read()
    print('Top Human Yoruba words:', get_most_common(human_data_yoruba, language='yoruba'))

    with open('dataset/bot_data_yoruba.txt', encoding='utf-8') as f:
        bot_data_yoruba = f.read()
    print('Top Bot Yoruba words:', get_most_common(bot_data_yoruba, language='yoruba'))
    
    # KASHMIRI
    with open('dataset/human_data_kashmiri.txt', encoding='utf-8') as f:
        human_data_kashmiri = f.read()
    print('Top Human Kashmiri words:', get_most_common(human_data_kashmiri, language='kashmiri'))

    with open('dataset/bot_data_kashmiri.txt', encoding='utf-8') as f:
        bot_data_kashmiri = f.read()
    print('Top Kashmiri Bot words:', get_most_common(bot_data_kashmiri, language='kashmiri'))

Top Human Yoruba words: [('si', 46870), ('o', 38146), ('ti', 35510), ('ni', 22489), ('rẹ', 21574), ('awọn', 21540), ('ati', 18684), ('ki', 15104), ('fun', 14138), ('li', 13587), ('a', 13415), ('wọn', 12262), ('nwọn', 12160), ('fi', 9947), ('mi', 9292), ('bi', 9261), ('ṣe', 8215), ('oluwa', 7966), ('ọmọ', 7388), ('ba', 7294), ('yio', 7164), ('kò', 7109), ('emi', 7018), ('pe', 6950), ('gbogbo', 6771), ('lati', 6616), ('na', 6368), ('ninu', 6240), ('iwọ', 5904), ('lọ', 5326), ('jẹ', 5174), ('wá', 4972), ('nyin', 4730), ('enia', 4513), ('le', 4488), ('ọlọrun', 4404), ('ara', 4373), ('i', 4296), ('wi', 4255), ('mu', 4211), ('nitori', 4210), ('ṣugbọn', 4084), ('ọ', 4065), ('kan', 4012), ('wa', 3955), ('ẹ', 3818), ('ẹnyin', 3571), ('yi', 3425), ('pẹlu', 3397), ('wipe', 3371)]
Top Bot Yoruba words: [('ní', 725), ('tí', 380), ('ń', 372), ('pé', 362), ('ó', 345), ('tó', 300), ('bí', 276), ('àwn', 267), ('àti', 255), ('a', 247), ('bá', 244), ('wn', 236), ('fi', 208), ('sí', 207), ('kó', 185), ('f

In [10]:
import re
from collections import Counter
def find_frequent_words(text, threshold=150):
    words = text.split()
    word_counts = Counter(words)
    frequent_words = [word for word, count in word_counts.items() if count >= threshold]
    return frequent_words



In [11]:
frequent_words_yoruba_human = find_frequent_words(human_data_yoruba)
print(frequent_words_yoruba_human )

['A', 'fi', 'i', 'a', 'o', 'so', 'ohun', 'ti', 'emi', 'di', 'miran', 'ki', 'ma', 'ba', 'Emi', 'wa', 'mo', 'nitori', 'mi', 'lo', 'ku', 'ni', 'na', 'si', 'mi,', 'bi', 'fun', 'ko', 'pe', 'Kini', 'nkan', 'ninu', '-', 'ati', 'pa', 'ri', 'nitoripe', 'Ohun', 'tan', 'e', 'wo', 'inu', 'nigbati', 'aiye', 'yi', '—', '.', 'gbe', ',', 'de', 'le', 'oju', 'yio', 'gba', 'lati', 'on', 'n', 'gbogbo', 'ni,', 'to', 'loju', 'iya', 'iba', 'rin', 'ilu', 'mi.', 'O', 'ju', 'tun', 'Bi', 'da', 'mi;', 'oru', 'oni', 'eniyan', 'sun', 'dide,', 'o,', 'wa,', 'wa.', 'mu', 'iru', 'Wa', 'naa', 'obinrin', 'baba', 'ara', 'Baba', 'yi,', 'owo', 'ogun', 'Tani', 'Mo', 'dabi', 'aiya', 'maa', 'nfi', 'ori', 'Nitori', 'idi', 'Gbogbo', 'meji', 'a.', 'wi', 'bayi', 'rẹ', 'ranti', 'kan', 'la', 'Bayi', 'awa', 'nipa', 'i.', 'nigba', 'tani', 'sin', 'ya', 'dide', 'ile', 'bi?', 'ran', 'ibi', 'ni.', '..', 'gbogbo,', 'enia,', 'a,', 'titi', 'wipe', 'enia', 'Ni', 'niwaju', 'nitorina', 'san', 'jade', 'na.', 'ọwọ', 'jẹ', 'ọkan', 'awọn', 'iwe', '

In [12]:
frequent_words_yoruba_bot = find_frequent_words(bot_data_yoruba)
print(frequent_words_yoruba_bot)

['Ní', 'tó', 'àti', 'tí', 'wn', 'sí', 'a', 'àwn', 'lórí', 'fún', 'ní', 'fi', 'pé', 'lè', 'e', 'j', 'ń', 'bí', 'ó', 'bá', 'máa', 'kó']


In [13]:
frequent_words_kashmiri_human = find_frequent_words(human_data_kashmiri)
print(frequent_words_kashmiri_human)

['اوس', 'منز', 'کُن', 'اکھ', 'غلام', 'محمد', '۔', 'کینہہ', 'تہٕ', 'تہ', '،', 'یُس', 'پٲٹھۍ', 'کٲشرِ', 'الدین', 'پٮ۪ٹھ', 'زِ', 'و', 'چھہ', 'فارسی', 'مگر', 'خٲطرٕ', 'نہ', 'یا', 'یہ', 'سٍتۍ', 'کران', 'یتھ', 'بے', 'پتہٕ', 'نہٕ', 'شعر', 'چھے', 'گو', 'سُہ', 'زبٲنۍ', 'ہے', 'یِم', 'ٲسۍ', 'ٲس', 'کرنہٕ', 'تس', 'سٕنز', 'ہٕنز', 'ہُند', 'اَمہ', 'کٔشیرِ', 'چھُنہٕ', 'چھُ', 'از', 'ہٕندِ', 'ہٕندۍ', 'تمٔۍ', 'تِمن', 'یِمن', 'مثنوی', 'ہٕندِس', 'تام', 'مے', 'چھِ', 'شاہ', 'متعلق', 'اَتھ', 'ییلہ', 'سُند', 'حضرت', 'اللہ', 'نامہٕ', 'شیخ']


In [14]:
frequent_words_kashmiri_bot = find_frequent_words(bot_data_kashmiri)
print(frequent_words_kashmiri_bot)

['زُون', 'چُھہ', 'منز', 'تہ', 'ہند', 'چھُہ', 'اوس', 'ہُند', 'کہ', 'بیگم']


## Yoruba Diacritic Removal

In [15]:
import unicodedata

def remove_diacritics(text: str) -> str:
    """
    Remove all Unicode diacritic marks from the input string.
    """
    return ''.join(
        ch for ch in unicodedata.normalize('NFD', text)
        if unicodedata.category(ch) != 'Mn'
    )

# 1. Read the original file
with open('dataset/bot_data_yoruba.txt', 'r', encoding='utf-8') as f:
    bot_original_text = f.read()
with open('dataset/human_data_yoruba.txt', 'r', encoding='utf-8') as f:
    human_original_text = f.read()

# 2. Remove diacritics and save into a new variable
bot_cleaned_text = remove_diacritics(bot_original_text)
human_cleaned_text = remove_diacritics(human_original_text)


# 3. Display the cleaned text
print(bot_cleaned_text)

Ni agbegbe Iwrun Naijiria Yoruba ti da itan ibil kan to gbooro plu ede ie isin litires ati imo ojusin IleIf ilu ibil ti wn gba si kelda ni a ka si kanaya iran Yoruba ib ni agbelebu itanibil ti br plu Oduduwa Obatala runmila ati awn irunmole miiran Ni gbogbo agbegbe to lori itan YorubaOyo Ibadan Ijbu Ekiti Egbaawn aaajuoelu atij ati onimitan ti da afihan itan awn ijba irinajo eniyan ibaep ati igbyawo fun p grunun dun pere ugbn irinajo Yoruba o da sib nigba de akoko ijba oyinbo awn ba ati oloye ti kk koju iakoso alagbara Pnti naa wn si jiroro lati pa aa m niwaju lya amdemae dagba Amde ile Yoruba to ks ni DO Fagunwa fi itan Ogboju d ninu Igbo Irunmila han pe a le lo ede atwda ati alaye aaaju lati tsiwaju ireti ati ifkansin Amos Tutuola tun fi The PalmWine Drinkard e afihan irinajo im kan ni ayederu itanoniran Ni kanaya Yoruba ni Iwa Omoluabi ie rere iforiti ooto aitankara inure ati ibagb plu ibaep owo j koko r Owe Yoruba wi pe Iwa loogo Iwa rere san ju wura itan agbalagba n k mde bi a e l

In [16]:
print(human_cleaned_text)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [17]:
import re

# Morphology classes for Yoruba and Kashmiri
class YorubaMorphology:
    def __init__(self):
        # Common tone-mark prefixes and clitics
        self.prefixes = ['ì', 'í', 'à', 'á', 'ẹ', 'è', 'ó', 'ò', 'ú', 'ù']
        # Typical derivational and inflectional suffixes by part of speech
        self.suffixes = {
            'nouns': ['kan', 'jẹ', 'ni', 'ra', 'rẹ', 'lọ'],
            'verbs': ['mọ', 'jẹ', 'lo', 'mu', 'ka', 'ṣe'],
            'adjectives': ['dá', 'tọ', 'sùn'],
            'adverbs': ['pẹ̀lú', 'gan', 'fun', 'lẹ́'],
            'pronouns': ['mi', 'ẹ', 'wa', 'yín', 'ọ́'],
            'prepositions': ['ni', 'si', 'láti', 'lórí'],
            'conjunctions': ['ati', 'ṣùgbọ́n', 'bí'],
            'interrogative_markers': ['ni', 'ǹjẹ', 'kí'],
            'negation_markers': ['kò', 'ọ̀']
        }
        # Infixes to strip (e.g., reduplication or tone markers)
        self.infixes = ['ọ́', 'ẹ́']

    def tokenize(self, text):
        tokens = []
        words = text.split()
        for word in words:
            w = word
            # Strip prefixes
            for prefix in self.prefixes:
                if w.startswith(prefix):
                    w = w[len(prefix):]
            # Strip suffixes
            for suffix_list in self.suffixes.values():
                for suffix in suffix_list:
                    if w.endswith(suffix):
                        w = w[:-len(suffix)]
            # Remove infixes
            for infix in self.infixes:
                if infix in w:
                    parts = w.split(infix)
                    w = ''.join(parts)
            tokens.append(w)
        return tokens

class KashmiriMorphology:
    def __init__(self):
        # Kashmiri uses Perso-Arabic script; common prefixes (pre-verbal particles)
        self.prefixes = ['بِ', 'ہِ', 'کِ', 'تُ', 'پِ']
        # Common Kashmiri suffixes for declension and conjugation
        self.suffixes = {
            'nouns': ['ہ', 'ۍ', 'یں', 'ون'],
            'verbs': ['یتھ', 'یا', 'یو', 'ڑہ', 'ہن', 'یُن'],
            'adjectives': ['وَن', 'سَر', 'کَر'],
            'adverbs': ['وَو', 'گَر'],
            'pronouns': ['مژ', 'تُ', 'سۍ', 'ژن'],
            'prepositions': ['ئیس', 'کہ'],
            'conjunctions': ['آس', 'تہ'],
            'interrogative_markers': ['کین', 'تما'],
            'negation_markers': ['نہ', 'نک']
        }

    def tokenize(self, text):
        tokens = []
        words = text.split()
        for word in words:
            w = word
            # Strip prefixes
            for prefix in self.prefixes:
                if w.startswith(prefix):
                    w = w[len(prefix):]
            # Strip suffixes
            for suffix_list in self.suffixes.values():
                for suffix in suffix_list:
                    if w.endswith(suffix):
                        w = w[:-len(suffix)]
            tokens.append(w)
        return tokens

# Function to remove stop words

def remove_stop_words(tokens, stopwords):
    return [token for token in tokens if token not in stopwords]

# Function to clean text (strip punctuation, lowercase)

def clean_text(text):
    cleaned = re.sub(r'[^\u0600-\u06FFa-zA-Z\s]', '', text)
    return cleaned.strip().lower()

    

# Load stop words for Yoruba and Kashmiri
# stopwords_yoruba = {'ni', 'ati', 'pẹlu', 'gẹgẹ', 'ki', 'ti', 'jẹ', 'ko', 'wa'}
# stopwords_kashmiri = {'ہند', 'تِ', 'ژِ', 'کَن', 'مژ', 'نو', 'ہے', 'ہوں'}

stopwords_yoruba = {
    'ó', 'ní', 'ṣe', 'rẹ̀', 'tí', 'àwọn', 'sí', 'ni', 'náà', 'láti',
    'kan', 'ti', 'ń', 'lọ', 'o', 'bí', 'padà', 'sì', 'wá', 'lè', 'wà',
    'kí', 'púpọ̀', 'mi', 'wọ́n', 'pẹ̀lú', 'a', 'ṣùgbọ́n', 'fún', 'jẹ́',
    'fẹ́', 'kò', 'jù', 'pé', 'é', 'gbogbo', 'inú', 'bẹ̀rẹ̀', 'jẹ',
    'ọjọ́', 'nítorí', 'nǹkan', 'sínú', 'ṣ', 'yìí', 'ṣé', 'àti', 'í',
    'máa', 'nígbà', 'mo', 'an', 'mọ̀', 'bá', 'kì', 'ńlá', 'ọ̀pọ̀lọpọ̀',
    'ẹmọ́', 'wọn', 'òun'
}

stopwords_kashmiri = {
    'تہٕ', 'چھُ', 'منز', 'تہ', 'یہ', 'و', 'چھِ', 'زِ', 'چھے', 'اوس',
    'ہُند', 'پٮ۪ٹھ', 'سٍتۍ', 'اکھ', 'مثنوی', 'چھہ', 'نہٕ', 'شاہ',
    'یا', 'سُہ', 'اَتھ', 'پٲٹھۍ', 'مگر', 'حضرت', 'ہٕنز', 'ہٕندۍ',
    'از', 'اَمہ'
}

# Process text through all steps

def process_text(text, morphology, stopwords):
    tokens = morphology.tokenize(text)
    tokens = remove_stop_words(tokens, stopwords)
    cleaned = clean_text(' '.join(tokens))
    return cleaned

# Read uploaded files
with open('dataset/bot_data_yoruba.txt', 'r', encoding='utf-8') as f:
    bot_data_yoruba = f.read()
with open('dataset/human_data_yoruba.txt', 'r', encoding='utf-8') as f:
    human_data_yoruba = f.read()
    
with open('dataset/bot_data_kashmiri.txt', 'r', encoding='utf-8') as f:
     bot_data_kashmiri = f.read()
with open('dataset/human_data_kashmiri.txt', 'r', encoding='utf-8') as f:
     human_data_kashmiri = f.read()
    

# Instantiate morphologies
yoruba_morph = YorubaMorphology()
kashmiri_morph = KashmiriMorphology()

# Process texts
processed_yoruba_bot = process_text(bot_data_yoruba, yoruba_morph, stopwords_yoruba)
processed_yoruba_human = process_text(human_data_yoruba, yoruba_morph, stopwords_yoruba)
processed_kashmiri_bot = process_text(bot_data_kashmiri, kashmiri_morph, stopwords_kashmiri)
processed_kashmiri_human = process_text(human_data_kashmiri, kashmiri_morph, stopwords_kashmiri)

# Output the processed texts
print("Processed Yoruba Bot Text:", processed_yoruba_bot)
print("Processed Yoruba Human Text:", processed_yoruba_human)
print("Processed Kashmiri Bot Text:", processed_kashmiri_bot)
print("Processed Kashmiri Human Text:", processed_kashmiri_human)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Word embedding

In [18]:
from gensim.models import Word2Vec
from gensim.models import FastText
import logging

In [19]:
# Train a FastText model

processed_texts_yoruba_bot = [processed_yoruba_bot.split()]
processed_texts_yoruba_human = [processed_yoruba_human.split()]
processed_texts_kashmiri_bot = [processed_kashmiri_bot.split()]
processed_texts_kashmiri_human = [processed_kashmiri_human.split()]

fasttext_model_yoruba_bot = FastText(processed_texts_yoruba_bot , vector_size=10, window=3, min_count=1, workers=4, sg=1)
fasttext_model_yoruba_human = FastText(processed_texts_yoruba_human, vector_size=10, window=3, min_count=1, workers=4, sg=1)
fasttext_model_kashmiri_bot = FastText(processed_texts_kashmiri_bot, vector_size=10, window=3, min_count=1, workers=4, sg=1)
fasttext_model_kashmiri_human = FastText(processed_texts_kashmiri_human, vector_size=10, window=3, min_count=1, workers=4, sg=1)

fasttext_model_yoruba_bot.save("fasttext_gpt_yoruba.model")
print("FastText model trained and saved.")

fasttext_model_yoruba_human.save("fasttext_corpus_yoruba.model")
print("FastText model trained and saved.")

fasttext_model_kashmiri_bot.save("fasttext_gpt_kashmiri.model")
print("FastText model trained and saved.")

fasttext_model_kashmiri_human.save("fasttext_corpus_kashmiri.model")
print("FastText model trained and saved.")

FastText model trained and saved.
FastText model trained and saved.
FastText model trained and saved.
FastText model trained and saved.


In [20]:
# Train a Word2Vec model

word2vec_model_yoruba_bot = Word2Vec(processed_texts_yoruba_bot, vector_size=10, window=3, min_count=2, workers=4)
word2vec_model_yoruba_human = Word2Vec(processed_texts_yoruba_human, vector_size=10, window=3, min_count=2, workers=4)
word2vec_model_kashmiri_bot = Word2Vec(processed_texts_kashmiri_bot, vector_size=10, window=3, min_count=2, workers=4)
word2vec_model_kashmiri_human = Word2Vec(processed_texts_kashmiri_human, vector_size=10, window=3, min_count=2, workers=4)

word2vec_model_yoruba_bot.save("word2vec_gpt_yoruba.model")
print("Word2Vec model trained and saved.")

word2vec_model_yoruba_human.save("word2vec_corpus_yoruba.model")
print("Word2Vec model trained and saved.")

word2vec_model_kashmiri_bot.save("word2vec_gpt_kashmiri.model")
print("Word2Vec model trained and saved.")

word2vec_model_kashmiri_human.save("word2vec_corpus_kashmiri.model")
print("Word2Vec model trained and saved.")

Word2Vec model trained and saved.
Word2Vec model trained and saved.
Word2Vec model trained and saved.
Word2Vec model trained and saved.


In [21]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np


# Function to segment text into chunks of 200 words
def segment_text(text, chunk_size=40):
    words = text.split()
    segments = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return segments

# Prepare the segmented corpus
corpus_yoruba_bot = segment_text(processed_yoruba_bot)
corpus_yoruba_human = segment_text(processed_yoruba_human)
corpus_kashmiri_bot = segment_text(processed_kashmiri_bot)
corpus_kashmiri_human = segment_text(processed_kashmiri_human)

# Print segments to verify
"""print("Yoruba Bot Segments:", corpus_yoruba_bot)
print("Yoruba Human Segments:", corpus_yoruba_human)
print("Kashmiri Bot Segments:", corpus_kashmiri_bot)
print("Kashmiri Human Segments:", corpus_kashmiri_human)"""

# Load the Universal Sentence Encoder model
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Compute USE embeddings
def compute_use_embeddings(corpus):
    return use_model(corpus).numpy()

vectors_yoruba_bot_use = compute_use_embeddings(corpus_yoruba_bot)
vectors_yoruba_human_use = compute_use_embeddings(corpus_yoruba_human)
vectors_kashmiri_bot_use = compute_use_embeddings(corpus_kashmiri_bot)
vectors_kashmiri_human_use = compute_use_embeddings(corpus_kashmiri_human)

print(f"Yoruba Bot USE Embedding Shape: {vectors_yoruba_bot_use.shape}")
print(f"Yoruba Human USE Embedding Shape: {vectors_yoruba_human_use.shape}")
print(f"Kashmiri Bot USE Embedding Shape: {vectors_kashmiri_bot_use.shape}")
print(f"Kashmiri Human USE Embedding Shape: {vectors_kashmiri_human_use.shape}")

# Optionally, save the embeddings
np.save("yoruba_bot_use.npy", vectors_yoruba_bot_use)
np.save("yoruba_human_use.npy", vectors_yoruba_human_use)
np.save("kashmiri_bot_use.npy", vectors_kashmiri_bot_use)
np.save("kashmiri_human_use.npy", vectors_kashmiri_human_use)

2025-04-23 18:05:38.800147: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


RuntimeError: CPU dispatcher tracer already initlized

ImportError: Keras cannot be imported. Check that it is installed.