# Read the file and create data structures

Create a data structure from the file that has key, value pairs where the key is the concept and the value if a list of (question, answer) tuples

In [1]:

def read_file(filepath):

    data = {}

    with open(filepath, 'r') as file:

        for line in file:
            concept, question, answer = line.strip().split('\t')

            # Check if the concept is already a key in the data structure
            if concept in data:
                # If so, append the (question, answer) tuple to the existing list
                data[concept].append((question, answer))
            else:
                # If no, create a new entry with the concept as the key and [(question, answer)] as the value
                data[concept] = [(question, answer)]

    return data


data = read_file('class_05.clean.txt')
for key, value_list in data.items():
    print(key)
    for value in value_list:
        print(f'\t{value}')

lion
	('Is he a carnivore or herbivore?', 'Lion is a carnivore.')
	('Is it a vertebrate or invertebrate?', 'Lion is a vertebrate.')
	('Has it ever been domesticated?', 'Lions have been trained to some extent in captivity but are not fully domesticated like dogs or cats.')
	('Is it a mammal?', 'Yes, the lion is a mammal.')
	('Does it have fur?', 'Yes, lions have fur.')
	('In which environments does he live?', 'Lions are found in grasslands, savannas, and open woodlands.')
	('What does it eat?', 'Lions are carnivores and primarily eat meat, usually hunting in groups.')
	('How many legs does it have, if any?', 'Lions have four legs.')
	('Does he produce eggs?', 'No, lions give birth to live young.')
	('Is he a prey or a predator?', 'Lions are predators.')
	('Does it have wings, legs, or fins?', 'Lions have legs but no wings or fins.')
	('In what continents does he predominantly live?', 'Lions are primarily found in Africa, with a small population in the Gir Forest of India.')
	('Is he con

1. Remove stopwords
2. Lemmatize
3. Remove punctuation

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Download the English stopwords list
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

for key, value_list in tqdm(data.items()):

    for i in range(len(value_list)):

        question, answer = value_list[i]

        # Tokenize the sentence
        words = word_tokenize(answer)

        # Remove stopwords and punctuation
        filtered_words = [word.lower() for word in words if word.lower() not in stop_words
                          and word not in string.punctuation and not any(char.isdigit() for char in word)]

        # Lemmatization
        lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

        # Join the lemmatized words to form a sentence
        cleaned_answer = ' '.join(lemmatized_words)

        # Put the resulting sentece in the data structure
        value_list[i] = (question, answer, cleaned_answer)


for key, value_list in data.items():
    print(key)
    for value in value_list:
        print(f'\t{value}')

[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/daniel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 93/93 [00:01<00:00, 69.46it/s] 

lion
	('Is he a carnivore or herbivore?', 'Lion is a carnivore.', 'lion carnivore')
	('Is it a vertebrate or invertebrate?', 'Lion is a vertebrate.', 'lion vertebrate')
	('Has it ever been domesticated?', 'Lions have been trained to some extent in captivity but are not fully domesticated like dogs or cats.', 'lion trained extent captivity fully domesticated like dog cat')
	('Is it a mammal?', 'Yes, the lion is a mammal.', 'yes lion mammal')
	('Does it have fur?', 'Yes, lions have fur.', 'yes lion fur')
	('In which environments does he live?', 'Lions are found in grasslands, savannas, and open woodlands.', 'lion found grassland savanna open woodland')
	('What does it eat?', 'Lions are carnivores and primarily eat meat, usually hunting in groups.', 'lion carnivore primarily eat meat usually hunting group')
	('How many legs does it have, if any?', 'Lions have four legs.', 'lion four leg')
	('Does he produce eggs?', 'No, lions give birth to live young.', 'lion give birth live young')
	('Is




# Frequency based approach

In [3]:
def compute_frequency(documents):
    
    word_frequency = {}

    for document in documents:
            
        # Combine the words from both question and answer
        # words = question.split() + answer.split()
        words = document.split()

        # Update the word frequency dictionary
        for word in words:
            if word in word_frequency:
                word_frequency[word] += 1
            else:
                word_frequency[word] = 1

    return word_frequency


# Combine all the answers into a single document
documents = [answer_clean for tuples in data.values() for question, answer, answer_clean in tuples]
word_frequency = compute_frequency(documents)

# Now, word_frequency contains the frequency of each word across answers
for key, value in word_frequency.items():
    print(f'{key}: {value}')

lion: 48
carnivore: 44
vertebrate: 88
trained: 24
extent: 3
captivity: 55
fully: 14
domesticated: 123
like: 135
dog: 57
cat: 94
yes: 293
mammal: 258
fur: 99
found: 331
grassland: 34
savanna: 15
open: 8
woodland: 5
primarily: 185
eat: 74
meat: 77
usually: 23
hunting: 76
group: 120
four: 61
leg: 219
give: 75
birth: 75
live: 160
young: 72
predator: 128
wing: 149
fin: 94
africa: 33
small: 155
population: 43
gir: 1
forest: 50
india: 4
wild: 166
dangerous: 97
human: 285
especially: 136
close: 8
encounter: 2
commonly: 123
eaten: 26
known: 212
powerful: 33
agile: 21
jump: 17
generally: 199
shorter: 11
standing: 21
longer: 7
body: 199
length: 70
social: 80
called: 25
pride: 1
consist: 7
related: 4
female: 30
offspring: 10
suitable: 59
legal: 11
kept: 113
pet: 180
due: 151
nature: 13
migrate: 28
long: 88
distance: 73
regularly: 2
movement: 26
influenced: 5
factor: 15
food: 140
availability: 4
water: 82
source: 34
intelligent: 57
considered: 160
animal: 380
classified: 13
vulnerable: 16
decreasin

In [4]:
for key, value_list in data.items():
    # key -> concept
    # value_list = [(question_1, answer_1), (question_2, answer_2), ...]
    for question, answer, cleaned_answer in value_list:
        
        words = cleaned_answer.split()
        if key in words:
            words.remove(key)

        # Find the word with the highest value
        max_word = max(words, key=lambda word: word_frequency.get(word, 0))
         
        print(f'{key} | {question:50s} {answer:120s} {max_word}')

lion | Is he a carnivore or herbivore?                    Lion is a carnivore.                                                                                                     carnivore
lion | Is it a vertebrate or invertebrate?                Lion is a vertebrate.                                                                                                    vertebrate
lion | Has it ever been domesticated?                     Lions have been trained to some extent in captivity but are not fully domesticated like dogs or cats.                    like
lion | Is it a mammal?                                    Yes, the lion is a mammal.                                                                                               yes
lion | Does it have fur?                                  Yes, lions have fur.                                                                                                     yes
lion | In which environments does he live?                Lions are fou

# TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_TF_IDF(documents):

    tfidf = TfidfVectorizer()
    result = tfidf.fit_transform(documents)

    # get idf values
    # for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    #     print(ele1, ':', ele2)
    
    # Create dictionary for idf values
    idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

    return idf_dict
    
# Combine all answer_clean into a single document
documents = [answer_clean for tuples in data.values() for question, answer, answer_clean in tuples]
tf_idf_dict = compute_TF_IDF(documents)

for key, value in tf_idf_dict.items():
    print(f'{key}: {value}')

abdomen: 8.623886295932458
ability: 4.7631565848918616
absent: 7.930739115372512
abundant: 6.544444754252622
abuse: 8.218421187824294
accident: 8.623886295932458
accidental: 8.623886295932458
accurately: 8.623886295932458
accustomed: 8.623886295932458
achieved: 8.218421187824294
acrobatic: 8.218421187824294
acrobatics: 8.623886295932458
across: 8.623886295932458
act: 7.525274007264348
action: 8.623886295932458
active: 4.361206418891142
actively: 7.237591934812567
activity: 5.8205259150259225
acute: 8.218421187824294
adapt: 6.544444754252622
adaptability: 7.525274007264348
adaptable: 6.544444754252622
adaptation: 6.752084119030866
adapted: 4.22328327568564
adapting: 7.37112332743709
additionally: 8.218421187824294
address: 8.623886295932458
adept: 7.707595564058303
adequate: 8.623886295932458
adhesive: 8.218421187824294
adjacent: 8.623886295932458
adult: 4.7219136263578125
adulthood: 8.218421187824294
advanced: 7.930739115372512
adverse: 8.623886295932458
advised: 8.218421187824294
aeri

In [6]:
for key, value_list in data.items():
    # key -> concept
    # value_list = [(question_1, answer_1), (question_2, answer_2), ...]
    for question, answer, cleaned_answer in value_list:
        
        words = cleaned_answer.split()
        if key in words:
            words.remove(key)

        # Find the word with the highest value
        max_word = max(words, key=lambda word: tf_idf_dict.get(word, 0))

        print(f'{key} | {question:50s} {answer:120s} {max_word}')

lion | Is he a carnivore or herbivore?                    Lion is a carnivore.                                                                                                     carnivore
lion | Is it a vertebrate or invertebrate?                Lion is a vertebrate.                                                                                                    vertebrate
lion | Has it ever been domesticated?                     Lions have been trained to some extent in captivity but are not fully domesticated like dogs or cats.                    extent
lion | Is it a mammal?                                    Yes, the lion is a mammal.                                                                                               mammal
lion | Does it have fur?                                  Yes, lions have fur.                                                                                                     fur
lion | In which environments does he live?                Lions ar

# TF-IDF variant

In [11]:
# Combine all answer_clean into a single document
document_groups = [[answer_clean for question, answer, answer_clean in tuples] for tuples in data.values()]
concepts = data.keys()
for concept, document_group in zip(concepts, document_groups):
    print(f'{concept}:')
    for answer in document_group:
        print(f'\t{answer}')

lion:
	lion carnivore
	lion vertebrate
	lion trained extent captivity fully domesticated like dog cat
	yes lion mammal
	yes lion fur
	lion found grassland savanna open woodland
	lion carnivore primarily eat meat usually hunting group
	lion four leg
	lion give birth live young
	lion predator
	lion leg wing fin
	lion primarily found africa small population gir forest india
	wild lion dangerous human especially close encounter
	lion commonly eaten human
	yes lion known powerful agile jump
	lion generally shorter standing human longer body length
	lion live social group called pride usually consist related female offspring
	lion suitable legal kept pet due wild nature
	lion migrate long distance regularly movement influenced factor like food availability water source
	lion intelligent predator considered intelligent animal
	lion classified vulnerable population decreasing particularly certain region
	lion crepuscular meaning active dawn dusk
	lion hibernate
	yes lion sharp teeth
	yes lion 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_concept_TF_IDF(document_groups):

    idf_list = []
    for document_group in document_groups:

        tfidf = TfidfVectorizer()
        result = tfidf.fit_transform(document_group)

        # get idf values
        # for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
        #     print(ele1, ':', ele2)
        
        # Create dictionary for idf values
        idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

        idf_list.append(idf_dict)

    return idf_list


tf_idf_list = compute_concept_TF_IDF(document_groups)
for concept, tf_idf_dict in zip(data, tf_idf_list):
    print(f'{concept}:')
    for key, value in tf_idf_dict.items():
        print(f'\t{key}: {value}')

lion:
	active: 4.113515309210374
	adult: 4.113515309210374
	africa: 4.113515309210374
	agile: 4.113515309210374
	amphibian: 4.113515309210374
	animal: 3.1972245773362196
	aquatic: 4.113515309210374
	arctic: 4.113515309210374
	area: 4.113515309210374
	around: 3.70805020110221
	availability: 4.113515309210374
	awareness: 4.113515309210374
	ban: 4.113515309210374
	belong: 4.113515309210374
	birth: 4.113515309210374
	blood: 4.113515309210374
	body: 3.4203681286504293
	burst: 4.113515309210374
	called: 4.113515309210374
	captivity: 3.70805020110221
	capture: 4.113515309210374
	carnivore: 3.70805020110221
	cat: 4.113515309210374
	certain: 4.113515309210374
	circus: 4.113515309210374
	city: 4.113515309210374
	classified: 4.113515309210374
	claw: 4.113515309210374
	climate: 4.113515309210374
	close: 4.113515309210374
	commonly: 4.113515309210374
	communicate: 4.113515309210374
	concern: 4.113515309210374
	considered: 4.113515309210374
	consist: 4.113515309210374
	covered: 4.113515309210374
	cr

In [14]:
for key_value, tf_idf_dict in zip(data.items(), tf_idf_list):

    key, value_list = key_value

    # key -> concept
    # value_list = [(question_1, answer_1), (question_2, answer_2), ...]
    for question, answer, cleaned_answer in value_list:
        
        words = cleaned_answer.split()
        if key in words:
            words.remove(key)

        # Find the word with the highest value
        max_word = max(words, key=lambda word: tf_idf_dict.get(word, 0))
         
        print(f'{key} | {question:50s} {answer:120s} {max_word}')

lion | Is he a carnivore or herbivore?                    Lion is a carnivore.                                                                                                     carnivore
lion | Is it a vertebrate or invertebrate?                Lion is a vertebrate.                                                                                                    vertebrate
lion | Has it ever been domesticated?                     Lions have been trained to some extent in captivity but are not fully domesticated like dogs or cats.                    trained
lion | Is it a mammal?                                    Yes, the lion is a mammal.                                                                                               mammal
lion | Does it have fur?                                  Yes, lions have fur.                                                                                                     fur
lion | In which environments does he live?                Lions a

# BM25

In [16]:
import nltk
from collections import Counter
import math

nltk.download('punkt')

def tokenize(sentence):
    return nltk.word_tokenize(sentence.lower())

def calculate_bm25_word(word, sentence, idf_scores):
    tf = Counter(tokenize(sentence))[word]
    idf = idf_scores.get(word, 0)
    
    bm25_word = (tf * (idf + 1)) / (tf + 1)
    
    return bm25_word

def find_most_relevant_word(sentence, idf_scores):
    words = tokenize(sentence)
    
    max_relevance = -1
    most_relevant_word = None
    
    for word in words:
        relevance = calculate_bm25_word(word, sentence, idf_scores)
        
        if relevance > max_relevance:
            max_relevance = relevance
            most_relevant_word = word
    
    return most_relevant_word

# Create the document collection
documents = [answer_clean for tuples in data.values() for question, answer, answer_clean in tuples]

# Calculate IDF scores based on your entire document collection
idf_scores = {}
total_documents = len(documents)

for document in documents:
    unique_words = set(tokenize(document))
    for word in unique_words:
        idf_scores[word] = idf_scores.get(word, 0) + 1

for word in idf_scores:
    idf_scores[word] = math.log((total_documents + 1) / (idf_scores[word] + 1)) + 1

# Find the most relevant word for each sentence
for key, value_list in data.items():
    for question, answer, answer_clean in value_list:
        most_relevant_word = find_most_relevant_word(answer_clean, idf_scores)
        print(f'{key} | {question:50s} {answer:120s} {most_relevant_word}')


[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


lion | Is he a carnivore or herbivore?                    Lion is a carnivore.                                                                                                     carnivore
lion | Is it a vertebrate or invertebrate?                Lion is a vertebrate.                                                                                                    lion
lion | Has it ever been domesticated?                     Lions have been trained to some extent in captivity but are not fully domesticated like dogs or cats.                    extent
lion | Is it a mammal?                                    Yes, the lion is a mammal.                                                                                               lion
lion | Does it have fur?                                  Yes, lions have fur.                                                                                                     lion
lion | In which environments does he live?                Lions are found

# PPMI

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances

def compute_ppmi(data):

    # Combine all answer_clean into a single document
    documents = [answer_clean for tuples in data.values() for question, answer, answer_clean in tuples]

    # Step 1: Create a document-term matrix (DTM) using CountVectorizer
    vectorizer = CountVectorizer()
    dtm = vectorizer.fit_transform(documents)

    # Step 2: Compute the co-occurrence matrix
    co_occurrence_matrix = dtm.T.dot(dtm)

    # Step 3: Compute the PPMI matrix
    total_words = np.sum(co_occurrence_matrix)
    row_totals = np.sum(co_occurrence_matrix, axis=0)
    col_totals = np.sum(co_occurrence_matrix, axis=1)

    ppmi_matrix = np.log2((co_occurrence_matrix.toarray() * total_words) / (row_totals * col_totals))

    # Set negative values to zero (PPMI only considers positive associations)
    ppmi_matrix[ppmi_matrix < 0] = 0

    return ppmi_matrix


ppmi_result = compute_ppmi(data)
print("PPMI Matrix:")
print(ppmi_result)
