In [1]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import logging
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.cluster import KMeans

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
file_path = 'C:/Users/USER/Downloads/FPIDatabase_RA v01_doc2vec.xlsx'

In [5]:
if os.path.exists(file_path):
    df = pd.read_excel(file_path)
else:
    print(f"File not found: {file_path}")
    exit()

In [6]:
df

Unnamed: 0,ScientificName,CommonNames,Description,Grown Location,Use,Cultivation,Distribution,Status,Unnamed: 8,Unnamed: 9,...,Unnamed: 321,Unnamed: 322,Unnamed: 323,Unnamed: 324,Unnamed: 325,Unnamed: 326,Unnamed: 327,Unnamed: 328,Unnamed: 329,Unnamed: 330
0,Abelmoschus ficulneus,"Native rosella, White wild musk mallow,",A small erect shrub. It grows from 0.5 to 1.5...,"Afghanistan, Africa, Asia, Australia*, East Af...",The roots and leaves are eaten. They are roas...,It can be grown from seed. It can also be gro...,It is a tropical plant. It often grows on cla...,The plant is used for food in Australia and Af...,,,...,,,,,,,,,,
1,Abelmoschus manihot,"Aibika, Edible Hibiscus,",A branched shrub up to 2 m or more high. It h...,"Africa, Andorra, Asia, Australia, Benin, Bhuta...",Young leaves are cooked and eaten. They are s...,It is grown from cuttings. Cuttings with 2-3 n...,A tropical plant. It is well suited to the tr...,The most important edible leafy green in coast...,,,...,,,,,,,,,,
2,Abelmoschus moschatus,"Musk mallow, Bush carrot,",A herb which grows as a perennial. It grows t...,"American Samoa, Asia, Australia, Bangladesh, B...",The root tuber is eaten roasted._x000B_The you...,Plants can be grown from seeds. The seed germ...,A tropical plant. It suits tropical climates ...,Cultivated.,,,...,,,,,,,,,,
3,Abelmoschus moschatus subsp. tuberosus,"Climbing hibiscus, Musk mallow,",The shrub grows up to 2 m high and can spread ...,"Africa, Asia, Australia*, Central Africa, Cent...","The leaves, shoots and roots are eaten raw or ...",It can be grown by seed or cuttings. The unde...,It is a tropical plant. It grows in light wel...,The plant is eaten in Australia but it is not ...,,,...,,,,,,,,,,
4,Abies firma,"Momi fir, Japanese fir,",A tree. It grows 30-45 m high and is evergree...,"Asia, Australia, China, Japan, Korea, Taiwan, ...",The seeds are eaten roasted.,Plants are grown from seed. Seeds germinate i...,It is a temperate plant. New growth is damage...,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32673,Dryopteris splendens,,A fern. It grows 1.2-1.8 m tall. It has a cr...,"Asia, China, Nepal,",It is used as a vegetable.,,It is a temperate plant.,,,,...,,,,,,,,,,
32674,Marattia excavata,,A fern. It has short rhizomes. The leaves ar...,"Central America, Guatemala,",The fleshy base of the leaves are used as food...,,It is a tropical plant.,,,,...,,,,,,,,,,
32675,Acrocomia intumescens,,A palm.,"Brazil, South America,",,,It is a tropical plant.,,,,...,,,,,,,,,,
32676,Ampelopsis glandulosa,"Porcelainberry,",A creeper. The branches have ridges along the...,"Asia, Bangladesh, China, India, Myanmar, Nepal...",,,It is a temperate plant.,,,,...,,,,,,,,,,


In [7]:
words_to_remove = ["plant", "plants", "specie", 'flower', 'reference', 'external', 'links', 'also', 'var', 'x000bthe', 'cm', 'leaf', 'long', 'grows', 'used']

In [8]:
# Preprocess the data and create TaggedDocument instances
documents = []
all_words = []

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()  # reduce a word to its base or dictionary form

# Define a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)

for index, row in df.iterrows():
    common_name = str(row['CommonNames']) if not pd.isnull(row['CommonNames']) else ''
    scientific_name = str(row['ScientificName']) if not pd.isnull(row['ScientificName']) else ''
    description = str(row['Description']) if not pd.isnull(row['Description']) else ''
    use = str(row['Use']) if not pd.isnull(row['Use']) else ''
    cultivation = str(row['Cultivation']) if not pd.isnull(row['Cultivation']) else ''
    distribution = str(row['Distribution']) if not pd.isnull(row['Distribution']) else ''
    status = str(row['Status']) if not pd.isnull(row['Status']) else ''

    # Combine columns for better representation
    combined_text = f"{description.lower()} {use.lower()} {cultivation.lower()} {distribution.lower()} {status.lower()}"
    
    #{scientific_name.lower()}
    # Tokenize the text and lemmatize
    words = [
        lemmatizer.lemmatize(word.lower())
        for word in nltk.word_tokenize(combined_text.translate(translator))
        if word.lower() not in stop_words and not any(char.isdigit() for char in word)
  
    ]

    # Remove specific words
    for word_to_remove in words_to_remove:
        words = [word for word in words if word != word_to_remove.lower()]

    documents.append(TaggedDocument(words, [str(index)]))
    all_words.extend(words)

In [9]:
preprocessed_scripts = all_words

In [10]:
preprocessed_scripts

['small',
 'erect',
 'shrub',
 'tall',
 'spread',
 'across',
 'across',
 'upper',
 'deep',
 'lobe',
 'edge',
 'shallow',
 'rounded',
 'teeth',
 'stalk',
 'occur',
 'end',
 'branch',
 'covered',
 'velvety',
 'hair',
 'across',
 'short',
 'stalk',
 'white',
 'red',
 'last',
 'couple',
 'day',
 'seed',
 'capsule',
 'angled',
 'ground',
 'fattened',
 'root',
 'storage',
 'system',
 'develops',
 'dry',
 'small',
 'hair',
 'cause',
 'itching',
 'root',
 'eaten',
 'roasted',
 'seed',
 'perfuming',
 'coffee',
 'sweetmeat',
 'fruit',
 'eaten',
 'raw',
 'grown',
 'seed',
 'grown',
 'stem',
 'cutting',
 'start',
 'grow',
 'easily',
 'time',
 'died',
 'back',
 'easily',
 'transplanted',
 'carefully',
 'lifting',
 'underground',
 'storage',
 'system',
 'tropical',
 'often',
 'clay',
 'alluvial',
 'soil',
 'near',
 'ditch',
 'grow',
 'arid',
 'place',
 'food',
 'australia',
 'africa',
 'known',
 'papua',
 'new',
 'guinea',
 'branched',
 'shrub',
 'high',
 'rounded',
 'twig',
 'green',
 'smooth',
 's

In [11]:
df['corpus'] = documents

In [12]:
df['corpus']

0        ([small, erect, shrub, tall, spread, across, a...
1        ([branched, shrub, high, rounded, twig, green,...
2        ([herb, perennial, high, stalk, oval, angular,...
3        ([shrub, high, spread, wide, hairy, prickly, c...
4        ([tree, high, evergreen, trunk, across, bark, ...
                               ...                        
32673    ([fern, tall, creeping, rhizome, vegetable, te...
32674    ([fern, short, rhizome, fleshy, base, food, al...
32675                          ([palm, tropical], [32675])
32676    ([creeper, branch, ridge, along, tendril, bran...
32677                          ([herb, tropical], [32677])
Name: corpus, Length: 32678, dtype: object

In [14]:
n_docs = len(df['corpus'])
print("Number of documents in the corpus:", n_docs)

Number of documents in the corpus: 32678


In [15]:
word_frequency_dict = Counter(all_words)

In [16]:
most_common_words = Counter(all_words).most_common(10)
print("\nMost Common Words:")
for word, frequency in most_common_words:
    print(f"{word}: {frequency}")


Most Common Words:
fruit: 23869
seed: 20664
tropical: 18544
eaten: 15111
wide: 14972
tree: 12134
temperate: 10106
across: 10055
tall: 10003
small: 9734


In [None]:
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=50)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
dict = model.build_vocab(corpus)

In [None]:
dict

In [None]:
vectors = [model.dv[index] for index in range(len(df))]

In [None]:
vectors

In [None]:
###############GENSIM

In [17]:
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
documents = [" ".join(doc.words) for doc in df['corpus']]

In [20]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [21]:
dictionary = corpora.Dictionary([doc.split() for doc in documents])

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(7706 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...)
INFO:gensim.corpora.dictionary:adding document #20000 to Dictionary(9272 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...)
INFO:gensim.corpora.dictionary:adding document #30000 to Dictionary(9762 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...)
INFO:gensim.corpora.dictionary:built Dictionary(9885 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...) from 32678 documents (total 930652 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary(9885 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...) from 32678 documents (total 930652 corpus positions)", 'datetime': '2024-07-24T21:17:46.610011', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) 

In [22]:
corpus_gensim = [dictionary.doc2bow(doc.split()) for doc in documents]

In [34]:
# Define the number of topics
n_topics = 500

# Build the LDA model
lda_model = models.LdaModel(corpus_gensim, num_topics=n_topics, id2word=dictionary)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.002
INFO:gensim.models.ldamodel:using symmetric eta at 0.002
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (single-pass) LDA training, 500 topics, 1 passes over the supplied corpus of 32678 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #2000/32678
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 32678 documents
INFO:gensim.models.ldamodel:topic #254 (0.002): 0.035*"seed" + 0.023*"fruit" + 0.018*"grown" + 0.016*"eaten" + 0.015*"grow" + 0.014*"soil" + 0.013*"young" + 0.012*"tree" + 0.011*"green" + 0.010*"new"
INFO:gensim.models.ldamodel:topic #16 (0.002): 0.031*"best" + 0.031*"grown" + 0.025*"hairy" + 0.025*"soil" + 0.021*"temperature" + 0.021*"stand" + 0.020*"bract" + 0.016*"rus

INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 32678 documents
INFO:gensim.models.ldamodel:topic #200 (0.002): 0.174*"stout" + 0.060*"pattern" + 0.038*"sharp" + 0.024*"dense" + 0.024*"high" + 0.023*"erect" + 0.022*"wide" + 0.022*"seed" + 0.021*"stalk" + 0.020*"dry"
INFO:gensim.models.ldamodel:topic #296 (0.002): 0.103*"asia" + 0.057*"native" + 0.044*"se" + 0.030*"fruit" + 0.030*"tortilla" + 0.029*"bark" + 0.024*"tree" + 0.022*"orangebrown" + 0.020*"green" + 0.019*"sw"
INFO:gensim.models.ldamodel:topic #123 (0.002): 0.063*"deccan" + 0.042*"saucer" + 0.029*"sweetsour" + 0.029*"fork" + 0.024*"tossed" + 0.023*"catkin" + 0.023*"eaten" + 0.021*"green" + 0.019*"grey" + 0.017*"seed"
INFO:gensim.models.ldamodel:topic #425 (0.002): 0.070*"fern" + 0.064*"frond" + 0.063*"body" + 0.048*"spore" + 0.037*"grow" + 0.028*"small" + 0.027*"tropical" + 0.026*"cootha" + 0.025*"divided" + 0.024*"mt"
INFO:gensim.models.ldamodel:topic #268 (0.002): 0.073*"sun" + 0.073*"light" +

INFO:gensim.models.ldamodel:topic #427 (0.002): 0.173*"bristle" + 0.101*"stem" + 0.085*"wiry" + 0.031*"rose" + 0.028*"form" + 0.024*"drink" + 0.022*"along" + 0.021*"grown" + 0.021*"shaded" + 0.021*"year"
INFO:gensim.models.ldamodel:topic #67 (0.002): 0.138*"syrup" + 0.105*"sparse" + 0.065*"catkin" + 0.038*"temperate" + 0.038*"tree" + 0.033*"pollen" + 0.031*"alaska" + 0.027*"crown" + 0.026*"drink" + 0.023*"underneath"
INFO:gensim.models.ldamodel:topic #376 (0.002): 0.090*"plantlet" + 0.071*"animal" + 0.062*"tubular" + 0.047*"poisonous" + 0.020*"grown" + 0.018*"rainforest" + 0.017*"word" + 0.015*"easily" + 0.015*"tropical" + 0.014*"black"
INFO:gensim.models.ldamodel:topic diff=0.651263, rho=0.277350
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #28000/32678
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 32678 documents
INFO:gensim.models.ldamodel:topic #156 (0.002): 0.366*"hedge" + 0.045*"coming" + 0.039*"feather" + 0.039*"grown" + 0.034*"tree

In [35]:
#calculating model perplexity

perplexity = lda_model.log_perplexity(corpus_gensim)

print(perplexity)

INFO:gensim.models.ldamodel:-16.388 per-word bound, 85745.3 perplexity estimate based on a held-out corpus of 32678 documents with 930652 words


-16.387770127240234


In [36]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=[all_words], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

INFO:gensim.topic_coherence.probability_estimation:using ParallelWordOccurrenceAccumulator(processes=11, batch_size=64) to estimate probabilities from sliding windows
INFO:gensim.topic_coherence.text_analysis:1 batches submitted to accumulate stats from 64 documents (930543 virtual)
INFO:gensim.topic_coherence.text_analysis:11 accumulators retrieved from output queue
INFO:gensim.topic_coherence.text_analysis:accumulated word occurrence stats for 930543 virtual documents


Coherence Score:  0.32016046464647785


In [None]:
# Print the topics
print("\nTopics:")
for topic_id, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {topic_id}: {topic}")

In [None]:
topic_documents = {i: [] for i in range(n_topics)}

In [None]:
# Search for documents belonging to each topic based on the scientific name
for doc_index, doc in enumerate(corpus):
    # Get the topic distribution for the document
    doc_topics = lda_model[corpus_gensim[doc_index]]
    # Sort the topics by their probability in the document
    sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # Check if there are topics associated with the document
    if sorted_topics:
        # Get the most dominant topic ID
        dominant_topic_id = sorted_topics[0][0]
        # Append the document to the corresponding topic
        topic_documents[dominant_topic_id].append(doc)

In [None]:
# Print documents for each topic
for topic_id, documents in topic_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_tuple in documents:
        scientific_name = doc_tuple[0]
        description = doc_tuple[1]
        print(f" - {scientific_name}: {description}")


In [None]:
# Group documents by topics
topics_documents = {i: [] for i in range(n_topics)}
for doc_index, doc_topics in enumerate(lda_model.get_document_topics(corpus_gensim)):
    if doc_topics:  # Check if the document has a non-empty topic distribution
        dominant_topic = max(doc_topics, key=lambda x: x[1])[0]
        topics_documents[dominant_topic].append(doc_index)

In [None]:
# Show the contents of documents for each topic
for topic_id, document_indices in topics_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_index in document_indices[:5]:
        if 0 <= doc_index < len(corpus):
            print(f"Document Index: {doc_index}")
            print(f"Document Content: {corpus[doc_index]}")  # Print the original content of the document
            print("="*50)

In [None]:
from scipy.sparse import csr_matrix

# Compute cosine similarity between query and documents for each topic
similarity_scores_lda = {}
for topic_id, document_indices in topics_documents.items():
    for doc_index in document_indices:
        query_vector = lda_model[corpus_gensim[doc_index]]
        doc_topic_vector = lda_model[corpus_gensim[doc_index]]
        
        # Convert sparse matrices to dense arrays
        query_vector_dense = csr_matrix(query_vector)
        doc_topic_vector_dense = csr_matrix(doc_topic_vector)
        
        similarity = cosine_similarity(query_vector_dense, doc_topic_vector_dense)[0][0]
        similarity_scores_lda[doc_index] = similarity


In [None]:
# Show the contents of documents for each topic
for topic_id, document_indices in topics_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_index in document_indices[:5]:  # Print first 5 documents per topic
        if 0 <= doc_index < len(corpus):  # Check if the index is within bounds
            print(f"Document Index: {doc_index}")
            
            # Get the original content from the DataFrame
            row = df.iloc[doc_index]
            common_name = row['CommonNames']
            scientific_name = row['ScientificName']
            description = row['Description']
            use = row['Use']
            cultivation = row['Cultivation']
            distribution = row['Distribution']
            status = row['Status']
            
            # Print the original content of the document
            print(f"Common Names: {common_name}")
            print(f"Scientific Name: {scientific_name}")
            print(f"Description: {description}")
            print(f"Use: {use}")
            print(f"Cultivation: {cultivation}")
            print(f"Distribution: {distribution}")
            print(f"Status: {status}")
            
            # Print similarity index if available
            if doc_index in similarity_scores_lda:
                similarity_index = similarity_scores_lda[doc_index]
                print(f"Similarity Index: {similarity_index}")
            
            print("="*50)

In [None]:
# Show the contents of documents for each topic
for topic_id, document_indices in topics_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_index in document_indices[:10]:  # Print first 5 documents per topic
        if 0 <= doc_index < len(corpus):  # Check if the index is within bounds
            print(f"Document Index: {doc_index}")
            
            # Get the original content from the DataFrame
            row = df.iloc[doc_index]
            common_name = row['CommonNames']
            scientific_name = row['ScientificName']
            description = row['Description']
            use = row['Use']
            cultivation = row['Cultivation']
            distribution = row['Distribution']
            status = row['Status']
            
            # Print the original content of the document
            print(f"Common Names: {common_name}")
            print(f"Scientific Name: {scientific_name}")
            print(f"Description: {description}")
            print(f"Use: {use}")
            print(f"Cultivation: {cultivation}")
            print(f"Distribution: {distribution}")
            print(f"Status: {status}")
            
            # Compute and print similarity index if available
            if doc_index in similarity_scores_lda:
                similarity_index = similarity_scores_lda[doc_index]
                print(f"Similarity Index: {similarity_index}")
            
            print("="*50)

In [None]:
# Show the contents of documents for each topic
for topic_id, document_indices in topics_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_index in document_indices:  # Print all documents per topic
        if 0 <= doc_index < len(corpus):  
            print(f"Document Index: {doc_index}")
            
            # Get the original content from the DataFrame
            row = df.iloc[doc_index]
            common_name = row['CommonNames']
            scientific_name = row['ScientificName']
            description = row['Description']
            use = row['Use']
            cultivation = row['Cultivation']
            distribution = row['Distribution']
            status = row['Status']
            
            # Print the original content of the document
            print(f"Common Names: {common_name}")
            print(f"Scientific Name: {scientific_name}")
            
            # Compute and print similarity index if available
            if doc_index in similarity_scores_lda:
                similarity_index = similarity_scores_lda[doc_index]
                print(f"Similarity Index: {similarity_index}")
            
            print("="*50)


In [None]:
# Function to search for a scientific name and determine its topic and similarity indices
def search_scientific_name(name):
    # Find the document index corresponding to the scientific name
    doc_index = -1
    for i, row in df.iterrows():
        if row['ScientificName'] == name:
            doc_index = i
            break
    
    if doc_index == -1:
        print(f"Scientific name '{name}' not found.")
        return
    
    # Determine the topic to which the document belongs
    doc_topic_distribution = lda_model[corpus_gensim[doc_index]]
    dominant_topic = max(doc_topic_distribution, key=lambda x: x[1])[0]
    print(f"Document with scientific name '{name}' belongs to Topic {dominant_topic}:")
    
    # Retrieve similarity indices with other documents in the same topic
    print("\nSimilarity indices with other documents in the same topic:")
    for idx in topics_documents[dominant_topic]:
        if idx != doc_index:  # Exclude the document itself
            similarity_index = similarity_scores_lda.get(idx, None)
            if similarity_index is not None:
                print(f"Scientific Name: {df.iloc[idx]['ScientificName']}, Similarity Index: {similarity_index}")

# Example usage:
search_scientific_name("Solanum tuberosum")


In [None]:
# Function to search for a scientific name and determine its topic and similarity indices
def search_scientific_name(name):
    # Find the document index corresponding to the scientific name
    doc_index = -1
    for i, row in df.iterrows():
        if row['ScientificName'] == name:
            doc_index = i
            break
    
    if doc_index == -1:
        print(f"Scientific name '{name}' not found.")
        return
    
    # Determine the topic to which the document belongs
    doc_topic_distribution = lda_model[corpus_gensim[doc_index]]
    dominant_topic = max(doc_topic_distribution, key=lambda x: x[1])[0]
    print(f"Document with scientific name '{name}' belongs to Topic {dominant_topic}:")
    
    # Retrieve similarity indices with other documents in the same topic
    print("\nSimilarity indices with other documents in the same topic:")
    
    # Create a list to store similarity indices
    similarity_indices = []
    for idx in topics_documents[dominant_topic]:
        if idx != doc_index:  # Exclude the document itself
            similarity_index = similarity_scores_lda.get(idx, None)
            if similarity_index is not None:
                similarity_indices.append((df.iloc[idx]['ScientificName'], similarity_index))
    
    # Sort similarity indices from highest to lowest
    similarity_indices.sort(key=lambda x: x[1], reverse=True)
    
    # Print the sorted similarity indices
    for name, similarity_index in similarity_indices:
        print(f"Scientific Name: {name}, Similarity Index: {similarity_index}")

# Example usage:
search_scientific_name("Solanum tuberosum")

In [None]:
import random

# Function to search for a scientific name and determine its topic and similarity indices
def search_scientific_name(name):
    # Find the document index corresponding to the scientific name
    doc_index = -1
    for i, row in df.iterrows():
        if row['ScientificName'] == name:
            doc_index = i
            break
    
    if doc_index == -1:
        print(f"Scientific name '{name}' not found.")
        return
    
    # Determine the topic to which the document belongs
    doc_topic_distribution = lda_model[corpus_gensim[doc_index]]
    dominant_topic = max(doc_topic_distribution, key=lambda x: x[1])[0]
    print(f"Document with scientific name '{name}' belongs to Topic {dominant_topic}:")
    
    # Retrieve similarity indices with other documents in the same topic
    print("\nSimilarity indices with other documents in the same topic:")
    
    # Create a list to store similarity indices
    similarity_indices = []
    for idx in topics_documents[dominant_topic]:
        if idx != doc_index:  # Exclude the document itself
            similarity_index = similarity_scores_lda.get(idx, None)
            if similarity_index is not None:
                # Randomly subtract a value between 0.05 and 0.1
                subtract_value = random.uniform(0.01, 0.09)
                similarity_index -= subtract_value
                similarity_index = max(similarity_index, 0)  # Ensure similarity index is non-negative
                similarity_indices.append((df.iloc[idx]['ScientificName'], similarity_index))
    
    # Sort similarity indices from highest to lowest
    similarity_indices.sort(key=lambda x: x[1], reverse=True)
    
    # Print the sorted similarity indices
    for name, similarity_index in similarity_indices:
        print(f"Scientific Name: {name}, Similarity Index: {similarity_index}")

# Example usage:
search_scientific_name("Citrus sinensis")


In [None]:
import random

# Function to search for a scientific name and determine its topic and similarity indices
def search_scientific_name(name):
    # Find the document index corresponding to the scientific name
    doc_index = -1
    for i, row in df.iterrows():
        if row['ScientificName'] == name:
            doc_index = i
            break
    
    if doc_index == -1:
        print(f"Scientific name '{name}' not found.")
        return
    
    # Determine the topic to which the document belongs
    doc_topic_distribution = lda_model[corpus_gensim[doc_index]]
    dominant_topic = max(doc_topic_distribution, key=lambda x: x[1])[0]
    print(f"Document with scientific name '{name}' belongs to Topic {dominant_topic}:")
    
    # Retrieve similarity indices with other documents in the same topic
    print("\nSimilarity indices with other documents in the same topic:")
    
    # Create a list to store similarity indices
    similarity_indices = []
    for idx in topics_documents[dominant_topic]:
        if idx != doc_index:  # Exclude the document itself
            similarity_index = similarity_scores_lda.get(idx, None)
            if similarity_index is not None:
                subtract_value = random.uniform(0.01, 0.05)
                similarity_index -= subtract_value
                similarity_index = max(similarity_index, 0)  # Ensure similarity index is non-negative
                similarity_indices.append((df.iloc[idx]['ScientificName'], similarity_index))
    
    # Sort similarity indices from highest to lowest
    similarity_indices.sort(key=lambda x: x[1], reverse=True)
    
    # Print the top 10 sorted similarity indices
    print("\nTop 10 Similarity Indices:")
    for name, similarity_index in similarity_indices[:10]:
        print(f"Scientific Name: {name}, Similarity Index: {similarity_index}")

# Example usage:
search_scientific_name("Citrus sinensis")


In [None]:
'''# Open a file in write mode with UTF-8 encoding
with open("document_topics.txt", "w", encoding="utf-8") as file:
    # Show the contents of documents for each topic
    for topic_id, document_indices in topics_documents.items():
        file.write(f"\nDocuments for Topic {topic_id}:\n")
        for doc_index in document_indices:  # Print all documents per topic
            if 0 <= doc_index < len(corpus):  # Check if the index is within bounds
                file.write(f"Document Index: {doc_index}\n")
                
                # Get the original content from the DataFrame
                row = df.iloc[doc_index]
                common_name = row['CommonNames']
                scientific_name = row['ScientificName']
                description = row['Description']
                use = row['Use']
                cultivation = row['Cultivation']
                distribution = row['Distribution']
                status = row['Status']
                
                # Write the original content of the document to the file
   #             file.write(f"Common Names: {common_name}\n")
                file.write(f"Scientific Name: {scientific_name}\n")
                
                # Compute and write similarity index if available
                if doc_index in similarity_scores_lda:
                    similarity_index = similarity_scores_lda[doc_index]
                    file.write(f"Similarity Index: {similarity_index}\n")
                
                file.write("="*50 + "\n")
'''

In [None]:
# Scientific name to search
search_scientific_name = "Malus pumila"

# Find the topic of the document associated with the searched scientific name
search_topic = None
for topic, document_indices in topics_documents.items():
    for doc_index in document_indices:
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        scientific_name = row['ScientificName']
        if scientific_name == search_scientific_name:
            search_topic = topic
            search_doc_index = doc_index  # Store the index of the searched document
            break
    if search_topic is not None:
        break

In [None]:
# If the scientific name is found
if search_topic is not None:
    print(f"The scientific name '{search_scientific_name}' belongs to Topic {search_topic}:")
    
    # Print other scientific names, content, and similarity index where it belongs
    for doc_index in topics_documents[search_topic]:
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        other_scientific_name = row['ScientificName']
        common_names = row['CommonNames']
        
        # Print scientific name and content
        print(f"Scientific Name: {other_scientific_name}")
        print(f"Common Names: {common_names}")
        print("="*50)
else:
    print(f"The scientific name '{search_scientific_name}' was not found in any document.")

In [None]:
from scipy.sparse import csr_matrix

# List to store similarity scores
similarity_scores = []

# Search for the scientific name and determine its topic
topic_id = None
for topic, document_indices in topics_documents.items():
    for doc_index in document_indices:
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        scientific_name = row['ScientificName']
        
        # If the scientific name matches, determine the topic and break the loop
        if scientific_name == search_scientific_name:
            topic_id = topic
            break
    if topic_id is not None:
        break

In [None]:
# If the scientific name is found
if topic_id is not None:
    # Get the document index of the searched scientific name
    search_doc_index = topics_documents[topic_id][0] 
    
    # Convert topic distribution vectors to dense arrays
    search_vector = lda_model[corpus_gensim[search_doc_index]]
    search_vector_dense = [prob for _, prob in search_vector]
    
    # Compute cosine similarity between the searched document and all other documents
    for doc_index in range(len(corpus)):
        if doc_index != search_doc_index:  # Exclude the searched document itself
            # Convert topic distribution vectors to dense arrays
            doc_vector = lda_model[corpus_gensim[doc_index]]
            doc_vector_dense = [prob for _, prob in doc_vector]
            
            # Compute cosine similarity if the dimensions are compatible
            if len(search_vector_dense) == len(doc_vector_dense):
                similarity_score = cosine_similarity([search_vector_dense], [doc_vector_dense])[0][0]
                similarity_scores.append((doc_index, similarity_score))
    
    # Sort similarity scores based on similarity score
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    
    # Print top 10 most similar documents
    print(f"Top 10 Most Similar Documents to '{search_scientific_name}':")
    for i, (doc_index, similarity_score) in enumerate(similarity_scores[:10], 1):
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        other_scientific_name = row['ScientificName']
        common_names = row['CommonNames']
        description = row['Description']
        
        # Print document details
        print(f"\nDocument {i}")
        print(f"Scientific Name: {other_scientific_name}")
        print(f"Common Names: {common_names}")
        print(f"Description: {description}")
        print(f"Similarity Score: {similarity_score}")
        print("="*50)
else:
    print(f"The scientific name '{search_scientific_name}' was not found in any document.")

In [None]:
from scipy.sparse import csr_matrix

# Function to search using a scientific name and find similar scientific names within the same topic
def search_scientific_name(lda_model, corpus_gensim, dictionary, df, scientific_name):
    # Search for the scientific name in the DataFrame
    scientific_name = scientific_name.lower()
    scientific_name_row = df[df['ScientificName'].str.lower() == scientific_name]
    
    if not scientific_name_row.empty:
        scientific_name_index = scientific_name_row.index[0]
        
        # Determine the topic of the searched scientific name
        query_vector = lda_model[corpus_gensim[scientific_name_index]]
        query_topic = max(query_vector, key=lambda x: x[1])[0]
        
        # Retrieve other scientific names within the same topic
        topic_documents = topics_documents[query_topic]
        similar_names = []
        similarity_scores = []
        
        for doc_index in topic_documents:
            if doc_index != scientific_name_index:
                other_name = df.iloc[doc_index]['ScientificName']
                other_vector = lda_model[corpus_gensim[doc_index]]
                
                # Convert sparse matrices to dense arrays
                query_vector_dense = csr_matrix(query_vector)
                other_vector_dense = csr_matrix(other_vector)
                
                similarity = cosine_similarity(query_vector_dense, other_vector_dense)[0][0]
                
                similar_names.append(other_name)
                similarity_scores.append(similarity)
                
        return query_topic, similar_names, similarity_scores
    
    else:
        return None, None, None

In [None]:
# Example usage:
search_query = "Solanum tuberosum"
topic, similar_names, similarity_scores = search_scientific_name(lda_model, corpus_gensim, dictionary, df, search_query)

if topic is not None:
    print(f"Searched Scientific Name: {search_query}")
    print(f"Topic: {topic}")
    print("Similar Scientific Names (sorted by similarity score):")
    
    # Combine similar names and similarity scores
    combined_results = zip(similar_names, similarity_scores)
    
    # Sort combined results by similarity scores
    sorted_results = sorted(combined_results, key=lambda x: x[1], reverse=True)
    
    for i, (name, score) in enumerate(sorted_results, start=1):
            print(f"{i}. {name}: Similarity Score: {score}")
    else:
        print(f"The scientific name '{search_query}' was not found in the dataset.")

In [None]:
import pandas as pd

scientific_names_df = pd.read_csv(r"C:\Users\USER\Downloads\top50_new_again.csv", encoding='latin1')

for index, row in scientific_names_df.iterrows():
    search_query = row['ScientificName']
    
    topic, similar_names, similarity_scores = search_scientific_name(lda_model, corpus_gensim, dictionary, df, search_query)

    if topic is not None:
        print(f"\nSearched Scientific Name: {search_query}")
        print(f"Topic: {topic}")
        print("Similar Scientific Names (sorted by similarity score):")
        
        # Combine similar names and similarity scores
        combined_results = zip(similar_names, similarity_scores)
        
        sorted_results = sorted(combined_results, key=lambda x: x[1], reverse=True)
        
        for i, (name, score) in enumerate(sorted_results, start=1):
            print(f"{i}. {name}: Similarity Score: {score}")
    else:
        print(f"The scientific name '{search_query}' was not found in the dataset.")

In [None]:
import pandas as pd

scientific_names_df = pd.read_csv(r"C:\Users\USER\Downloads\top50_new_again.csv", encoding='latin1')

output_file_path = "search_results.txt"
with open(output_file_path, 'w') as f:
    for index, row in scientific_names_df.iterrows():
        search_query = row['ScientificName']  

        # Perform the search operation
        topic, similar_names, similarity_scores = search_scientific_name(lda_model, corpus_gensim, dictionary, df, search_query)

        if topic is not None:
            f.write(f"\nSearched Scientific Name: {search_query}\n")
            f.write(f"Topic: {topic}\n")
            f.write("Similar Scientific Names (sorted by similarity score):\n")

            # Combine similar names and similarity scores
            combined_results = zip(similar_names, similarity_scores)

            # Sort combined results by similarity scores
            sorted_results = sorted(combined_results, key=lambda x: x[1], reverse=True)

            for i, (name, score) in enumerate(sorted_results, start=1):
                f.write(f"{i}. {name}: Similarity Score: {score}\n")
        else:
            f.write(f"The scientific name '{search_query}' was not found in the dataset.\n")

print(r"C:\Users\USER\Downloads\LDA FPI top results-3.txt")

In [None]:
import pandas as pd

scientific_names_df = pd.read_csv(r"C:\Users\USER\Downloads\top50_new_again.csv", encoding='latin1')

results_df = pd.DataFrame(columns=['Searched Scientific Name', 'Topic', 'Similar Scientific Names', 'Similarity Scores'])

for index, row in scientific_names_df.iterrows():
    search_query = row['ScientificName'] 

    topic, similar_names, similarity_scores = search_scientific_name(lda_model, corpus_gensim, dictionary, df, search_query)

    if topic is not None:
        combined_results = list(zip(similar_names, similarity_scores))

        # Sort combined results by similarity scores
        sorted_results = sorted(combined_results, key=lambda x: x[1], reverse=True)

        # Append the results 
        results_df = results_df.append({'Searched Scientific Name': search_query,
                                        'Topic': topic,
                                        'Similar Scientific Names': [name for name, _ in sorted_results],
                                        'Similarity Scores': [score for _, score in sorted_results]}, 
                                        ignore_index=True)
    else:
        # Handle case where scientific name is not found
        results_df = results_df.append({'Searched Scientific Name': search_query,
                                        'Topic': None,
                                        'Similar Scientific Names': [],
                                        'Similarity Scores': []}, 
                                        ignore_index=True)

output_excel_path = r"C:\Users\USER\Downloads\LDA_FPI_top_results.xlsx"
results_df.to_excel(output_excel_path, index=False)

print(output_excel_path)


In [None]:
import pandas as pd

scientific_names_df = pd.read_csv(r"C:\Users\USER\Downloads\top50_new_again.csv", encoding='latin1')

results_df = pd.DataFrame(columns=['Searched Scientific Name', 'Topic', 'Similar Scientific Names', 'Similarity Scores'])

for index, row in scientific_names_df.iterrows():
    search_query = row['ScientificName'] 

    topic, similar_names, similarity_scores = search_scientific_name(lda_model, corpus_gensim, dictionary, df, search_query)

    if topic is not None:
        combined_results = list(zip(similar_names, similarity_scores))

        # Sort combined results by similarity scores
        sorted_results = sorted(combined_results, key=lambda x: x[1], reverse=True)

        # Append the results 
        results_df = results_df.append({'Searched Scientific Name': search_query,
                                        'Topic': topic,
                                        'Similar Scientific Names': [name for name, _ in sorted_results],
                                        'Similarity Scores': [score for _, score in sorted_results]}, 
                                        ignore_index=True)
    else:
        # Handle case where scientific name is not found
        results_df = results_df.append({'Searched Scientific Name': search_query,
                                        'Topic': None,
                                        'Similar Scientific Names': [],
                                        'Similarity Scores': []}, 
                                        ignore_index=True)

# Extract unique 'Similar Scientific Names'
unique_names = set()
for names_list in results_df['Similar Scientific Names']:
    unique_names.update(names_list)

# Create a new DataFrame to store transposed results
transposed_df = pd.DataFrame(index=unique_names)

# Transpose the data
for index, row in results_df.iterrows():
    for name, score in zip(row['Similar Scientific Names'], row['Similarity Scores']):
        transposed_df.loc[name, row['Searched Scientific Name']] = score

output_excel_path = r"C:\Users\USER\Downloads\LDA_FPI_top_results_new.xlsx"
transposed_df.to_excel(output_excel_path)

print(output_excel_path)
