In [72]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import logging
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.cluster import KMeans

In [73]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [74]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [75]:
file_path = r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\FPIDatabase_RA v01_doc2vec.xlsx"

In [76]:
if os.path.exists(file_path):
    df = pd.read_excel(file_path)
else:
    print(f"File not found: {file_path}")
    exit()

In [77]:
df

Unnamed: 0,ScientificName,CommonNames,Description,Grown Location,Use,Cultivation,Distribution,Status,Unnamed: 8,Unnamed: 9,...,Unnamed: 321,Unnamed: 322,Unnamed: 323,Unnamed: 324,Unnamed: 325,Unnamed: 326,Unnamed: 327,Unnamed: 328,Unnamed: 329,Unnamed: 330
0,Abelmoschus ficulneus,"Native rosella, White wild musk mallow,",A small erect shrub. It grows from 0.5 to 1.5...,"Afghanistan, Africa, Asia, Australia*, East Af...",The roots and leaves are eaten. They are roas...,It can be grown from seed. It can also be gro...,It is a tropical plant. It often grows on cla...,The plant is used for food in Australia and Af...,,,...,,,,,,,,,,
1,Abelmoschus manihot,"Aibika, Edible Hibiscus,",A branched shrub up to 2 m or more high. It h...,"Africa, Andorra, Asia, Australia, Benin, Bhuta...",Young leaves are cooked and eaten. They are s...,It is grown from cuttings. Cuttings with 2-3 n...,A tropical plant. It is well suited to the tr...,The most important edible leafy green in coast...,,,...,,,,,,,,,,
2,Abelmoschus moschatus,"Musk mallow, Bush carrot,",A herb which grows as a perennial. It grows t...,"American Samoa, Asia, Australia, Bangladesh, B...",The root tuber is eaten roasted._x000B_The you...,Plants can be grown from seeds. The seed germ...,A tropical plant. It suits tropical climates ...,Cultivated.,,,...,,,,,,,,,,
3,Abelmoschus moschatus subsp. tuberosus,"Climbing hibiscus, Musk mallow,",The shrub grows up to 2 m high and can spread ...,"Africa, Asia, Australia*, Central Africa, Cent...","The leaves, shoots and roots are eaten raw or ...",It can be grown by seed or cuttings. The unde...,It is a tropical plant. It grows in light wel...,The plant is eaten in Australia but it is not ...,,,...,,,,,,,,,,
4,Abies firma,"Momi fir, Japanese fir,",A tree. It grows 30-45 m high and is evergree...,"Asia, Australia, China, Japan, Korea, Taiwan, ...",The seeds are eaten roasted.,Plants are grown from seed. Seeds germinate i...,It is a temperate plant. New growth is damage...,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32673,Dryopteris splendens,,A fern. It grows 1.2-1.8 m tall. It has a cr...,"Asia, China, Nepal,",It is used as a vegetable.,,It is a temperate plant.,,,,...,,,,,,,,,,
32674,Marattia excavata,,A fern. It has short rhizomes. The leaves ar...,"Central America, Guatemala,",The fleshy base of the leaves are used as food...,,It is a tropical plant.,,,,...,,,,,,,,,,
32675,Acrocomia intumescens,,A palm.,"Brazil, South America,",,,It is a tropical plant.,,,,...,,,,,,,,,,
32676,Ampelopsis glandulosa,"Porcelainberry,",A creeper. The branches have ridges along the...,"Asia, Bangladesh, China, India, Myanmar, Nepal...",,,It is a temperate plant.,,,,...,,,,,,,,,,


In [78]:
words_to_remove = ["plant", "plants", "specie", 'flower', 'reference', 'external', 'links', 'also', 'var', 'x000bthe', 'cm', 'leaf', 'long', 'grows', 'used']

In [79]:
# Preprocess the data and create TaggedDocument instances
documents = []
all_words = []

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()  # reduce a word to its base or dictionary form

# Define a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)

for index, row in df.iterrows():
    common_name = str(row['CommonNames']) if not pd.isnull(row['CommonNames']) else ''
    scientific_name = str(row['ScientificName']) if not pd.isnull(row['ScientificName']) else ''
    description = str(row['Description']) if not pd.isnull(row['Description']) else ''
    use = str(row['Use']) if not pd.isnull(row['Use']) else ''
    cultivation = str(row['Cultivation']) if not pd.isnull(row['Cultivation']) else ''
    distribution = str(row['Distribution']) if not pd.isnull(row['Distribution']) else ''
    status = str(row['Status']) if not pd.isnull(row['Status']) else ''

    # Combine columns for better representation
    combined_text = f"{description.lower()} {use.lower()} {cultivation.lower()} {distribution.lower()} {status.lower()}"
    
    #{scientific_name.lower()}
    # Tokenize the text and lemmatize
    words = [
        lemmatizer.lemmatize(word.lower())
        for word in nltk.word_tokenize(combined_text.translate(translator))
        if word.lower() not in stop_words and not any(char.isdigit() for char in word)
  
    ]

    # Remove specific words
    for word_to_remove in words_to_remove:
        words = [word for word in words if word != word_to_remove.lower()]

    documents.append(TaggedDocument(words, [str(index)]))
    all_words.extend(words)

In [80]:
corpus = documents

In [81]:
corpus[3]

TaggedDocument(words=['shrub', 'high', 'spread', 'wide', 'hairy', 'prickly', 'continues', 'grow', 'year', 'year', 'top', 'dying', 'back', 'regrowing', 'thickened', 'root', 'stem', 'thin', 'branch', 'spread', 'green', 'shaped', 'like', 'finger', 'hand', 'lobe', 'vary', 'shape', 'wide', 'hairy', 'deep', 'pink', 'colour', 'yellow', 'purple', 'centre', 'like', 'hibiscus', 'across', 'attractive', 'grow', 'singly', 'angle', 'occur', 'summer', 'fruit', 'capsule', 'sided', 'containing', 'seed', 'capsule', 'across', 'seed', 'musk', 'smell', 'seed', 'small', 'brown', 'dy', 'back', 'thickened', 'root', 'develops', 'ground', 'root', 'good', 'loose', 'soil', 'shoot', 'root', 'eaten', 'raw', 'cooked', 'grown', 'seed', 'cutting', 'underground', 'tuber', 'die', 'regrow', 'transplanted', 'easily', 'time', 'top', 'died', 'back', 'tropical', 'light', 'well', 'drained', 'soil', 'suit', 'protected', 'lightly', 'shaded', 'position', 'damaged', 'drought', 'frost', 'must', 'grow', 'temperature', 'often', 'ope

In [82]:
n_docs = len(corpus)
print("Number of documents in the corpus:", n_docs)

Number of documents in the corpus: 32678


In [83]:
word_frequency_dict = Counter(all_words)

In [84]:
most_common_words = Counter(all_words).most_common(10)
print("\nMost Common Words:")
for word, frequency in most_common_words:
    print(f"{word}: {frequency}")


Most Common Words:
fruit: 23869
seed: 20664
tropical: 18544
eaten: 15111
wide: 14972
tree: 12134
temperate: 10106
across: 10055
tall: 10003
small: 9734


In [85]:
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=50)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

INFO:gensim.utils:Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,s0.001,t4)', 'datetime': '2025-03-30T20:34:35.014420', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'created'}
INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #10000, processed 476337 words (5424570/s), 7706 word types, 10000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #20000, processed 762962 words (5188214/s), 9272 word types, 20000 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #30000, processed 902908 words (3909160/s), 9762 word types, 30000 tags
INFO:gensim.models.doc2vec:collected 9885 word types and 32678 unique tags from a corpus of 32678 examples and 930652 words
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:g

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 8 : training on 930652 raw words (754714 effective words) took 2.9s, 256144 effective words/s
INFO:gensim.models.word2vec:EPOCH 9 - PROGRESS: at 31.77% examples, 398725 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 9 - PROGRESS: at 65.17% examples, 311781 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 9 : training on 930652 raw words (754130 effective words) took 2.9s, 260524 effective words/s
INFO:gen

INFO:gensim.models.word2vec:EPOCH 20 - PROGRESS: at 31.17% examples, 388563 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 20 - PROGRESS: at 65.17% examples, 308946 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 20 : training on 930652 raw words (754401 effective words) took 2.9s, 260689 effective words/s
INFO:gensim.models.word2vec:EPOCH 21 - PROGRESS: at 31.17% examples, 398960 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 21 - PROGRESS: at 62.64% examples, 315111 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INF

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 31 : training on 930652 raw words (753752 effective words) took 3.0s, 251252 effective words/s
INFO:gensim.models.word2vec:EPOCH 32 - PROGRESS: at 30.04% examples, 378242 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 32 - PROGRESS: at 61.20% examples, 308944 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 32 : training on 930652 raw words (754390 effective words) took 3.0s, 253183 effective words/s
INFO

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 43 : training on 930652 raw words (754057 effective words) took 2.9s, 255965 effective words/s
INFO:gensim.models.word2vec:EPOCH 44 - PROGRESS: at 29.25% examples, 372801 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 44 - PROGRESS: at 59.19% examples, 302409 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finishe

In [86]:
vectors = [model.dv[index] for index in range(len(df))]

In [87]:
vectors

[array([-3.3018729e-01,  9.8982953e-02,  4.2442533e-01,  1.2100337e+00,
        -5.6012464e-01,  4.3908268e-01, -8.2618541e-01,  8.9437443e-01,
        -4.3101573e-01, -3.4431852e-02, -4.0195554e-01, -1.7439264e+00,
        -5.9955430e-01, -1.0740922e-01, -4.5159137e-01, -6.0837722e-01,
         1.2706246e+00,  3.0174449e-01, -4.2384833e-01,  7.2260439e-01,
         3.8218695e-01,  3.8081780e-02, -2.3911542e-01,  7.7086735e-01,
         1.4278731e-01, -9.2207503e-01, -9.3587494e-01,  1.0746299e+00,
         1.7012559e-01,  2.6417080e-01,  3.0726662e-01,  1.3939495e-01,
        -5.6160122e-01,  6.4005816e-01, -3.7620738e-01,  5.0407475e-01,
        -9.3217850e-02, -2.1587684e+00,  4.5810181e-01, -1.1428475e-01,
         4.3874225e-01,  1.3707201e-01,  5.9514099e-01, -1.3263044e+00,
         8.5117304e-01, -8.6357753e-04, -9.5290825e-02, -1.0720589e+00,
         3.7654901e-01,  2.9981691e-01,  3.0070904e-01, -2.9325953e-01,
        -3.4337562e-01,  1.8956542e+00, -9.6734762e-01,  2.06762

In [88]:
###############GENSIM

In [89]:
from gensim import corpora, models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [90]:
documents = [" ".join(doc.words) for doc in corpus]

In [91]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

In [92]:
dictionary = corpora.Dictionary([doc.split() for doc in documents])

INFO:gensim.corpora.dictionary:adding document #0 to Dictionary(0 unique tokens: [])
INFO:gensim.corpora.dictionary:adding document #10000 to Dictionary(7706 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...)
INFO:gensim.corpora.dictionary:adding document #20000 to Dictionary(9272 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...)
INFO:gensim.corpora.dictionary:adding document #30000 to Dictionary(9762 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...)
INFO:gensim.corpora.dictionary:built Dictionary(9885 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...) from 32678 documents (total 930652 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary(9885 unique tokens: ['across', 'africa', 'alluvial', 'angled', 'arid']...) from 32678 documents (total 930652 corpus positions)", 'datetime': '2025-03-30T20:37:09.730827', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) 

In [93]:
corpus_gensim = [dictionary.doc2bow(doc.split()) for doc in documents]

In [94]:
# Define the number of topics
n_topics = 500

# Build the LDA model
lda_model = models.LdaModel(corpus_gensim, num_topics=n_topics, id2word=dictionary)

INFO:gensim.models.ldamodel:using symmetric alpha at 0.002
INFO:gensim.models.ldamodel:using symmetric eta at 0.002
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (single-pass) LDA training, 500 topics, 1 passes over the supplied corpus of 32678 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #2000/32678
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 32678 documents
INFO:gensim.models.ldamodel:topic #194 (0.002): 0.024*"root" + 0.019*"year" + 0.019*"seed" + 0.015*"eaten" + 0.014*"stalk" + 0.014*"narrow" + 0.014*"oil" + 0.012*"high" + 0.011*"temperate" + 0.011*"bulb"
INFO:gensim.models.ldamodel:topic #372 (0.002): 0.034*"stalk" + 0.025*"fruit" + 0.020*"stem" + 0.019*"base" + 0.017*"oblong" + 0.017*"eaten" + 0.016*"grown" + 0.015*"tip

INFO:gensim.models.ldamodel:topic #451 (0.002): 0.193*"north" + 0.102*"purplishblack" + 0.059*"smelling" + 0.039*"reddishorange" + 0.029*"hairy" + 0.025*"take" + 0.022*"fruit" + 0.019*"slightly" + 0.018*"head" + 0.018*"branch"
INFO:gensim.models.ldamodel:topic #363 (0.002): 0.090*"level" + 0.087*"sea" + 0.045*"stew" + 0.043*"cooked" + 0.037*"eaten" + 0.036*"tropical" + 0.031*"herb" + 0.028*"seed" + 0.023*"fruit" + 0.023*"soup"
INFO:gensim.models.ldamodel:topic #22 (0.002): 0.070*"fruit" + 0.050*"berry" + 0.038*"shrub" + 0.034*"scrambling" + 0.028*"tropical" + 0.028*"wide" + 0.021*"small" + 0.021*"across" + 0.021*"black" + 0.020*"red"
INFO:gensim.models.ldamodel:topic #117 (0.002): 0.104*"harvested" + 0.104*"fibre" + 0.080*"left" + 0.045*"x" + 0.035*"often" + 0.034*"main" + 0.024*"needlelike" + 0.020*"method" + 0.019*"section" + 0.017*"remove"
INFO:gensim.models.ldamodel:topic #213 (0.002): 0.104*"tend" + 0.058*"drier" + 0.043*"fruit" + 0.033*"seed" + 0.030*"hungry" + 0.028*"available" 

INFO:gensim.models.ldamodel:topic #73 (0.002): 0.173*"ring" + 0.097*"stem" + 0.062*"base" + 0.047*"year" + 0.047*"herb" + 0.029*"tall" + 0.028*"temperate" + 0.023*"near" + 0.023*"white" + 0.022*"narrow"
INFO:gensim.models.ldamodel:topic #300 (0.002): 0.173*"slope" + 0.089*"mountain" + 0.043*"level" + 0.041*"sea" + 0.040*"wide" + 0.038*"subtropical" + 0.037*"tall" + 0.033*"branch" + 0.027*"group" + 0.027*"china"
INFO:gensim.models.ldamodel:topic #211 (0.002): 0.331*"stream" + 0.093*"together" + 0.058*"bank" + 0.045*"along" + 0.034*"across" + 0.032*"easy" + 0.029*"white" + 0.025*"subtropical" + 0.024*"forest" + 0.024*"shrub"
INFO:gensim.models.ldamodel:topic diff=0.673162, rho=0.277350
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #28000/32678
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 32678 documents
INFO:gensim.models.ldamodel:topic #420 (0.002): 0.319*"corky" + 0.096*"bark" + 0.027*"across" + 0.023*"embryo" + 0.022*"seed" + 0.020*"estua

In [95]:
#calculating model perplexity

perplexity = lda_model.log_perplexity(corpus_gensim)

print(perplexity)

INFO:gensim.models.ldamodel:-17.899 per-word bound, 244415.9 perplexity estimate based on a held-out corpus of 32678 documents with 930652 words


-17.898978830694915


In [96]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=[all_words], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

INFO:gensim.topic_coherence.probability_estimation:using ParallelWordOccurrenceAccumulator(processes=11, batch_size=64) to estimate probabilities from sliding windows
INFO:gensim.topic_coherence.text_analysis:1 batches submitted to accumulate stats from 64 documents (930543 virtual)
INFO:gensim.topic_coherence.text_analysis:11 accumulators retrieved from output queue
INFO:gensim.topic_coherence.text_analysis:accumulated word occurrence stats for 930543 virtual documents


Coherence Score:  0.31711536578704314


In [97]:
# Print the topics
print("\nTopics:")
for topic_id, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {topic_id}: {topic}")

INFO:gensim.models.ldamodel:topic #77 (0.002): 0.226*"egg" + 0.185*"tinge" + 0.086*"shaped" + 0.043*"yellow" + 0.035*"bark" + 0.028*"red" + 0.024*"shrub" + 0.022*"green" + 0.018*"sweetened" + 0.018*"tree"
INFO:gensim.models.ldamodel:topic #192 (0.002): 0.367*"scaly" + 0.179*"greybrown" + 0.062*"wind" + 0.041*"bark" + 0.023*"purple" + 0.020*"grown" + 0.015*"seed" + 0.013*"dark" + 0.010*"tree" + 0.009*"occur"
INFO:gensim.models.ldamodel:topic #190 (0.002): 0.297*"coloured" + 0.053*"fruit" + 0.051*"shrub" + 0.050*"tall" + 0.043*"oval" + 0.036*"round" + 0.031*"tropical" + 0.026*"branch" + 0.025*"shiny" + 0.024*"yellow"
INFO:gensim.models.ldamodel:topic #401 (0.002): 0.000*"opposed" + 0.000*"predicting" + 0.000*"informant" + 0.000*"isnt" + 0.000*"labour" + 0.000*"learn" + 0.000*"marginally" + 0.000*"nothing" + 0.000*"hurry" + 0.000*"organism"
INFO:gensim.models.ldamodel:topic #24 (0.002): 0.129*"hair" + 0.089*"rusty" + 0.051*"branch" + 0.047*"tropical" + 0.044*"brown" + 0.035*"small" + 0.03


Topics:
Topic 77: 0.226*"egg" + 0.185*"tinge" + 0.086*"shaped" + 0.043*"yellow" + 0.035*"bark" + 0.028*"red" + 0.024*"shrub" + 0.022*"green" + 0.018*"sweetened" + 0.018*"tree"
Topic 192: 0.367*"scaly" + 0.179*"greybrown" + 0.062*"wind" + 0.041*"bark" + 0.023*"purple" + 0.020*"grown" + 0.015*"seed" + 0.013*"dark" + 0.010*"tree" + 0.009*"occur"
Topic 190: 0.297*"coloured" + 0.053*"fruit" + 0.051*"shrub" + 0.050*"tall" + 0.043*"oval" + 0.036*"round" + 0.031*"tropical" + 0.026*"branch" + 0.025*"shiny" + 0.024*"yellow"
Topic 401: 0.000*"opposed" + 0.000*"predicting" + 0.000*"informant" + 0.000*"isnt" + 0.000*"labour" + 0.000*"learn" + 0.000*"marginally" + 0.000*"nothing" + 0.000*"hurry" + 0.000*"organism"
Topic 24: 0.129*"hair" + 0.089*"rusty" + 0.051*"branch" + 0.047*"tropical" + 0.044*"brown" + 0.035*"small" + 0.035*"fruit" + 0.035*"wide" + 0.032*"yellow" + 0.032*"tree"
Topic 419: 0.101*"wrapped" + 0.092*"scented" + 0.067*"next" + 0.027*"pregnancy" + 0.018*"recommended" + 0.016*"sweetly"

In [98]:
topic_documents = {i: [] for i in range(n_topics)}

In [99]:
# Search for documents belonging to each topic based on the scientific name
for doc_index, doc in enumerate(corpus):
    # Get the topic distribution for the document
    doc_topics = lda_model[corpus_gensim[doc_index]]
    # Sort the topics by their probability in the document
    sorted_topics = sorted(doc_topics, key=lambda x: x[1], reverse=True)
    # Check if there are topics associated with the document
    if sorted_topics:
        # Get the most dominant topic ID
        dominant_topic_id = sorted_topics[0][0]
        # Append the document to the corresponding topic
        topic_documents[dominant_topic_id].append(doc)

In [100]:
# Print documents for each topic
for topic_id, documents in topic_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_tuple in documents:
        scientific_name = doc_tuple[0]
        description = doc_tuple[1]
        print(f" - {scientific_name}: {description}")


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [101]:
# Group documents by topics
topics_documents = {i: [] for i in range(n_topics)}
for doc_index, doc_topics in enumerate(lda_model.get_document_topics(corpus_gensim)):
    if doc_topics:  # Check if the document has a non-empty topic distribution
        dominant_topic = max(doc_topics, key=lambda x: x[1])[0]
        topics_documents[dominant_topic].append(doc_index)

In [102]:
# Show the contents of documents for each topic
for topic_id, document_indices in topics_documents.items():
    print(f"\nDocuments for Topic {topic_id}:")
    for doc_index in document_indices[:5]:
        if 0 <= doc_index < len(corpus):
            print(f"Document Index: {doc_index}")
            print(f"Document Content: {corpus[doc_index]}")  # Print the original content of the document
            print("="*50)


Documents for Topic 0:

Documents for Topic 1:
Document Index: 597
Document Content: TaggedDocument(['shrub', 'high', 'spread', 'across', 'keep', 'growing', 'year', 'year', 'loses', 'year', 'slender', 'green', 'smell', 'strongly', 'lemon', 'oil', 'gland', 'give', 'sticky', 'almost', 'rough', 'feel', 'purple', 'white', 'form', 'feather', 'like', 'group', 'end', 'branch', 'aloysia', 'citriodora', 'young', 'eaten', 'cooked', 'raw', 'salad', 'lemon', 'fresh', 'flavour', 'fruit', 'salad', 'punch', 'fruit', 'cup', 'fresh', 'dried', 'grown', 'cutting', 'tree', 'best', 'pruned', 'give', 'thick', 'clumpy', 'bush', 'warm', 'temperate', 'requires', 'sunny', 'sheltered', 'position', 'need', 'moderately', 'fertile', 'well', 'drained', 'soil', 'requires', 'warm', 'damp', 'climate', 'hobart', 'botanical', 'garden', 'lippia', 'citriodora', 'suit', 'hardiness', 'zone', 'arboretum', 'tasmania', 'cultivated', 'food'], ['597'])
Document Index: 750
Document Content: TaggedDocument(['branched', 'branch', '

In [103]:
# Scientific name to search
search_scientific_name = "Malus pumila"

# Find the topic of the document associated with the searched scientific name
search_topic = None
for topic, document_indices in topics_documents.items():
    for doc_index in document_indices:
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        scientific_name = row['ScientificName']
        if scientific_name == search_scientific_name:
            search_topic = topic
            search_doc_index = doc_index  # Store the index of the searched document
            break
    if search_topic is not None:
        break

In [104]:
from scipy.sparse import csr_matrix

# List to store similarity scores
similarity_scores = []

# Search for the scientific name and determine its topic
topic_id = None
for topic, document_indices in topics_documents.items():
    for doc_index in document_indices:
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        scientific_name = row['ScientificName']
        
        # If the scientific name matches, determine the topic and break the loop
        if scientific_name == search_scientific_name:
            topic_id = topic
            break
    if topic_id is not None:
        break

In [105]:
# If the scientific name is found
if topic_id is not None:
    # Get the document index of the searched scientific name
    search_doc_index = topics_documents[topic_id][0] 
    
    # Convert topic distribution vectors to dense arrays
    search_vector = lda_model[corpus_gensim[search_doc_index]]
    search_vector_dense = [prob for _, prob in search_vector]
    
    # Compute cosine similarity between the searched document and all other documents
    for doc_index in range(len(corpus)):
        if doc_index != search_doc_index:  # Exclude the searched document itself
            # Convert topic distribution vectors to dense arrays
            doc_vector = lda_model[corpus_gensim[doc_index]]
            doc_vector_dense = [prob for _, prob in doc_vector]
            
            # Compute cosine similarity if the dimensions are compatible
            if len(search_vector_dense) == len(doc_vector_dense):
                similarity_score = cosine_similarity([search_vector_dense], [doc_vector_dense])[0][0]
                similarity_scores.append((doc_index, similarity_score))
    
    # Sort similarity scores based on similarity score
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    
    # Print top 10 most similar documents
    print(f"Top 10 Most Similar Documents to '{search_scientific_name}':")
    for i, (doc_index, similarity_score) in enumerate(similarity_scores[:10], 1):
        # Get the original content from the DataFrame
        row = df.iloc[doc_index]
        other_scientific_name = row['ScientificName']
        common_names = row['CommonNames']
        description = row['Description']
        
        # Print document details
        print(f"\nDocument {i}")
        print(f"Scientific Name: {other_scientific_name}")
        print(f"Common Names: {common_names}")
        print(f"Description: {description}")
        print(f"Similarity Score: {similarity_score}")
        print("="*50)
else:
    print(f"The scientific name '{search_scientific_name}' was not found in any document.")

Top 10 Most Similar Documents to 'Malus pumila':

Document 1
Scientific Name: Stillingia salpingadenia
Common Names: nan
Description: The branches are short and leafy.  The leaves are 4-9 cm long by 1-2 cm wide.  They are narrowly sword shaped.  They have small teeth along the edge.  The flowers are at the ends of branches and in spikes 4-5 cm long.  There are 3-9 flowers.  
Similarity Score: 0.9868494272232056

Document 2
Scientific Name: Polygonum thunbergii
Common Names: nan
Description: A herb.  It grows 50-70 cm tall.  The leaves are 5-7 cm long.  They are triangle shaped.  They are prickly underneath.  The flowers have hairs.  
Similarity Score: 0.9836193323135376

Document 3
Scientific Name: Cardamine komarovii
Common Names: nan
Description: A cabbage family herb.  It grows between 12-75 cm tall. 
Similarity Score: 0.9819674491882324

Document 4
Scientific Name: Petasites palmatus
Common Names: Palmate butterbur, Sweet coltsfoot, 
Description: A herb.  
Similarity Score: 0.98119

In [106]:
from scipy.sparse import csr_matrix

# Function to search using a scientific name and find similar scientific names within the same topic
def search_scientific_name(lda_model, corpus_gensim, dictionary, df, scientific_name):
    # Search for the scientific name in the DataFrame
    scientific_name = scientific_name.lower()
    scientific_name_row = df[df['ScientificName'].str.lower() == scientific_name]
    
    if not scientific_name_row.empty:
        scientific_name_index = scientific_name_row.index[0]
        
        # Determine the topic of the searched scientific name
        query_vector = lda_model[corpus_gensim[scientific_name_index]]
        query_topic = max(query_vector, key=lambda x: x[1])[0]
        
        # Retrieve other scientific names within the same topic
        topic_documents = topics_documents[query_topic]
        similar_names = []
        similarity_scores = []
        
        for doc_index in topic_documents:
            if doc_index != scientific_name_index:
                other_name = df.iloc[doc_index]['ScientificName']
                other_vector = lda_model[corpus_gensim[doc_index]]
                
                # Convert sparse matrices to dense arrays
                query_vector_dense = csr_matrix(query_vector)
                other_vector_dense = csr_matrix(other_vector)
                
                similarity = cosine_similarity(query_vector_dense, other_vector_dense)[0][0]
                
                similar_names.append(other_name)
                similarity_scores.append(similarity)
                
        return query_topic, similar_names, similarity_scores
    
    else:
        return None, None, None

In [107]:
import pandas as pd

scientific_names_df = pd.read_csv(r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\top50_new_again.csv", encoding='latin1')

results_df = pd.DataFrame(columns=['Searched Scientific Name', 'Topic', 'Similar Scientific Names', 'Similarity Scores'])

for index, row in scientific_names_df.iterrows():
    search_query = row['ScientificName'] 

    topic, similar_names, similarity_scores = search_scientific_name(lda_model, corpus_gensim, dictionary, df, search_query)

    if topic is not None:
        combined_results = list(zip(similar_names, similarity_scores))

        # Sort combined results by similarity scores
        sorted_results = sorted(combined_results, key=lambda x: x[1], reverse=True)

        # Append the results 
        results_df = results_df.append({'Searched Scientific Name': search_query,
                                        'Topic': topic,
                                        'Similar Scientific Names': [name for name, _ in sorted_results],
                                        'Similarity Scores': [score for _, score in sorted_results]}, 
                                        ignore_index=True)
    else:
        # Handle case where scientific name is not found
        results_df = results_df.append({'Searched Scientific Name': search_query,
                                        'Topic': None,
                                        'Similar Scientific Names': [],
                                        'Similarity Scores': []}, 
                                        ignore_index=True)

# Extract unique 'Similar Scientific Names'
unique_names = set()
for names_list in results_df['Similar Scientific Names']:
    unique_names.update(names_list)

# Create a new DataFrame to store transposed results
transposed_df = pd.DataFrame(index=unique_names)

# Transpose the data
for index, row in results_df.iterrows():
    for name, score in zip(row['Similar Scientific Names'], row['Similarity Scores']):
        transposed_df.loc[name, row['Searched Scientific Name']] = score

output_excel_path = r"C:\Users\USER\Documents\Github\h4h-submit version\OUTPUT DATA OF MODELS\LDA_FPI_top_results_new.xlsx"
transposed_df.to_excel(output_excel_path)

print(output_excel_path)


  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df = results_df.append({'Searched Scientific Name': search_query,
  results_df

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required by check_pairwise_arrays.