In [1]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import logging
from nltk.tokenize import word_tokenize
from collections import Counter
import re
from sklearn.metrics.pairwise import cosine_similarity
import itertools

In [2]:
#NLTK Resources ni 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
#streaming
class DocumentStreamer(object):
    def __init__(self, df):
        self.df = df

    def __iter__(self):
        for index, row in self.df.iterrows():
            # Tokenize the text
            tokens = word_tokenize(row['text'])
            yield TaggedDocument(words=tokens, tags=[index])

In [5]:
# List of file paths
file_paths = [r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_first_5000_done.xlsx",
            r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_second_5000_done.xlsx",
             r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_third_5000_done.xlsx",
             r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_fourth_5000_done.xlsx",
             r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_fifth_5000_done.xlsx",
             r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_sixth_5000_done.xlsx",
             r"C:\Users\USER\Documents\Github\h4h-submit version\INPUT DATA FOR MODELS\for_scraping_seventh_5000_done.xlsx"]

In [6]:
# Initialize an empty dataframe
all_dataframes = []

In [7]:
# Load data from multiple Excel files
for file_path in file_paths:
    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        all_dataframes.append(df)
    else:
        print(f"File not found: {file_path}")

In [8]:
df = pd.concat(all_dataframes, ignore_index=True)

In [9]:
# I-exclude ang rows kung asa ang "Title" column contains "Not Found" and the "Content" column is empty
df = df[(df['ScientificName'] != 'Not Found') & (df['Content'].notnull())]

In [10]:
df

Unnamed: 0,ScientificName,Title,Content,TableData
0,Abelmoschus ficulneus,Abelmoschus ficulneus,Abelmoschus ficulneus is a species of flowerin...,"[['Abelmoschus ficulneus', ''], ['', ''], ['Co..."
1,Abelmoschus manihot,Abelmoschus manihot,"Abelmoschus manihot, commonly known as aibika,...","[['Abelmoschus manihot', ''], ['', ''], ['Scie..."
2,Abelmoschus moschatus,Abelmoschus moschatus,"Abelmoschus moschatus (Abelmosk, ambrette, ann...","[['Abelmoschus moschatus', ''], ['', ''], ['Sc..."
3,Abies firma,Abies firma,"Abies firma, the momi fir, is a species of fir...","[['Abies firma', ''], ['', ''], ['Foliage', ''..."
4,Abies grandis,Abies grandis,"Abies grandis (grand fir, giant fir, lowland w...","[['Abies grandisGrand fir', ''], ['', ''], ['C..."
...,...,...,...,...
17172,Allium,Allium spp.,Allium spp. is a genus of monocotyledonous fl...,
17173,Citrus spp.,Citrus spp.,Citrus spp. is a genus of flowering trees and ...,
17174,Arachis hypogea,Arachis hypogea,"The peanut (Arachis hypogaea), also known as t...",
17175,Cucurbita spp.,Cucurbita spp.,Cucurbita spp. (Latin for 'gourd')[3][4] is a ...,


In [11]:
# words to remove
words_to_remove = [ "plant", "plants", "specie", 'flower', 'reference', 'external', 'links', 'also', 'var', 'name', 'used', 'leaf', 'tree', 'rknuth', '-']

In [12]:
# Additional step: Remove duplicates from the DataFrame
df = df.drop_duplicates(subset=['ScientificName'])

# Preprocess the data and create TaggedDocument instances
documents = []
all_words = []

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() #reduce a word to its base or dictionary form ( eg running to run or better to good)

# mag-efine og translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)

for index, row in df.iterrows():
    title = row['ScientificName']
    content = row['Content']
    # mag-combine sa title and content na columns for better representation
    combined_text = f"{title.lower()} {content.lower()}"
    
    #Tokenize the text and lemmatize
    words = [
        lemmatizer.lemmatize(word.lower()) 
        for word in nltk.word_tokenize(combined_text.translate(translator)) 
        if word.lower() not in stop_words
        #and word != 'species' and word != 'plant' and word != 'flowers'
    ]
    
    #Remove specific words
    for word_to_remove in words_to_remove:
        words = [word for word in words if word != word_to_remove.lower()]

    documents.append(TaggedDocument(words, [str(index)]))
    all_words.extend(words)

In [13]:
#Build a corpus from TaggedDocument instances
corpus = documents

In [14]:
corpus[1]

TaggedDocument(words=['abelmoschus', 'manihot', 'abelmoschus', 'manihot', 'commonly', 'known', 'aibika', 'flowering', 'family', 'malvaceae', 'previously', 'classified', 'hibiscus', 'categorized', 'genus', 'abelmoschus', 'referred', 'sunset', 'muskmallow', 'sunset', 'hibiscus', 'hibiscus', 'manihot', 'growth', 'habit', 'although', 'technically', 'shrub', 'aibika', 'perennial', 'favorable', 'condition', 'grow', 'three', 'meter', 'height', 'easily', 'propagated', 'cutting', 'relatively', 'diseaseresistant', 'result', 'widely', 'cultivated', 'often', 'found', 'along', 'garden', 'border', 'intercrop', 'traditional', 'tropical', 'garden', 'growth', 'habit', 'along', 'nutritional', 'value', 'contributes', 'popularity', 'home', 'gardening', 'horticulture', 'nutrition', 'aibika', 'renowned', 'highly', 'nutritious', 'property', 'rich', 'essential', 'vitamin', 'including', 'high', 'content', 'vitamin', 'c', 'well', 'iron', 'moreover', 'contain', 'approximately', '12', 'protein', 'dry', 'weight', 

In [15]:
#i-determine ang number of corpus
n_docs = len(corpus)
print("Number of documents in the corpus:", n_docs)

Number of documents in the corpus: 17128


In [16]:
#Build a dictionary mapping words to their frequencies
word_frequency_dict = Counter(all_words)

In [17]:
#Print the most frequent words
most_common_words = Counter(all_words).most_common(10)
print("\nMost Common Words:")
for word, frequency in most_common_words:
    print(f"{word}: {frequency}")


Most Common Words:
dioscorea: 77368
–: 26316
fruit: 19034
genus: 18004
long: 17436
found: 16310
native: 15986
seed: 15262
cm: 15062
known: 14714


In [18]:
# Train a Doc2Vec model
model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=10)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

INFO:gensim.utils:Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,s0.001,t4)', 'datetime': '2025-03-30T21:07:51.952003', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'created'}
INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #10000, processed 3019175 words (5254086/s), 161462 word types, 10000 tags
INFO:gensim.models.doc2vec:collected 199090 word types and 17128 unique tags from a corpus of 17128 examples and 4732158 words
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Doc2Vec lifecycle event {'msg': 'effective_min_count=1 retains 199090 unique words (100.0%% of original 199090, drops 0)', 'datetime': '2025-03-30T21:07:53.605587', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 6 : training on 4732158 raw words (4659690 effective words) took 5.5s, 852122 effective words/s
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 13.95% examples, 786504 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 28.09% examples, 761572 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 43.53% examples, 767574 words/s, in_qsize 8, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 60.80% examples, 764208 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 81.07% examples, 773231 words/s, in_qsize 7, out_qsize 0
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of

In [19]:
#kuhaon ang inferred vectors for each document
#vectors = [model.dv[index] for index in range(len(df))]

In [20]:
vectors = [model.dv[str(index)] for index in df.index]

In [21]:
vectors_array = np.array(vectors)

In [22]:
'''PERFORM CLUSTER ANALYSIS'''

'PERFORM CLUSTER ANALYSIS'

In [23]:
num_clusters = 1000
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(vectors)

KMeans(n_clusters=1000, random_state=42)

In [24]:
df['cluster'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cluster'] = kmeans.labels_


In [25]:
labels = kmeans.labels_

In [28]:
from sklearn import metrics

In [29]:
metrics.silhouette_score(vectors, labels, metric='euclidean')

-0.018159125

In [30]:
cluster_distribution = df['cluster'].value_counts()
print("Cluster Distribution:")
print(cluster_distribution)

Cluster Distribution:
460    1159
553    1032
451     940
132     671
79      419
       ... 
904       1
547       1
474       1
499       1
366       1
Name: cluster, Length: 1000, dtype: int64


In [31]:
cluster_dict = {}

In [32]:
# Populate the dictionary with cluster information
for cluster_id in range(num_clusters):
    cluster_titles = df[df['cluster'] == cluster_id]['ScientificName']
    cluster_dict[cluster_id] = list(cluster_titles)

In [33]:
# Function to search for a title and retrieve cluster information
def search_title(title):
    for cluster_id, titles in cluster_dict.items():
        if title in titles:
            return cluster_id, titles

In [34]:
def calculate_similarity(vectors):
    return cosine_similarity(vectors)

In [35]:
searched_title = "Oryza sativa" 
result = search_title(searched_title)

In [36]:
######para di magbalikbalik ang sciname

In [37]:
if result:
    cluster_id, titles = result
    print(f"Cluster ID: {cluster_id}")
    print(f"Titles in Cluster {cluster_id}:\n{titles}")

    # Get vectors and titles for the specific cluster
    cluster_vectors = vectors_array[df['cluster'] == cluster_id]
    cluster_titles = df[df['cluster'] == cluster_id]['ScientificName']

    # Calculate cosine similarity matrix
    similarity_matrix = calculate_similarity(cluster_vectors)

    # Collect and sort the similarity pairs
    similarity_pairs = []
    for i, title_i in enumerate(cluster_titles):
        for j, title_j in enumerate(cluster_titles):
            if i < j:
                similarity = similarity_matrix[i, j]
                similarity_pairs.append(((title_i, title_j), similarity))

    # Sort the pairs by similarity in descending order
    similarity_pairs.sort(key=lambda x: x[1], reverse=True)

    # Get the top 50 titles most similar to the searched title
    top_similar_titles = []
    added_titles = set()  # Initialize a set to keep track of titles that have been added
    for pair, similarity in similarity_pairs[:50]:
        title_i, title_j = pair
        # Add the title that is not the searched title
        if title_i.lower() == searched_title.lower():
            if title_j not in added_titles:
                top_similar_titles.append((title_j, similarity))
                added_titles.add(title_j)
        else:
            if title_i not in added_titles:
                top_similar_titles.append((title_i, similarity))
                added_titles.add(title_i)

    # Print the top 50 titles with their similarity scores
    print(f"\nTop 10 Titles Similar to '{searched_title}':")
    for i, (title, similarity) in enumerate(top_similar_titles, start=1):
        print(f"{i}. {title} - Similarity: {similarity:.4f}")
else:
    print(f"Title '{searched_title}' not found in any cluster.")

Cluster ID: 525
Titles in Cluster 525:
['Oryza punctata', 'Oryza sativa', 'Oryza coarctata', 'Oryza minuta', 'Oryza meyeriana', 'Oryza longiglumis', 'Oryza ridleyi', 'Oryza schlechteri']

Top 10 Titles Similar to 'Oryza sativa':
1. Oryza ridleyi - Similarity: 0.9759
2. Oryza coarctata - Similarity: 0.9728
3. Oryza longiglumis - Similarity: 0.9705
4. Oryza minuta - Similarity: 0.9691
5. Oryza meyeriana - Similarity: 0.9660
6. Oryza schlechteri - Similarity: 0.6553
7. Oryza punctata - Similarity: 0.6324


In [38]:
import pandas as pd

scientific_names_df = pd.read_csv(r"C:\Users\USER\Downloads\FFAR NEW\top50_new_again.csv", encoding='latin1')


scientific_names = scientific_names_df['ScientificName'].tolist()

In [39]:
for searched_title in scientific_names:
    result = search_title(searched_title)

    if result:
        cluster_id, titles = result
#      print(f"Cluster ID for '{searched_title}': {cluster_id}")
#        print(f"Titles in Cluster {cluster_id}:\n{titles}")

        # Get vectors and titles for the specific cluster
        cluster_vectors = vectors_array[df['cluster'] == cluster_id]
        cluster_titles = df[df['cluster'] == cluster_id]['ScientificName']

        # Calculate cosine similarity matrix
        similarity_matrix = calculate_similarity(cluster_vectors)

        # Collect and sort the similarity pairs
        similarity_pairs = []
        for i, title_i in enumerate(cluster_titles):
            for j, title_j in enumerate(cluster_titles):
                if i < j:
                    similarity = similarity_matrix[i, j]
                    similarity_pairs.append(((title_i, title_j), similarity))

        # Sort the pairs by similarity in descending order
        similarity_pairs.sort(key=lambda x: x[1], reverse=True)

        # Get the top 50 titles most similar to the searched title
        top_similar_titles = []
        added_titles = set()  # Initialize a set to keep track of titles that have been added
        for pair, similarity in similarity_pairs[:500]:
            title_i, title_j = pair
            # Add the title that is not the searched title
            if title_i.lower() == searched_title.lower():
                if title_j not in added_titles:
                    top_similar_titles.append((title_j, similarity))
                    added_titles.add(title_j)
            else:
                if title_i not in added_titles:
                    top_similar_titles.append((title_i, similarity))
                    added_titles.add(title_i)

        # Print the top 50 titles with their similarity scores
        print(f"\nTop 10 Titles Similar to '{searched_title}':")
        for i, (title, similarity) in enumerate(top_similar_titles, start=1):
            print(f"{i}. {title} - Similarity: {similarity:.4f}")
    else:
        print(f"Title '{searched_title}' not found in any cluster.")


Top 10 Titles Similar to 'Apium graveolens':
1. Apium graveolens var. secalinum - Similarity: 0.9589

Top 10 Titles Similar to 'Solanum tuberosum':

Top 10 Titles Similar to 'Malus domestica':

Top 10 Titles Similar to 'Musa × paradisiaca':
1. Musa x paradisiaca - Similarity: 0.8954
2. Pelargonium zonale - Similarity: 0.6841
3. Salix fragilis - Similarity: 0.6688
4. Asplenium bulbiferum - Similarity: 0.6601
5. Erythrina bidwillii - Similarity: 0.6445
6. Gladiolus dalenii - Similarity: 0.6193
7. Selenicereus anthonyanus - Similarity: 0.6178
8. Fuchsia triphylla - Similarity: 0.6145
9. Rosa pendulina - Similarity: 0.6117
10. Taxus x media - Similarity: 0.6051
11. Vachellia hebeclada - Similarity: 0.5206
12. Petunia x atkinsiana - Similarity: 0.5075
13. Chlorophytum comosum - Similarity: 0.4740
14. Equisetum scirpoides - Similarity: 0.4555
15. Aloidendron barberae - Similarity: 0.4002

Top 10 Titles Similar to 'Capsicum annuum':
1. Cucumis sativus - Similarity: 0.6743
2. Cucumis melo - S


Top 10 Titles Similar to 'Brassica oleracea var. capitata':
1. Phyllostachys viridiglaucescens - Similarity: 0.9610
2. Potentilla supina - Similarity: 0.9590
3. Grielum humifusum - Similarity: 0.9530
4. Celtis luzonica - Similarity: 0.9515
5. Geissanthus vanderwerffii - Similarity: 0.9501
6. Burckella sorei - Similarity: 0.9496
7. Smyrnium creticum - Similarity: 0.9486
8. Canarium kipella - Similarity: 0.9481
9. Macrozamia pauli-guilielmi - Similarity: 0.9465
10. Mangifera macrocarpa - Similarity: 0.9464
11. Polygala persicariifolia - Similarity: 0.9464
12. Neolemonniera clitandrifolia - Similarity: 0.9459
13. Cyathea brunoniana - Similarity: 0.9459
14. Ramaria vinosimaculans - Similarity: 0.9457
15. Vicia pisiformis - Similarity: 0.9456
16. Knema riangensis - Similarity: 0.9456
17. Senna surattensis - Similarity: 0.9427
18. Mezilaurus itauba - Similarity: 0.9421
19. Macleania loeseneriana - Similarity: 0.9417
20. Inga silanchensis - Similarity: 0.9414
21. Micropholis crassipedicellat

In [40]:
import pandas as pd

# Initialize a dictionary to store similarity scores
similarity_dict = {}

# Create a list to store all unique scientific names
all_scientific_names = []

for searched_title in scientific_names:
    result = search_title(searched_title)

    if result:
        cluster_id, titles = result

        # Get vectors and titles for the specific cluster
        cluster_vectors = vectors_array[df['cluster'] == cluster_id]
        cluster_titles = df[df['cluster'] == cluster_id]['ScientificName']

        # Calculate cosine similarity matrix
        similarity_matrix = calculate_similarity(cluster_vectors)

        # Collect and sort the similarity pairs
        similarity_pairs = []
        for i, title_i in enumerate(cluster_titles):
            for j, title_j in enumerate(cluster_titles):
                if i < j:
                    similarity = similarity_matrix[i, j]
                    similarity_pairs.append(((title_i, title_j), similarity))

        # Sort the pairs by similarity in descending order
        similarity_pairs.sort(key=lambda x: x[1], reverse=True)

        # Get the top 50 titles most similar to the searched title
        top_similar_titles = []
        added_titles = set()  # Initialize a set to keep track of titles that have been added
        for pair, similarity in similarity_pairs[:50]:
            title_i, title_j = pair
            # Add the title that is not the searched title
            if title_i.lower() == searched_title.lower():
                if title_j not in added_titles:
                    top_similar_titles.append((title_j, similarity))
                    added_titles.add(title_j)
            else:
                if title_i not in added_titles:
                    top_similar_titles.append((title_i, similarity))
                    added_titles.add(title_i)

        # Store similarity scores in the dictionary
        similarity_dict[searched_title] = {title: similarity for title, similarity in top_similar_titles}

        # Update the list of all scientific names
        all_scientific_names.extend(cluster_titles)

    else:
        print(f"Title '{searched_title}' not found in any cluster.")

# Remove duplicates from the list of all scientific names
all_scientific_names = list(set(all_scientific_names))

# Create DataFrame with all scientific names as row names
similarity_df = pd.DataFrame(index=all_scientific_names)

# Add columns for the top 50 similar scientific names
for searched_title, similar_titles in similarity_dict.items():
    for similar_title, similarity in similar_titles.items():
        similarity_df.at[similar_title, searched_title] = similarity

# Save the DataFrame to an Excel file
similarity_df.to_excel(r"C:\Users\USER\Documents\Github\h4h-submit version\OUTPUT DATA OF MODELS\wiki_results_final_new.xlsx")

print("Results saved to wiki_results_final_new.xlsx file.")


Results saved to wiki_results_final_new.xlsx file.
