In [1]:
import re
import gensim
import pandas as pd
import logging
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK data this notebook relies on: the stop-word lists (used to build
# stop_words) and WordNet (required by WordNetLemmatizer). As the output below shows,
# packages that are already installed and up to date are not downloaded again.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Show log records of level INFO and above, each prefixed with timestamp and level.
# The gensim progress messages printed under the model cells come through this setup.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
# Terms that clean_text() deletes from a document. The deletion runs before the text
# is lowercased, so matching is case-sensitive: "References" is removed, "references" is not.
words_to_remove = ["References", "External links", "Galleries", "plant", "plants", "crop", "crops", "Wikimedia", "Wikispecies"]

In [5]:
# Excel workbooks to train on. All files share one directory and name prefix and
# differ only in the batch suffix, so the list is built from the suffixes.
#
# Only the first three batches are active. The remaining batches of the series are
# currently disabled; add a suffix to the tuple below to include that workbook:
#   Fourth_3001-6000_First_500, Fourth_3001-6000_Second_500,
#   Fourth_3001-6000_Third_500, Fourth_3001-6000_Fourth_500,
#   Fourth_3001-6000_Fifth_1000, Fifth_1000, Sixth_1000, Seventh_1000,
#   Eighth_1000, Ninth_1000, Tenth_1000, Eleventh_2000, Twelveth_2000,
#   Thirteenth_2000, Fourteenth_2000
file_paths = [
    'C:/Users/USER/Final Scraped/wiki_scraped_FPIDatabase_RA_' + batch + '.xlsx'
    for batch in ('First_1000', 'Second_1000', 'Third_1000')
]

In [6]:
# file_paths = ['C:/Users/USER/Downloads/Agrifood System - Copy.xlsx']

In [7]:
# NLTK's English stop-word list, stored as a set because clean_text() tests
# membership once per token.
stop_words = set(stopwords.words('english'))

In [8]:
# WordNet lemmatizer: reduces a word to its dictionary form. lemmatize() treats the
# word as a noun unless a part-of-speech tag is passed, so as called in this notebook
# it maps e.g. "crops" -> "crop"; "running" -> "run" would need pos='v' and
# "better" -> "good" would need pos='a'.
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_text(text):
    """Normalise raw document text into a list of lemmatised tokens.

    Steps, in order:
      1. delete every entry of ``words_to_remove`` (case-sensitive, whole words
         or phrases only);
      2. lowercase the text;
      3. replace every character that is not an ASCII letter or whitespace with
         a space;
      4. tokenise with ``gensim.utils.simple_preprocess``;
      5. drop NLTK English stop words and one-character tokens, then lemmatise
         what is left.

    Args:
        text (str): Raw text of one document.

    Returns:
        list[str]: The cleaned tokens, in document order.
    """
    # Match whole words/phrases only. Plain str.replace() also hits substrings, so
    # removing "plant" used to turn "plantation" into "ation" and "plants" into "s".
    # Longest entries go first so a phrase wins over any shorter entry it contains.
    removal_pattern = '|'.join(
        re.escape(term) for term in sorted(words_to_remove, key=len, reverse=True)
    )
    if removal_pattern:
        text = re.sub(r'\b(?:' + removal_pattern + r')\b', ' ', text)

    # Convert text to lowercase
    text = text.lower()

    # Replace digits, punctuation and other non-letters with a space rather than
    # deleting them. Deleting fuses neighbours ("disease-resistant" ->
    # "diseaseresistant"), and simple_preprocess discards tokens longer than its
    # default max_len of 15 characters, so such words were lost altogether.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize, then remove stop words and one-character tokens before lemmatizing.
    # The stop-word test uses the surface form, i.e. it runs before lemmatization.
    tokens = gensim.utils.simple_preprocess(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]

    return tokens

In [10]:
def read_corpus_from_excel(file_path, tokens_only=False):
    """Yield one tokenised document per unique, non-empty cell of the 'Content' column.

    Args:
        file_path: Path of the Excel workbook to read. It must contain a
            'Content' column.
        tokens_only (bool): If True, yield plain token lists. Otherwise yield
            ``TaggedDocument`` objects tagged ``"<file_path>-<row position>"``.

    Yields:
        list[str] or gensim.models.doc2vec.TaggedDocument: One item per kept row,
        in row order.

    Notes:
        * Duplicates are detected within this workbook only; the same text in a
          different file is yielded again by that file's call.
        * NOTE(review): tokens come straight from ``simple_preprocess``. The
          notebook's ``clean_text`` (stop-word removal, lemmatisation) is not
          applied here - confirm that this is intended.
    """
    # Read data
    data = pd.read_excel(file_path)

    # Texts already yielded from this workbook, used to skip repeated rows
    seen_content = set()

    for i, item in enumerate(data['Content']):
        # Skip missing cells
        if pd.isna(item):
            continue

        # Convert before using string methods: pandas returns numeric or date cells
        # as non-string objects, on which .strip() would raise AttributeError.
        line = str(item).strip()

        # Skip blank cells and content that has been seen before
        if not line or line in seen_content:
            continue
        seen_content.add(line)

        # Tokenize the content
        tokens = gensim.utils.simple_preprocess(line)

        if tokens_only:
            yield tokens
        else:
            # The tag records the row's position in the column (skipped rows are
            # counted), so it identifies the source row, not the position of the
            # document in the yielded sequence.
            tag = str(file_path) + '-' + str(i)

            # Yield a TaggedDocument, which pairs the tokens with the tag
            yield gensim.models.doc2vec.TaggedDocument(tokens, [tag])


In [11]:
# Training data for the Doc2Vec model: the TaggedDocuments of every workbook in
# file_paths, concatenated in file order. Each file is read completely before its
# documents are appended. Duplicate removal happens inside read_corpus_from_excel,
# i.e. per file, so text repeated across files appears more than once here.
train_corpus = []
for file_path in file_paths:
    train_corpus.extend(list(read_corpus_from_excel(file_path)))

In [30]:
# Untrained Doc2Vec model: 100-dimensional document vectors, words occurring fewer
# than 2 times are ignored, 10 training passes over the corpus. All other settings
# are gensim defaults (the log line below shows window 5 and 3 worker threads).
# NOTE(review): multi-threaded training is generally not bit-for-bit repeatable -
# confirm whether exact reproducibility of the vectors matters here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=10)

2023-11-15 10:48:42,482 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d100,n5,w5,mc2,s0.001,t3)', 'datetime': '2023-11-15T10:48:42.482586', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [31]:
# Scan the training corpus once to collect word counts and document tags and build
# the model's vocabulary. Words below min_count are dropped (see the log below).
model.build_vocab(train_corpus)

2023-11-15 10:48:42,693 : INFO : collecting all words and their counts
2023-11-15 10:48:42,695 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2023-11-15 10:48:42,931 : INFO : collected 60062 word types and 2236 unique tags from a corpus of 2236 examples and 1159683 words
2023-11-15 10:48:42,931 : INFO : Creating a fresh vocabulary
2023-11-15 10:48:43,047 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 32052 unique words (53.36485631514102%% of original 60062, drops 28010)', 'datetime': '2023-11-15T10:48:43.047325', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-11-15 10:48:43,047 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 1131673 word corpus (97.58468478023735%% of original 1159683, drops 28010)', 'datetime': '2023-11-15T10:48:43.047325', 'gensim': '4.1.2', 'python': '3.9.13 (main, 

In [32]:
# Sanity check on the vocabulary: report the stored occurrence count of one known
# word. get_vecattr raises KeyError if the word is not in the vocabulary.
print(f"Word 'rice' appeared {model.wv.get_vecattr('rice', 'count')} times in the training corpus.")

Word 'rice' appeared 172 times in the training corpus.


In [33]:
# Train on the same corpus the vocabulary was built from. The example count and
# the number of epochs are read back from the model so they stay consistent with
# build_vocab() and the constructor arguments.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2023-11-15 10:48:43,531 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 32052 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-11-15T10:48:43.531855', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-11-15 10:48:44,552 : INFO : EPOCH 1 - PROGRESS: at 96.24% examples, 849485 words/s, in_qsize 5, out_qsize 0
2023-11-15 10:48:44,586 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-11-15 10:48:44,588 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-11-15 10:48:44,599 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-11-15 10:48:44,600 : INFO : EPOCH - 1 : training on 1159683 raw words (902968 effective words) took 1.1s, 851357 effective words/s
2023-11-15 10:48:45,586 : INFO : worker thread finished; awaiting finish of

In [34]:
# Infer a vector for a new document that was not part of training. The document is
# passed as a list of tokens; the result is a numeric representation of it in the
# model's vector space.
vector = model.infer_vector(['rice', 'is', 'good'])
print(vector)

[ 0.0257604   0.01848064  0.00999208 -0.02328071 -0.00672734 -0.04123252
  0.0500344   0.03372451 -0.00288891 -0.01007057 -0.0067927  -0.01307904
  0.03552444  0.00578831  0.00870091 -0.03514152  0.04326287 -0.0025482
 -0.0058521  -0.06088119 -0.00902999  0.00983492 -0.05473233 -0.00046851
  0.00362012  0.02439969 -0.07253241  0.01170101  0.01202417 -0.02232227
  0.02386621 -0.02044021 -0.0171322   0.01971511  0.02492699  0.04119387
  0.04080671 -0.03070465 -0.02930884 -0.03181521  0.01328057 -0.05948669
 -0.02142613 -0.00641596 -0.01850217 -0.03005423 -0.0051272  -0.01908077
  0.01309885  0.01654437 -0.02684827 -0.03099038 -0.02748269 -0.0395469
 -0.03330617 -0.02476671 -0.03020239 -0.00429308 -0.01266488 -0.0037087
 -0.01994755 -0.03861113  0.03821676  0.02578732  0.0069153   0.02916013
 -0.00027237 -0.00118128 -0.00530585  0.05347286  0.00325842  0.01221904
 -0.00853222  0.00969818  0.04003515 -0.0192888  -0.02499124 -0.02300078
 -0.02936913 -0.01565136  0.01217097 -0.0078511  -0.02

In [35]:
# Corpus size; reused later as topn so that most_similar ranks every training document.
n_train_docs = len(train_corpus)
print("Number of documents in the corpus:", n_train_docs)

Number of documents in the corpus: 2236


In [36]:
def print_similar_documents(doc_id, sims, corpus=None):
    """Print the five most similar documents from a Doc2Vec similarity ranking.

    Args:
        doc_id: Identifier of the query document. Accepted for interface
            compatibility; the function does not use it.
        sims: Sequence of ``(tag, similarity)`` pairs ordered from most to least
            similar, e.g. the result of ``model.dv.most_similar``. Tags are
            expected in the ``"<file path>-<row index>"`` form.
        corpus: Tagged documents (objects with ``words`` and ``tags``) in which
            the tags are looked up. Defaults to the notebook-level
            ``train_corpus``.

    Entries whose tag has no ``-`` separator or does not occur in ``corpus`` are
    skipped. If ``sims`` has fewer than five entries, only those are printed.
    """
    if corpus is None:
        corpus = train_corpus

    # Resolve documents by tag, not by position. The number inside a tag is the row
    # index within its workbook, which differs from the document's position in the
    # corpus list as soon as rows were skipped or more than one file was loaded.
    docs_by_tag = {}
    for doc in corpus:
        docs_by_tag.setdefault(doc.tags[0], doc)

    labels = ('MOST', 'SECOND-MOST', 'THIRD-MOST', 'FOURTH-MOST', 'FIFTH-MOST')

    print("SIMILAR DOCS PER MODEL:\n")
    # zip() stops at the shorter sequence, so a ranking with fewer than five
    # entries no longer raises IndexError.
    for label, (tag, similarity_score) in zip(labels, sims):
        # Split on the last '-' only: the file path itself may contain '-'.
        tag_parts = tag.rsplit('-', 1)
        doc = docs_by_tag.get(tag)
        if len(tag_parts) != 2 or doc is None:
            continue

        file_path, doc_index = tag_parts
        doc_words = ' '.join(doc.words)
        print(f'{label} ({similarity_score:.2f}): Document at index {doc_index} in file {file_path}\n{doc_words}\n')

In [37]:
# Self-similarity check: re-infer a vector for one training document from its own
# tokens and rank all trained document vectors against it (topn = corpus size).
# The document's own tag is expected at or near the top of the ranking.
doc_id = 1 
inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=n_train_docs)
print_similar_documents(doc_id, sims)

SIMILAR DOCS PER MODEL:

MOST (0.93): Document at index 1 in file C:/Users/USER/Final Scraped/wiki_scraped_FPIDatabase_RA_First_1000.xlsx
abelmoschus manihot commonly known as aibika is flowering plant in the family malvaceae it was previously classified as species of hibiscus but is now categorized under the genus abelmoschus this plant is also referred to as the sunset muskmallow sunset hibiscus or hibiscus manihot growth habit although technically shrub aibika is perennial plant that under favorable conditions can grow over three meters in height it is easily propagated through cuttings and relatively disease resistant as result it is widely cultivated and often found along garden borders or as an intercrop in traditional tropical gardens its growth habit along with its nutritional value contributes to its popularity in home gardening and horticulture nutrition aibika is renowned for its highly nutritious properties its leaves are rich in essential vitamins including high content of