<a href="https://colab.research.google.com/github/flxhrdyn/NLP-Similar-Words-Finder/blob/main/NLP_Similar_Words_Finder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SIMILAR WORDS FINDER
## A simple program to find word similarity using Natural Language Processing

## 1. Library Import

In [11]:
import bs4 as bs  # Import BeautifulSoup library as bs
import nltk  # Import nltk (Natural Language Toolkit) library for natural language processing
import urllib.request  # Import urllib.request library to access URLs
import re  # Import re (regular expressions) library for text processing
import lxml  # Import lxml library for HTML parsing
import string  # Import String
from nltk.corpus import stopwords  # Import Stopwords
from gensim.models import Word2Vec  # Import Word2Vec class from gensim.model

## 2. Web Scraping

In [12]:
# Ask for the page to scrape; whitespace around a pasted URL is dropped
url = input("Please input the URL you want to scrape: ").strip()

# Download the raw page. The context manager closes the HTTP response as soon
# as the body has been read, even if read() raises.
with urllib.request.urlopen(url) as web_scraping:
    content = web_scraping.read()

# Parse the downloaded HTML with BeautifulSoup using the lxml parser
parsing = bs.BeautifulSoup(content, 'lxml')

# Collect every <p> (paragraph) element of the parsed document
paragraphs = parsing.find_all('p')

# Concatenate the text of all paragraphs into one string (no separator is
# inserted between paragraphs); join avoids re-copying the string per paragraph
article_text = "".join(p.text for p in paragraphs)


Please input the URL you want to scrape: https://en.wikipedia.org/wiki/Formula_One


In [13]:
# Print the article text
# print("This is the article text:\n")
# article_text

## 3. Removing Punctuation and Stopwords

In [14]:
# Class for raw text
class RawText(object):
    """Turn raw text into a list of words without punctuation or English stopwords.

    Uses the module-level ``string`` and ``stopwords`` imports. The stopword
    list is looked up each time ``token_words`` is called, not at construction.
    """

    def __init__(self):
        pass

    # Private function to remove punctuation from the text
    def __remove_punctuation(self, text):
        """Return ``text`` with every character of ``string.punctuation`` deleted."""
        # A deletion-only translation table removes the characters in one pass
        return text.translate(str.maketrans('', '', string.punctuation))

    # Private function to remove stopwords from the text
    def __remove_stopwords(self, text):
        """Split ``text`` on whitespace and drop English stopwords.

        A word is dropped when its lowercase form is a stopword; the words
        that are kept retain their original casing.

        Returns:
            list[str]: the remaining words, in their original order.
        """
        # Load the stopword list once per call (not once per word) and keep it
        # in a set so each membership test is a constant-time lookup
        english_stopwords = set(stopwords.words('english'))
        return [word for word in text.split() if word.lower() not in english_stopwords]

    # Public function to remove punctuation and stopwords from the text and return a list of remaining words
    def token_words(self, text=''):
        """Strip punctuation from ``text``, then remove English stopwords.

        Args:
            text (str): raw text to process; defaults to the empty string.

        Returns:
            list[str]: the remaining words (an empty list for empty input).
        """
        message = self.__remove_punctuation(text)
        return self.__remove_stopwords(message)


In [15]:
# Print the words after we removed the stopwords and punctuation
# print("The words after we removed the stopwords and punctuation:\n")
# words

In [16]:
# Download the "stopwords" dataset from nltk; a falsy result means the download failed
flag = nltk.download("stopwords")

if not flag:
    # Stop here: continuing would leave `words` unassigned, and the
    # model-training cell would then fail with an unrelated NameError.
    raise RuntimeError("Download Failed!")

print("Download Success!")

# Create an object from the RawText class
helper = RawText()

# Use the object to process the article text
words = helper.token_words(text=article_text)

# `words` is now a list of processed words (without punctuation and stopwords).

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Download Success!


## 4. Model Training

In [17]:
# Creating a Word2Vec model with customized parameters

# gensim's optimized training routines only use the first 10,000 tokens of a
# sentence and ignore the rest, so a long article passed as one giant sentence
# would be partly dropped. Split the token list into chunks below that limit;
# for articles of at most 10,000 tokens this is exactly [words], as before.
MAX_SENTENCE_TOKENS = 10_000
sentences = [
    words[start:start + MAX_SENTENCE_TOKENS]
    for start in range(0, len(words), MAX_SENTENCE_TOKENS)
]

# model = Word2Vec([words], min_count=1)
model = Word2Vec(sentences, vector_size=100, window=10, min_count=10, workers=4)

# vector_size=100: The size of the word representation vectors is 100
# window=10: Maximum distance (in words) between the current word and a
#            context word within a sentence; it does not limit the number
#            of results returned by most_similar
# min_count=10: Words with a frequency of at least 10 occurrences will be included in the model
# workers=4: Using 4 worker threads for model training

In [18]:
# Vocabulary of the trained Word2Vec model as a plain list of words
# (iterating the key_to_index mapping yields its keys)
vocabulary = [*model.wv.key_to_index]

## 5. Output

In [19]:
# Ask which word to look up; surrounding whitespace is ignored
query_word = input("What word do you want to find the similarity?: ").strip()

# The model only knows words that occurred at least `min_count` times in the
# article, and the lookup is case-sensitive. Check membership first so an
# unknown word produces a readable message instead of a KeyError traceback.
if query_word in model.wv.key_to_index:
    # List of (word, similarity score) pairs for the most similar words
    similar_words = model.wv.most_similar(query_word)
else:
    similar_words = []
    print(f"'{query_word}' is not in the model vocabulary; try another word (matching is case-sensitive).")

# Converting similarity scores from the [-1, 1] range to a percentage scale (0-100)
similar_words_percentage = [(word, (score + 1) * 50) for word, score in similar_words]

# Displaying the list of similar and related words for the requested word
for word, similarity_percentage in similar_words_percentage:
    print(f"'{word}': {similarity_percentage:.2f}% Similarity")


What word do you want to find the similarity?: Formula
'drivers': 99.95% Similarity
'One': 99.95% Similarity
'World': 99.95% Similarity
'team': 99.95% Similarity
'race': 99.94% Similarity
'championship': 99.94% Similarity
'cars': 99.94% Similarity
'races': 99.93% Similarity
'season': 99.93% Similarity
'car': 99.93% Similarity
