In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
import re
import gensim
from gensim import corpora
from operator import itemgetter

In [2]:
pip install gensim
pip install spacy



In [3]:
# Read the dog-breed CSV file into a DataFrame and print it.
# NOTE(review): "/content/dogs.csv" is an absolute path - presumably a Colab upload location; confirm before running elsewhere.

df= pd.read_csv("/content/dogs.csv")
print(df)

                      Breed Name  \
0                         Afador   
1                    Affenhuahua   
2                  Affenpinscher   
3                   Afghan Hound   
4               Airedale Terrier   
..                           ...   
386  Wirehaired Pointing Griffon   
387               Xoloitzcuintli   
388               Yakutian Laika   
389                     Yorkipoo   
390            Yorkshire Terrier   

                                           Description    Dog Size  \
0    Afadors, a mix of Afghan Hound and Labrador Re...  Very Large   
1    Combining the spunky personality of Affenpinsc...       Small   
2    Affectionately dubbed the "monkey terrier," Af...       Small   
3    Originating from Afghanistan, Afghan Hounds ar...  Very Large   
4    Airedales, the largest of the terrier breeds, ...  Very Large   
..                                                 ...         ...   
386  Wirehaired Pointing Griffons are versatile hun...  Very Large   
387  Xo

In [4]:
# English spaCy pipeline shared by the tokenizer functions in this notebook
spacy_nlp = spacy.load('en_core_web_sm')

# Lookup sets used when filtering tokens: spaCy's English stop words and
# the characters of string.punctuation
stop_words = STOP_WORDS
punctuations = set(string.punctuation)


In [5]:
# re-sub: It replaces occurrences of a pattern in a string with another string.

def spacy_tokenizer(sentence):
    """Clean a raw text string and return its lemmatized, lower-cased tokens.

    The text is scrubbed with a series of regular expressions, parsed with the
    module-level ``spacy_nlp`` pipeline, lemmatized, and finally filtered
    against the module-level ``stop_words`` and ``punctuations`` sets.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased lemmas that are longer than two characters and are neither
        stop words nor punctuation.
    """
    # remove unwanted lines starting with :'' -- this has to happen BEFORE the
    # single quotes are stripped, otherwise the '' in these patterns can never match
    sentence = re.sub(r'\n: \'\'.*', '', sentence)
    sentence = re.sub(r'^:\'\'.*', '', sentence)

    # remove distracting single quotes
    sentence = re.sub(r'\'', '', sentence)

    # remove digits and words containing digits
    # (raw string: '\w' / '\d' are invalid escapes in a normal string literal)
    sentence = re.sub(r'\w*\d\w*', '', sentence)

    # replace extra spaces with single space
    sentence = re.sub(r' +', ' ', sentence)

    # remove unwanted lines starting with '!'
    sentence = re.sub(r'\n!.*', '', sentence)

    # replace remaining new line characters with a space
    sentence = re.sub(r'\n', ' ', sentence)

    # replace punctuation (anything that is not a word character or whitespace) with a space
    sentence = re.sub(r'[^\w\s]', ' ', sentence)

    # creating token object
    tokens = spacy_nlp(sentence)

    # lower, strip and lemmatize
    # NOTE(review): "-PRON-" looks like the pronoun placeholder lemma of older spaCy
    # releases -- confirm whether the installed version still emits it.
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]

    # remove stopwords and punctuation, and keep only words longer than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]

    return tokens


In [6]:
# (Re)load the small English spaCy pipeline used by the tokenizer below
spacy_nlp = spacy.load("en_core_web_sm")

# Tokenizer used from this cell onwards
def spacy_tokenizer(Description):
    """Return the texts of the spaCy tokens in ``Description``, skipping stop words and punctuation.

    Defining this function rebinds the name ``spacy_tokenizer``, so any
    earlier definition with the same name is no longer used afterwards.
    """
    kept = []
    for token in spacy_nlp(Description):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return kept

# Replace missing values in the 'Description' column with an empty string so every row can be tokenized
df['Description'] = df['Description'].fillna('')

# Apply the tokenization function to the 'Description' column and store the result in a new column 'tokenized_Description'
df['tokenized_Description'] = df['Description'].apply(spacy_tokenizer)

# One token list per row of df; this Series is what the dictionary and the corpus are built from
description_plot = df['tokenized_Description']

# Display the first rows of the DataFrame with the new tokenized column
print(df.head())



         Breed Name                                        Description  \
0            Afador  Afadors, a mix of Afghan Hound and Labrador Re...   
1       Affenhuahua  Combining the spunky personality of Affenpinsc...   
2     Affenpinscher  Affectionately dubbed the "monkey terrier," Af...   
3      Afghan Hound  Originating from Afghanistan, Afghan Hounds ar...   
4  Airedale Terrier  Airedales, the largest of the terrier breeds, ...   

     Dog Size   Dog Breed Group                                Height  \
0  Very Large  Mixed Breed Dogs                       20 to 29 inches   
1       Small  Mixed Breed Dogs                        6 to 12 inches   
2       Small    Companion Dogs   9 to 11 inches tall at the shoulder   
3  Very Large        Hound Dogs  24 to 26 inches tall at the shoulder   
4  Very Large      Terrier Dogs  21 to 23 inches tall at the shoulder   

   Avg. Height, cm           Weight  Avg. Weight, kg       Life Span  \
0            62.23  50 to 75 pounds         

# In gensim's `Dictionary` class, the `token2id` attribute is a Python dictionary that maps tokens (words) to their unique integer IDs.

In [7]:


%time dictionary = corpora.Dictionary(description_plot)


#list of few which which can be further removed
stoplist = set('hello and if can would should could tell ask stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

CPU times: user 20.3 ms, sys: 23 µs, total: 20.4 ms
Wall time: 29.5 ms


#feature extraction

# The dictionary's `doc2bow` method iterates through all the words in the text: if the word already exists in the corpus, it increments the frequency count; otherwise it inserts the word into the corpus and sets its frequency count to 1.

In simple terms, it's like counting unique words in a text document. If a word is encountered for the first time, it's added to the list of known words with a frequency count of 1. If the word has been seen before, its frequency count is increased by 1. This process helps in creating a bag-of-words representation of the text data.

In [8]:
# Bag-of-words representation of every tokenized description
corpus = [dictionary.doc2bow(tokens) for tokens in description_plot]

# (token, frequency) pairs of the first three documents, with ids mapped back to tokens
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[0:3]
]

Building Tf-idf model

Tf-Idf stands for Term Frequency–Inverse Document Frequency. It is a commonly used NLP model that helps you determine the most important words in each document in the corpus. Once the Tf-Idf model is built, pass it to the LSI model and specify the number of features to build.

In [9]:
# Fit the TF-IDF model on the bag-of-words corpus, then fit an LSI model
# (num_topics=300) on the TF-IDF-weighted corpus.
dog_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
dog_lsi_model = gensim.models.LsiModel(dog_tfidf_model[corpus], id2word=dictionary, num_topics=300)

# Serialize and store the transformed corpora locally for easy retrieval whenever required.
# Despite the file names, what is written are the TF-IDF / LSI transformed corpora, not the models themselves.
gensim.corpora.MmCorpus.serialize('dog_tfidf_model_mm', dog_tfidf_model[corpus])
gensim.corpora.MmCorpus.serialize('dog_lsi_model_mm', dog_lsi_model[dog_tfidf_model[corpus]])

# Load the serialized corpora back from disk
dog_tfidf_corpus = gensim.corpora.MmCorpus('dog_tfidf_model_mm')
dog_lsi_corpus = gensim.corpora.MmCorpus('dog_lsi_model_mm')

# Build the similarity index over the LSI corpus; the feature count is taken from the corpus' num_terms
from gensim.similarities import MatrixSimilarity
dog_index = MatrixSimilarity(dog_lsi_corpus, num_features=dog_lsi_corpus.num_terms)


 Semantic Search

We will input a search query and the model will return relevant dog breeds with a “Relevance %”, which is the similarity score. The higher the similarity score, the more similar the query is to the document at the given index.

In [10]:
def search_similar_dogs(search_term, num_results=5):
    """Return the dog breeds whose descriptions are most similar to a query.

    The query is tokenized with ``spacy_tokenizer``, converted to a
    bag-of-words vector with ``dictionary``, projected through
    ``dog_tfidf_model`` and ``dog_lsi_model`` and then looked up in
    ``dog_index``.

    Parameters
    ----------
    search_term : str
        Free-text search query.
    num_results : int, optional
        Maximum number of breeds to return (default 5, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``Relevance`` (similarity score * 100, rounded to two
        decimals), ``Breed Name`` and ``description_name``, ordered from the
        most to the least relevant match.

    Notes
    -----
    As a side effect, ``dog_index.num_best`` is set to ``num_results``.
    """
    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = dog_tfidf_model[query_bow]
    query_lsi = dog_lsi_model[query_tfidf]

    # Ask the index for at most `num_results` (document position, score) pairs
    dog_index.num_best = num_results
    matches = dog_index[query_lsi]

    # Highest score first; the slice guarantees the cap without a manual break
    matches = sorted(matches, key=itemgetter(1), reverse=True)[:num_results]

    # The corpus was built from df's rows in order, so a document position is a
    # row position: use positional .iloc instead of chained label indexing.
    dog_names = [
        {
            'Relevance': round((score * 100), 2),
            'Breed Name': df['Breed Name'].iloc[doc_position],
            'description_name': df['Description'].iloc[doc_position],
        }
        for doc_position, score in matches
    ]

    return pd.DataFrame(dog_names, columns=['Relevance','Breed Name','description_name'])

In [11]:
# Example query: the cell output is the DataFrame of best matches returned by search_similar_dogs
search_similar_dogs('small dogs')

Unnamed: 0,Relevance,Breed Name,description_name
0,32.11,Border Terrier,Border Terriers are small but sturdy dogs with...
1,27.47,Fox Terrier,Fox Terriers are small but energetic dogs know...
2,24.92,German Spitz,German Spitzs are small and lively dogs known ...
3,24.83,Havanese,Havanese are small and charming dogs known for...
4,24.32,Japanese Chin,Japanese Chins are small and elegant dogs know...


In [12]:
# Second example query, this time with several descriptive words
search_similar_dogs('black and tall dogs')

Unnamed: 0,Relevance,Breed Name,description_name
0,60.49,Schipperke,Schipperkes are small and spirited dogs known ...
1,53.32,Manchester Terrier,Manchester Terriers are small and sleek dogs k...
2,50.9,Black Russian Terrier,Black Russian Terriers are large and powerful ...
3,48.38,Gordon Setter,Gordon Setters are elegant and athletic dogs k...
4,40.27,Black and Tan Coonhound,Black and Tan Coonhounds are skilled scent hou...


In [13]:
!pip install fastapi nest-asyncio pyngrok uvicorn

Collecting fastapi
  Downloading fastapi-0.110.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Collecting uvicorn
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.38.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.37.2-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyngrok, h11, uvicorn, starlette, fastapi
Successfully installed fastapi-0.11

In [14]:
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

# ASGI application object that the route handlers are registered on
app = FastAPI()

# Permissive CORS policy: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# open -- confirm this is intended for anything beyond a demo.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=['*'],
    allow_methods=['*'],
    allow_headers=['*'],
)

# Canonical route: /search-query/<query>
@app.get('/search-query/{searchQuery}')
# Legacy route kept for backward compatibility: the stray "$" (a JavaScript
# template-literal leftover) made a literal "$" mandatory in the URL. It is the
# bottom decorator, so it is registered first and still wins for "/search-query/$<query>".
@app.get('/search-query/${searchQuery}', include_in_schema=False)
async def root(searchQuery: str):
    """Run a semantic search for ``searchQuery`` and return the matches as JSON.

    Parameters
    ----------
    searchQuery : str
        Free-text query taken from the URL path.

    Returns
    -------
    dict
        ``{'Result': {column: {row index: value}}}`` built from the DataFrame
        returned by ``search_similar_dogs``.
    """
    results = search_similar_dogs(searchQuery)

    # Convert the DataFrame explicitly into plain dicts instead of relying on
    # the framework to serialize an arbitrary object.
    return {'Result': results.to_dict()}

In [15]:
!ngrok config add-authtoken 2evoubpghhAZMj6XYIUC7Dg01kM_55yLZqsoynxJhRnrVvLzb


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Single source of truth for the port shared by the ngrok tunnel and the uvicorn server
API_PORT = 8000

# Open the public tunnel to the local port and show its URL
ngrok_tunnel = ngrok.connect(API_PORT)
print('Public URL:', ngrok_tunnel.public_url)

# presumably required so uvicorn can run inside the notebook's already-running event loop -- TODO confirm
nest_asyncio.apply()

# Serve the FastAPI app; this call blocks the cell while the server is running
uvicorn.run(app, port=API_PORT)

INFO:     Started server process [323]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://36e0-34-121-234-8.ngrok-free.app
