# **Semantic Search Analysis**

In [59]:
import pandas as pd
import re
import nltk
import spacy
import pandas as pd
import joblib
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are reported as up-to-date in the log output.
# presumably backs nltk.word_tokenize (used by tokenize) — confirm for your NLTK version
nltk.download('punkt')
# NOTE(review): no code visible in this notebook references the `names` corpus
nltk.download('names')
# stop-word lists read by remove_stopwords
nltk.download('stopwords')
# presumably backs nltk.pos_tag (used by pos_tagging) — confirm for your NLTK version
nltk.download('averaged_perceptron_tagger')
# presumably backs WordNetLemmatizer (used by Lemmatization)
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is the module-level
# object that named_entity_recognition calls on each text.
nlp = spacy.load('en_core_web_sm')

In [41]:
# Read the corpus and expose the CSV's 'Unnamed: 0' column under the name `id`
# (query_system reports it as the document ID).
df = pd.read_csv('text.csv').rename(columns={'Unnamed: 0': 'id'})
df.head()

Unnamed: 0,id,text,label
0,0,i just feel really helpless and heavy hearted,4.0
1,1,ive enjoyed being able to slouch about relax a...,0.0
2,2,i gave up my internship with the dmrg and am f...,4.0
3,3,i dont know i feel so lost,0.0
4,4,i am a kindergarten teacher and i am thoroughl...,4.0


In [18]:
def tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return them."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [20]:
def normalize(text):
    """Lower-case ``text`` and strip digits and one/two-character words.

    Removed spans are replaced with the empty string, so surrounding
    whitespace is left in place (the result may contain runs of spaces).

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    # Digits must go first: stripping them can shrink a token to one or two
    # characters (e.g. "b12" -> "b", "mp3" -> "mp"), and those leftovers
    # should then be caught by the short-word filter below.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    return text

In [22]:
# Cache of stop-word sets keyed by language, filled on first use.
_STOPWORD_SETS = {}


def remove_stopwords(words, stop_words=None):
    """Return the tokens of ``words`` that are not stop words, in order.

    Args:
        words: Iterable of tokens to filter.
        stop_words: Optional container of tokens to drop. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A list with the surviving tokens. Matching is exact, so tokens are
        only dropped if they appear in ``stop_words`` with the same casing.
    """
    if stop_words is None:
        # Build the English set once and reuse it: this function is applied
        # to every row of the corpus, and rebuilding the set per call repeats
        # the same corpus lookup each time.
        if 'english' not in _STOPWORD_SETS:
            _STOPWORD_SETS['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_SETS['english']
    return [word for word in words if word not in stop_words]

In [23]:
def Stemming(words):
    """Return the Porter stem of each token in ``words``, preserving order."""
    porter = PorterStemmer()
    stems = []
    for token in words:
        stems.append(porter.stem(token))
    return stems

In [24]:
def Lemmatization(words):
    """Return the WordNet lemma of each token in ``words``, preserving order."""
    wordnet_lemmatizer = WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, words))

In [25]:
def pos_tagging(words):
    """Return the result of ``nltk.pos_tag`` applied to the tokens in ``words``."""
    tagged = nltk.pos_tag(words)
    return tagged

In [37]:
def preprocess_text(text):
    """Run the full cleaning pipeline on ``text`` and return one string.

    Stages, in order: normalize -> tokenize -> remove_stopwords -> Stemming
    -> Lemmatization. The surviving tokens are joined with single spaces.
    """
    tokens = tokenize(normalize(text))
    # Each remaining stage maps a token list to a token list.
    for stage in (remove_stopwords, Stemming, Lemmatization):
        tokens = stage(tokens)
    return ' '.join(tokens)

# Add the cleaned-text column; the TF-IDF vectorizer is later fitted on it.
df['processed_text'] = df['text'].apply(preprocess_text)

In [48]:
def named_entity_recognition(text):
    """Return a list of ``(entity_text, entity_label)`` pairs found in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    pairs are read from the resulting document's ``ents``.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

In [31]:
# Preview the first rows of the frame.
# NOTE(review): the saved output of this cell shows an 'Unnamed: 0' column and
# no 'id' column, so it appears to predate the rename in the loading cell —
# re-run the notebook top to bottom to refresh it.
df.head()

Unnamed: 0.1,Unnamed: 0,text,label,processed_text
0,0,i just feel really helpless and heavy hearted,4.0,feel realli helpless heavi heart
1,1,ive enjoyed being able to slouch about relax a...,0.0,ive enjoy abl slouch relax unwind frankli need...
2,2,i gave up my internship with the dmrg and am f...,4.0,gave internship dmrg feel distraught
3,3,i dont know i feel so lost,0.0,dont know feel lost
4,4,i am a kindergarten teacher and i am thoroughl...,4.0,kindergarten teacher thoroughli weari job take...


In [38]:
# Fit TF-IDF on the preprocessed corpus. query_system looks documents up with
# df.iloc[i] using positions taken from this matrix, so `tfidf_matrix` rows
# must stay aligned with the row order of `df`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])

In [43]:
def query_system(query, vectorizer, tfidf_matrix, df, top_k=5):
    """Return the ``top_k`` documents most similar to ``query``.

    The query is cleaned with ``preprocess_text``, vectorized with
    ``vectorizer``, and compared against every row of ``tfidf_matrix`` by
    cosine similarity.

    Args:
        query: Raw query string.
        vectorizer: Fitted vectorizer exposing ``transform``.
        tfidf_matrix: Document matrix whose row order matches ``df``.
        df: Frame with ``id`` and ``text`` columns, indexed by position.
        top_k: Maximum number of results to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of ``(id, text, similarity)`` tuples ordered from most to
        least similar. It holds fewer than ``top_k`` entries when the corpus
        is smaller than ``top_k``.
    """
    # ``[-0:]`` slices the whole array, so without this guard ``top_k=0``
    # would return every document instead of none; negative values would
    # likewise select an arbitrary tail of the ranking.
    if top_k <= 0:
        return []

    preprocessed_query = preprocess_text(query)
    query_vector = vectorizer.transform([preprocessed_query])
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending: keep the last ``top_k`` positions, then reverse
    # them so the best match comes first.
    top_indices = similarities.argsort()[-top_k:][::-1]

    results = []
    for i in top_indices:
        row = df.iloc[i]  # fetch the row once rather than once per column
        results.append((row['id'], row['text'], similarities[i]))
    return results

In [50]:
def extract_named_entities(text):
    """Print one ``Entity: ..., Label: ...`` line per named entity in ``text``.

    Entities come from ``named_entity_recognition``; nothing is returned.
    """
    for pair in named_entity_recognition(text):
        entity, label = pair
        print(f"Entity: {entity}, Label: {label}")

In [61]:
def user_query(query):
    """Run ``query`` against the corpus, print a report, and return the texts.

    Uses the module-level ``vectorizer``, ``tfidf_matrix`` and ``df`` via
    ``query_system``. For every hit it prints the rank, document ID, text,
    similarity score (four decimals) and the named entities found in the text.

    Returns:
        The matched document texts, best match first.
    """
    hits = query_system(query, vectorizer, tfidf_matrix, df)

    for rank, (doc_id, body, score) in enumerate(hits, start=1):
        print(f"Result {rank}:")
        print(f"Document ID: {doc_id}")
        print(f"Text: {body}\n")
        print(f"Similarity Score: {score:.4f}\n")
        print("Named Entities:")
        extract_named_entities(body)
        print("\n")

    return [body for _doc_id, body, _score in hits]

In [62]:
# Example query. user_query prints a report for each hit, and the cell's
# displayed value is the list of matched texts it returns.
query = "I feel lost and helpless"
user_query(query)

Result 1:
Document ID: 29014
Text: i hate it when i feel so lost and helpless

Similarity Score: 0.8575

Named Entities:


Result 2:
Document ID: 39095
Text: i feel so helpless and lost that i dont even know what to do with myself

Similarity Score: 0.7541

Named Entities:


Result 3:
Document ID: 15586
Text: im feeling helpless

Similarity Score: 0.7245

Named Entities:


Result 4:
Document ID: 35920
Text: i feel so helpless now

Similarity Score: 0.7245

Named Entities:


Result 5:
Document ID: 17460
Text: i am feeling helpless

Similarity Score: 0.7245

Named Entities:




['i hate it when i feel so lost and helpless',
 'i feel so helpless and lost that i dont even know what to do with myself',
 'im feeling helpless',
 'i feel so helpless now',
 'i am feeling helpless']

In [60]:
# Persist the fitted vectorizer and the processed frame to the working directory.
# NOTE(review): `tfidf_matrix` is not saved here, so a consumer would presumably
# need to rebuild it from `processed_text` with the loaded vectorizer — confirm.
# NOTE: joblib and pickle files can execute code on load; only load them from
# trusted sources.
joblib.dump(vectorizer, 'vectorizer.joblib')
df.to_pickle('processed_data.pkl')