# Document Creation for NLP Tasks
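This notebook loads the Google reviews data, trains an LDA topic model on it, and scores how topically similar a review is to the business it describes. It starts by creating two distinct documents for each review: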
1. **Review Document:** The raw text of the customer review.
2. **Business Document:** A concatenation of the business name, category, and description.

In [1]:
import pandas as pd

## 1. Load the data

In [3]:
# Sampled Google reviews; the path is resolved relative to the notebook's working directory.
reviews_csv_path = '../data_gpt_labeler/final_data_sampled.csv'
df = pd.read_csv(reviews_csv_path)

In [5]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,rating,text,business_name,business_category,business_description,_id
0,848717,5,great for sunset,'Ehukai Beach Park,"['Park', 'Public beach', 'Tourist attraction']",Popular surfing beach offering massive wintert...,1.1426748616291053e+20_1589655333572
1,848703,5,The main access point to popular surfing break...,'Ehukai Beach Park,"['Park', 'Public beach', 'Tourist attraction']",Popular surfing beach offering massive wintert...,1.1447491719186307e+20_1591852064598
2,848713,5,Great place to set up your camping chairs and ...,'Ehukai Beach Park,"['Park', 'Public beach', 'Tourist attraction']",Popular surfing beach offering massive wintert...,1.0893375045945795e+20_1463461472578
3,848706,5,My favorite Beach for surfing on Oahu North Sh...,'Ehukai Beach Park,"['Park', 'Public beach', 'Tourist attraction']",Popular surfing beach offering massive wintert...,1.1249899958787118e+20_1570685676722
4,848694,5,Excellent beach for family activities great su...,'Ehukai Beach Park,"['Park', 'Public beach', 'Tourist attraction']",Popular surfing beach offering massive wintert...,1.1730942640485394e+20_1605375558437


## 2. Create Review and Business Documents

In [8]:
# Missing reviews must become empty strings: astype(str) on its own would turn
# NaN into the literal text 'nan', which would later be tokenized as a real word.
df['review_document'] = df['text'].fillna('').astype(str)

# Business document = name + category + description, separated by single spaces.
# Each part is null-filled and cast to str so that a missing or non-string value
# cannot break the concatenation.
name_part = df['business_name'].fillna('').astype(str)
category_part = df['business_category'].fillna('').astype(str)
description_part = df['business_description'].fillna('').astype(str)
df['business_document'] = name_part + ' ' + category_part + ' ' + description_part

df[['review_document', 'business_document']].head()

Unnamed: 0,review_document,business_document
0,great for sunset,"'Ehukai Beach Park ['Park', 'Public beach', 'T..."
1,The main access point to popular surfing break...,"'Ehukai Beach Park ['Park', 'Public beach', 'T..."
2,Great place to set up your camping chairs and ...,"'Ehukai Beach Park ['Park', 'Public beach', 'T..."
3,My favorite Beach for surfing on Oahu North Sh...,"'Ehukai Beach Park ['Park', 'Public beach', 'T..."
4,Excellent beach for family activities great su...,"'Ehukai Beach Park ['Park', 'Public beach', 'T..."


## 3. Display a single example

In [11]:
# Show both documents of the first row, one item per line.
print(
    'Review Document:',
    df['review_document'].iloc[0],
    '---',
    'Business Document:',
    df['business_document'].iloc[0],
    sep='\n',
)

Review Document:
great for sunset
---
Business Document:
'Ehukai Beach Park ['Park', 'Public beach', 'Tourist attraction'] Popular surfing beach offering massive wintertime waves, with calmer waters in the summer.


## 4. Create the Corpus

In [14]:
# Interleave the documents row by row:
# [review_0, business_0, review_1, business_1, ...].
corpus = [
    document
    for review_business_pair in zip(df['review_document'], df['business_document'])
    for document in review_business_pair
]

print(f'Total number of documents in the corpus: {len(corpus)}')
print('---')
print('First 4 documents in the corpus:')
print(*corpus[:4], sep='\n')
print('---')

Total number of documents in the corpus: 20000
---
First 4 documents in the corpus:
great for sunset
'Ehukai Beach Park ['Park', 'Public beach', 'Tourist attraction'] Popular surfing beach offering massive wintertime waves, with calmer waters in the summer.
The main access point to popular surfing breaks such as Pipeline.
'Ehukai Beach Park ['Park', 'Public beach', 'Tourist attraction'] Popular surfing beach offering massive wintertime waves, with calmer waters in the summer.
---


## 5. Preprocess Corpus for LDA

In [17]:
import nltk 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

# Tokenizer models. Recent NLTK releases make word_tokenize load 'punkt_tab'
# instead of 'punkt', so fetch both to keep the notebook runnable on old and
# new NLTK versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words as a set for O(1) membership tests during preprocessing.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    """Lowercase, tokenize and lemmatize ``text``.

    Tokens that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped; the remaining tokens are lemmatized with
    the module-level ``lemmatizer``.

    Args:
        text: The raw document string.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    kept_lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers, mixed tokens and stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return kept_lemmas
# Tokenize every document in the corpus, preserving corpus order.
processed_corpus = list(map(preprocess_text, corpus))

[nltk_data] Downloading package punkt to /Users/yumin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/yumin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yumin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 6. Create Dictionary and Corpus for LDA

In [20]:
from gensim import corpora
# Build the gensim dictionary from the tokenized documents.
dictionary = corpora.Dictionary(processed_corpus)
# Convert each tokenized document to its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

## 7. Train the LDA Model

In [27]:
from gensim.models import LdaModel
# Fix the seed: LDA training is stochastic, so without random_state the topics
# (and every similarity score derived from them) change on each run.
lda_model = LdaModel(
    bow_corpus,
    num_topics=100,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# -1 asks print_topics for every topic rather than a subset.
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} Words: {topic}')

Topic: 0 Words: 0.213*"getting" + 0.117*"job" + 0.083*"bill" + 0.052*"beauty" + 0.052*"crew" + 0.047*"traffic" + 0.036*"e" + 0.028*"blowing" + 0.017*"mine" + 0.016*"celebrating"
Topic: 1 Words: 0.252*"small" + 0.104*"stand" + 0.070*"stadium" + 0.069*"meet" + 0.061*"charge" + 0.059*"swap" + 0.055*"biggest" + 0.054*"aloha" + 0.053*"state" + 0.052*"admission"
Topic: 2 Words: 0.350*"ice" + 0.217*"cream" + 0.084*"enjoy" + 0.051*"sand" + 0.049*"frozen" + 0.028*"sweet" + 0.022*"trying" + 0.017*"swim" + 0.016*"horrible" + 0.016*"ambience"
Topic: 3 Words: 0.165*"pretty" + 0.143*"fried" + 0.129*"much" + 0.077*"cook" + 0.058*"server" + 0.051*"eating" + 0.049*"trip" + 0.038*"another" + 0.027*"run" + 0.023*"nearby"
Topic: 4 Words: 0.245*"relaxed" + 0.241*"waikiki" + 0.137*"contemporary" + 0.111*"naval" + 0.094*"prepared" + 0.022*"airy" + 0.019*"produce" + 0.012*"dirty" + 0.012*"cause" + 0.011*"smell"
Topic: 5 Words: 0.197*"trail" + 0.151*"area" + 0.104*"way" + 0.096*"fall" + 0.086*"clean" + 0.081*"

## 8. Visualize the Topics

In [29]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
vis

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


## 9. Compute Similarity Score

In [31]:
import numpy as np 
from scipy.spatial.distance import cosine
def get_lda_vector(text):
    """Return the dense LDA topic vector for a raw text string.

    The text is passed through ``preprocess_text``, converted to bag-of-words
    with the module-level ``dictionary`` and scored by the module-level
    ``lda_model``.

    Args:
        text: The raw document string.

    Returns:
        A NumPy array of length ``lda_model.num_topics`` whose i-th entry is the
        probability reported for topic i (0 for any topic not reported).
    """
    tokens = preprocess_text(text)
    bow = dictionary.doc2bow(tokens)
    # A threshold of 0.0 keeps low-probability topics in the result.
    topic_probabilities = lda_model.get_document_topics(bow, minimum_probability=0.0)

    topic_vector = np.zeros(lda_model.num_topics)
    for topic_id, probability in topic_probabilities:
        topic_vector[topic_id] = probability
    return topic_vector
    
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (any 1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``1 - cosine_distance(vec1, vec2)``. If either vector is all zeros the
        angle is undefined, so 0.0 is returned instead of NaN.
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    # A zero-norm vector would cause a division by zero inside scipy's cosine().
    if not vec1.any() or not vec2.any():
        return 0.0
    return 1 - cosine(vec1, vec2)
    
# Example with the first review: score the first row's review against its business.
first_row = df.iloc[0]
sample_review = first_row['review_document']
sample_business = first_row['business_document']

review_vector, business_vector = (
    get_lda_vector(sample_review),
    get_lda_vector(sample_business),
)
similarity_score = calculate_cosine_similarity(review_vector, business_vector)

print(f'Review: {sample_review}')
print(f'Business: {sample_business}')
print(f'Similarity Score (Trustworthiness): {similarity_score:.4f}')

Review: great for sunset
Business: 'Ehukai Beach Park ['Park', 'Public beach', 'Tourist attraction'] Popular surfing beach offering massive wintertime waves, with calmer waters in the summer.
Similarity Score (Trustworthiness): 0.0122
