## Bigrams and trigrams using gensim Phrases models

In [1]:
import os
from pprint import pprint

import gensim
from gensim import corpora

In [2]:
# Toy corpus used to show how a dictionary is built from a list of sentences.
# Each string below is treated as one document.
documents = [
    "The Saudi Arabia are preparing a report that will acknowledge that",
    "Saudi Arabia journalist Jamal Khashoggi's death was the result of an",
    "interrogation that went wrong, one that was intended to lead",
    "to his abduction from Turkey, according to two sources in Saudi Arabia.",
]

# Second batch of sentences. Note that `documents_2` is reassigned in a later
# cell before anything reads this value.
documents_2 = [
    "One source says the report will likely conclude that",
    "the operation was carried out without clearance and",
    "transparency and that those involved will be held",
    "responsible. One of the sources acknowledged that the",
    "report is still being prepared and cautioned that",
    "things could change.",
]

In [3]:
# Whitespace-tokenize every document into its list of words
texts = [doc.split() for doc in documents]
print(texts)

[['The', 'Saudi', 'Arabia', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that'], ['Saudi', 'Arabia', 'journalist', 'Jamal', "Khashoggi's", 'death', 'was', 'the', 'result', 'of', 'an'], ['interrogation', 'that', 'went', 'wrong,', 'one', 'that', 'was', 'intended', 'to', 'lead'], ['to', 'his', 'abduction', 'from', 'Turkey,', 'according', 'to', 'two', 'sources', 'in', 'Saudi', 'Arabia.']]


In [4]:
# Create dictionary
dictionary = corpora.Dictionary(texts)

In [5]:
# Get information about the dictionary
print(dictionary)

Dictionary(35 unique tokens: ['Arabia', 'Saudi', 'The', 'a', 'acknowledge']...)


In [6]:
# Show the word to id map
print(dictionary.token2id)

{'Arabia': 0, 'Saudi': 1, 'The': 2, 'a': 3, 'acknowledge': 4, 'are': 5, 'preparing': 6, 'report': 7, 'that': 8, 'will': 9, 'Jamal': 10, "Khashoggi's": 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Arabia.': 26, 'Turkey,': 27, 'abduction': 28, 'according': 29, 'from': 30, 'his': 31, 'in': 32, 'sources': 33, 'two': 34}


If you get new documents in the future, it is also possible to update an existing dictionary to include the new words.

In [7]:
# New documents that arrive after the dictionary was first built
documents_2 = [
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Whitespace-tokenize the new documents
texts_2 = [doc.split() for doc in documents_2]

# Extend the existing dictionary with the tokens from the new documents
dictionary.add_documents(texts_2)

In [8]:
# Show the word to id map
print(dictionary.token2id)

{'Arabia': 0, 'Saudi': 1, 'The': 2, 'a': 3, 'acknowledge': 4, 'are': 5, 'preparing': 6, 'report': 7, 'that': 8, 'will': 9, 'Jamal': 10, "Khashoggi's": 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Arabia.': 26, 'Turkey,': 27, 'abduction': 28, 'according': 29, 'from': 30, 'his': 31, 'in': 32, 'sources': 33, 'two': 34, 'graph': 35, 'intersection': 36, 'paths': 37, 'trees': 38, 'Graph': 39, 'IV': 40, 'Widths': 41, 'and': 42, 'minors': 43, 'ordering': 44, 'quasi': 45, 'well': 46, 'A': 47, 'survey': 48}


In [9]:
# Convert each tokenized document into its bag-of-words form: (token id, count) pairs
corpus = list(map(dictionary.doc2bow, texts))

In [10]:
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1)], [(0, 1), (1, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(8, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)], [(1, 1), (23, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]]


__min_count__: ignore all words and bigrams whose total collected count is lower than this value. The default is 5.

__threshold__ represents a threshold for forming the phrases (higher means fewer phrases). 

A phrase of words a and b is accepted if `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where N is the total vocabulary size. By default, `threshold` is 10.0.

In [21]:
# Train a bigram detector on the tokenized toy corpus (phrases are found from
# collocation counts). min_count=1 and threshold=5 are looser than the defaults
# described above, which suits a corpus of only four sentences.
bigram = gensim.models.phrases.Phrases(sentences=texts, min_count=1, threshold=5)

In [24]:
bigram[texts[0]]

['The',
 'Saudi_Arabia',
 'are',
 'preparing',
 'a',
 'report',
 'that',
 'will',
 'acknowledge',
 'that']

In [26]:
# Construct bigram
print(bigram[texts[0]])

['The', 'Saudi_Arabia', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that']


In [27]:
# Build the trigram model on top of the bigram-transformed corpus.
# min_count is set to 1 to match the bigram model: when it is omitted the model
# uses its default of 5, a count that no pair in this tiny corpus reaches, so
# no trigram could ever be formed.
trigram = gensim.models.phrases.Phrases(bigram[texts], min_count=1, threshold=5)

# Apply bigram detection, then trigram detection, to the first document
print(trigram[bigram[texts[0]]])

['The', 'Saudi_Arabia', 'are', 'preparing', 'a', 'report', 'that', 'will', 'acknowledge', 'that']


In [28]:
#load all libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
import string
from nltk.stem import WordNetLemmatizer 

In [29]:
# Path to the hotel-reviews CSV. Set the HOTEL_REVIEWS_CSV environment variable
# to point at your own copy; otherwise the original local path is used.
location = os.environ.get(
    'HOTEL_REVIEWS_CSV',
    r'D:\AI-DATASETS\02-MISC-large\Datafiniti_Hotel_Reviews.csv',
)

In [30]:
#load reviews data
reviews = pd.read_csv(location)

In [31]:
reviews.head(2)

Unnamed: 0,id,dateAdded,dateUpdated,address,categories,primaryCategories,city,country,keys,latitude,...,reviews.dateSeen,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sourceURLs,websites
0,AVwc252WIN2L1WUfpqLP,2016-10-30T21:42:42Z,2018-09-10T21:06:27Z,5921 Valencia Cir,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,Rancho Santa Fe,US,us/ca/ranchosantafe/5921valenciacir/359754519,32.990959,...,"2016-08-03T00:00:00Z,2016-07-26T00:00:00Z,2016...",5.0,https://www.hotels.com/hotel/125419/reviews%20/,Our experience at Rancho Valencia was absolute...,Best romantic vacation ever!!!!,,,Paula,http://www.hotels.com/ho125419/%25252525253Flo...,http://www.ranchovalencia.com
1,AVwc252WIN2L1WUfpqLP,2016-10-30T21:42:42Z,2018-09-10T21:06:27Z,5921 Valencia Cir,"Hotels,Hotels and motels,Hotel and motel reser...",Accommodation & Food Services,Rancho Santa Fe,US,us/ca/ranchosantafe/5921valenciacir/359754519,32.990959,...,"2016-08-02T00:00:00Z,2016-08-26T00:00:00Z,2016...",5.0,https://www.hotels.com/hotel/125419/reviews%20/,Amazing place. Everyone was extremely warm and...,Sweet sweet serenity,,,D,http://www.hotels.com/ho125419/%25252525253Flo...,http://www.ranchovalencia.com


In [32]:
comments = reviews['reviews.text']

In [33]:
pd.set_option('display.max_colwidth', 120)

In [34]:
comments.head()

0    Our experience at Rancho Valencia was absolutely perfect from beginning to end!!!! We felt special and very happy du...
1    Amazing place. Everyone was extremely warm and welcoming. We've stayed at some top notch places and this is definite...
2    We booked a 3 night stay at Rancho Valencia to play some tennis, since it is one of the highest rated tennis resorts...
3    Currently in bed writing this for the past hr 1/2 there have been dogs barking and squealing call the front desk to ...
4    I live in Md and the Aloft is my Home away from home...we stayed 1 night 7-7-16 ...Staff is great ! Especially Olivi...
Name: reviews.text, dtype: object

In [35]:
# Force every review to a plain string. Missing reviews are replaced with an
# empty string first; otherwise astype('str') would turn NaN into the literal
# token "nan", which would then be counted as a word.
comments = comments.fillna('').astype('str')

In [36]:
# Matches any single punctuation character, carriage return, tab or newline.
# Built once here instead of on every call to clean_comments().
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + '\\r\\t\\n]')


def clean_comments(text):
    """Strip punctuation from a comment, lowercase it and lemmatize each word.

    Parameters
    ----------
    text : object
        The raw comment; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The lowercased, lemmatized tokens joined by single spaces.
    """
    # Replace punctuation and control whitespace with spaces so that the words
    # on either side remain separate tokens.
    nopunct = _PUNCT_RE.sub(" ", str(text))

    # Tokenize: split the sentence into words
    word_list = nltk.word_tokenize(nopunct)

    lemmatizer = WordNetLemmatizer()

    # Lowercase each token BEFORE lemmatizing: WordNet lookups are
    # case-sensitive, so a capitalized plural such as "Rooms" would otherwise
    # be returned unchanged and only lowercased afterwards.
    # lemmatize() is called without a part-of-speech tag, which is why the
    # notebook output shows "was" reduced to "wa".
    return ' '.join(lemmatizer.lemmatize(w.lower()) for w in word_list)

In [37]:
sentence = "The striped bats are hanging on @@ ## their feet for best"
clean_comments(sentence)

'the striped bat are hanging on their foot for best'

In [38]:
%%time
# apply function to clean and lemmatize comments
lemmatized = comments.map(clean_comments)

Wall time: 6.4 s


In [39]:
lemmatized.head(8)

0    our experience at rancho valencia wa absolutely perfect from beginning to end we felt special and very happy during ...
1    amazing place everyone wa extremely warm and welcoming we ve stayed at some top notch place and this is definitely i...
2    we booked a 3 night stay at rancho valencia to play some tennis since it is one of the highest rated tennis resort i...
3    currently in bed writing this for the past hr 1 2 there have been dog barking and squealing call the front desk to a...
4    i live in md and the aloft is my home away from home we stayed 1 night 7 7 16 staff is great especially olivia who w...
5    i stayed here with my family for my daughter wedding it had a very accommodating staff olivia wa excellent the room ...
6    beautiful room and the nicest people working there the front desk lady olivia wa extremely patient and helpful we ha...
7    we stayed here while visiting maryland live cute hotel in a great location clean with a very modern look upgraded ba...


In [40]:
documents = lemmatized.values

In [41]:
# Split each cleaned review on whitespace to get its list of tokens
texts = [doc.split() for doc in documents]

#### Build the bigram models

In [42]:
bigram = gensim.models.phrases.Phrases(texts, min_count=3, threshold=5)

In [43]:
# Construct bigram for document 1
print(bigram[texts[0]])

['our', 'experience', 'at', 'rancho', 'valencia', 'wa', 'absolutely_perfect', 'from_beginning', 'to', 'end', 'we', 'felt', 'special', 'and', 'very', 'happy', 'during_our', 'stayed', 'i_would', 'come_back', 'in', 'a', 'heart_beat']


In [44]:
import operator

# Keep the joined two-word vocabulary entries (their key contains "_") that were
# counted at least min_count times, most frequent first.
# NOTE(review): vocab looks like it holds raw counts for every adjacent pair,
# not only the pairs accepted as phrases - confirm against the gensim docs.
results = sorted(
    (
        (token, count)
        for token, count in bigram.vocab.items()
        if '_' in token and count >= bigram.min_count
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

res_df = pd.DataFrame(results)
res_df.head(10)

Unnamed: 0,0,1
0,the_room,3154
1,in_the,2911
2,the_hotel,2468
3,and_the,2175
4,of_the,2029
5,it_wa,1943
6,room_wa,1848
7,to_the,1759
8,wa_very,1669
9,the_staff,1629


In [45]:
# Rank the joined two-word vocabulary entries by how often they were counted,
# keeping only those seen at least min_count times
results = sorted(
    (
        (token, count)
        for token, count in bigram.vocab.items()
        if '_' in token and count >= bigram.min_count
    ),
    key=operator.itemgetter(1),
    reverse=True,
)

# Tabulate with named columns and show the ten most frequent entries
res_df = pd.DataFrame(results, columns=['Bigram', 'Frequency'])
print(res_df.head(10))

      Bigram  Frequency
0   the_room       3154
1     in_the       2911
2  the_hotel       2468
3    and_the       2175
4     of_the       2029
5      it_wa       1943
6    room_wa       1848
7     to_the       1759
8    wa_very       1669
9  the_staff       1629


#### Build the trigram models

In [52]:
# Bigram model on the tokenized reviews (same settings as in the bigram section)
bigram = gensim.models.phrases.Phrases(texts, min_count=3, threshold=5)

# Trigram model: run phrase detection a second time over the bigram-transformed
# reviews, so an already-joined pair can merge with a neighbouring token.
trigram = gensim.models.phrases.Phrases(bigram[texts], min_count=3, threshold=5)

In [53]:
# Construct bigram for document 1
print(bigram[texts[0]])

['our', 'experience', 'at', 'rancho', 'valencia', 'wa', 'absolutely_perfect', 'from_beginning', 'to', 'end', 'we', 'felt', 'special', 'and', 'very', 'happy', 'during_our', 'stayed', 'i_would', 'come_back', 'in', 'a', 'heart_beat']


In [54]:
import operator

# Joined two-word vocabulary entries (key contains "_") counted at least
# min_count times, ordered from most to least frequent
results = sorted(
    (
        (token, count)
        for token, count in bigram.vocab.items()
        if '_' in token and count >= bigram.min_count
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

res_df = pd.DataFrame(results)
res_df.head(10)

Unnamed: 0,0,1
0,the_room,3154
1,in_the,2911
2,the_hotel,2468
3,and_the,2175
4,of_the,2029
5,it_wa,1943
6,room_wa,1848
7,to_the,1759
8,wa_very,1669
9,the_staff,1629


#### How does Gensim detect n-grams?

In [56]:
import random
import pandas as pd

In [86]:
# Sample text data
sentences = [
    "the quick brown fox jumps over the lazy dog",
    "the lazy dog slept in the shade of the tree"
]

In [87]:
# Collect the distinct whitespace-separated words across all sentences
vocabulary = {word for sentence in sentences for word in sentence.split()}

In [88]:
# Give every vocabulary word an integer position (used as its matrix row/column)
word_to_index = dict(zip(vocabulary, range(len(vocabulary))))

In [89]:
word_to_index

{'over': 0,
 'shade': 1,
 'fox': 2,
 'brown': 3,
 'dog': 4,
 'lazy': 5,
 'slept': 6,
 'jumps': 7,
 'in': 8,
 'quick': 9,
 'the': 10,
 'of': 11,
 'tree': 12}

In [90]:
# Square integer matrix of zeros with one row and one column per vocabulary word
co_occurrence_matrix = np.zeros((len(vocabulary),) * 2, dtype=int)

In [91]:
# Define window size for co-occurrence counting
window_size = 2

In [92]:
# Count, for every word, how often each other word appears within `window_size`
# positions of it in the same sentence.
# The matrix is reset first so that re-running this cell does not add the same
# counts a second time.
co_occurrence_matrix.fill(0)

for sentence in sentences:
    words = sentence.split()
    for i, word in enumerate(words):
        current_index = word_to_index[word]
        # Clamp the window to the sentence boundaries
        start = max(i - window_size, 0)
        end = min(i + window_size + 1, len(words))
        # Neighbours on both sides, excluding the word at position i itself
        context = words[start:i] + words[i+1:end]
        for context_word in context:
            context_index = word_to_index[context_word]
            co_occurrence_matrix[current_index, context_index] += 1

In [93]:
# Wrap the matrix in a DataFrame for easier reading.
# Row/column labels are taken from word_to_index, ordered by index, so they are
# guaranteed to line up with the positions used when the matrix was filled,
# instead of relying on the iteration order of the `vocabulary` set.
labels = sorted(word_to_index, key=word_to_index.get)
co_occurrence_df = pd.DataFrame(co_occurrence_matrix, index=labels, columns=labels)


In [94]:
co_occurrence_df

Unnamed: 0,over,shade,fox,brown,dog,lazy,slept,jumps,in,quick,the,of,tree
over,0,0,1,0,0,1,0,1,0,0,1,0,0
shade,0,0,0,0,0,0,0,0,1,0,2,1,0
fox,1,0,0,1,0,0,0,1,0,1,0,0,0
brown,0,0,1,0,0,0,0,1,0,1,1,0,0
dog,0,0,0,0,0,2,1,0,1,0,2,0,0
lazy,1,0,0,0,2,0,1,0,0,0,2,0,0
slept,0,0,0,0,1,1,0,0,1,0,1,0,0
jumps,1,0,1,1,0,0,0,0,0,0,1,0,0
in,0,1,0,0,1,0,1,0,0,0,1,0,0
quick,0,0,1,1,0,0,0,0,0,0,1,0,0
