# **1. Pre Processing**

---



**install Hazm library**

In [None]:
!pip install hazm



# **Import library**

In [None]:
import pandas as pd
from hazm import sent_tokenize, word_tokenize, stopwords_list, Normalizer, POSTagger
import re
import string
from collections import Counter
from itertools import chain

import nltk
from nltk.probability import LaplaceProbDist, SimpleGoodTuringProbDist

# **Read file**

In [None]:
data = pd.read_csv('/content/digikala_comment.csv')

# **Tpkenize to sentences**

In [None]:
sentences = [sent_tokenize(comment) for comment in data['comment']]

sentences[-2]

['سلام ٬ چندماهی میشه این پاور بانک رو تهیه کردم ٬ طبق تستی که انجام دادم 2 بار گوشی 3200 میلی آمپری رو شارژ میکنه که جمعاً 6400 میلی آمپر اینکه گفتن 12.500 ٬ ادعایی بیش نیست خیلی طول میکشه خودش شارژ شه ٬ زودم خالی میشه عدد 100 رو هم نمیتونه نشون بده چون LED که داره 2 رقمیه ٬ پس انتهاش 99 هست وزنش هم زیاده']

# **Remove Punctuation**

In [None]:
sentences = [[re.sub(r'[^\w\s]', '', sentence) for sentence in sublist] for sublist in sentences]

sentences[-2]

['سلام  چندماهی میشه این پاور بانک رو تهیه کردم  طبق تستی که انجام دادم 2 بار گوشی 3200 میلی آمپری رو شارژ میکنه که جمعا 6400 میلی آمپر اینکه گفتن 12500  ادعایی بیش نیست خیلی طول میکشه خودش شارژ شه  زودم خالی میشه عدد 100 رو هم نمیتونه نشون بده چون LED که داره 2 رقمیه  پس انتهاش 99 هست وزنش هم زیاده']

# **Rmove Extra Blanck Space**

In [None]:
sentences = [[re.sub(r"^\s+|\s+$", "", re.sub(r'\s+', ' ', sentence)) for sentence in sublist] for sublist in sentences]

sentences[-2]

['سلام چندماهی میشه این پاور بانک رو تهیه کردم طبق تستی که انجام دادم 2 بار گوشی 3200 میلی آمپری رو شارژ میکنه که جمعا 6400 میلی آمپر اینکه گفتن 12500 ادعایی بیش نیست خیلی طول میکشه خودش شارژ شه زودم خالی میشه عدد 100 رو هم نمیتونه نشون بده چون LED که داره 2 رقمیه پس انتهاش 99 هست وزنش هم زیاده']

# **Start and Stop Token**

In [None]:
SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"

def add_start_end_tokens(sentences, n):
    """Wrap each sentence in SOS and EOS tokens.

    For n >= 2, n-1 SOS tokens are added, otherwise only one is added.

    Args:
        sentences (list of str): the sentences to wrap.
        n (int): order of the n-gram model which will use these sentences.
    Returns:
        List of sentences with SOS and EOS tokens wrapped around them.

    """
    sos = SOS * (n-1) if n > 1 else SOS
    return [['{}{} {}'.format(sos, sentence, EOS) for sentence in sublist] for sublist in sentences]

# **Replace Singletones words**

In [None]:
def replace_singletons(sentences):
    """Replace tokens which appear only once in the corpus with <UNK>.

    Args:
        tokens (list of str): the tokens comprising the corpus.
    Returns:
        The same list of tokens with each singleton replaced by <UNK>.

    """
    # Tokenize each sentence using word_tokenize
    words = [sentence.split() for sentence in chain.from_iterable(sentences)]

    # Flatten the list of lists
    flat_tokens = list(chain.from_iterable(words))

    # Count the frequency of each word
    vocab = Counter(flat_tokens)
    return [token if vocab[token] > 1 else UNK for token in flat_tokens]

# **2. Model Language**

---



# **Generate N-gram**

In [None]:
def ngram(sentences, grams):
    model = {}
    sentences = add_start_end_tokens(sentences, grams)
    words = replace_singletons(sentences)
    for i in range(len(words) - grams + 1):
        n_gram = tuple(words[i:i + grams])
        if n_gram in model.keys():
            # Increment the count for this n-gram
            model[n_gram] += 1
        else:
            # Initialize this n-gram count to 1
            model[n_gram] = 1

    return model

# **Ngrams**

In [None]:
unigrams = ngram(sentences, 1)
bigrams = ngram(sentences, 2)
trigrams = ngram(sentences, 3)

print(f"unigrams:\n{unigrams}\n\nbigrams:\n{bigrams}\n\ntrigrams:\n{trigrams}")

unigrams:
{('<s>',): 463, ('نسبت',): 26, ('به',): 153, ('قیمتش',): 18, ('ارزش',): 11, ('خرید',): 28, ('داره',): 73, ('<UNK>',): 1483, ('طراحیش',): 2, ('قشنگه',): 3, ('تنها',): 4, ('هست',): 56, ('که',): 187, ('باعث',): 11, ('میشه',): 37, ('باشه',): 28, ('</s>',): 463, ('چند',): 16, ('ماهی',): 2, ('گرفتمش',): 2, ('برای',): 81, ('برنامه',): 8, ('نویسی',): 2, ('و',): 366, ('کارای',): 3, ('گرافیکی',): 2, ('ازش',): 12, ('استفاده',): 39, ('میکنم',): 25, ('واقعا',): 32, ('از',): 208, ('هر',): 18, ('لحاظ',): 4, ('عالیه',): 27, ('جدید',): 2, ('اقا',): 2, ('همه',): 11, ('چیش',): 2, ('خوبه',): 33, ('فقط',): 31, ('پایین',): 11, ('زیاد',): 18, ('با',): 99, ('روشن',): 5, ('شدن',): 6, ('گوشی',): 71, ('بیشتر',): 16, ('هم',): 121, ('نکته',): 2, ('دیگه',): 21, ('اینکه',): 18, ('خاطر',): 4, ('این',): 163, ('یه',): 41, ('گلس',): 6, ('بعد',): 26, ('مدتی',): 2, ('جدا',): 3, ('ولی',): 60, ('در',): 80, ('کل',): 17, ('قیمت',): 33, ('بهترین',): 7, ('دوربین',): 17, ('تا',): 40, ('رم',): 5, ('پی',): 2, ('یو',): 3,

# **Sort count matrix**

In [None]:
sorted_unigrams = sorted(unigrams.items(), key=lambda x: x[1], reverse=True)
sorted_bigrams = sorted(bigrams.items(), key=lambda x: x[1], reverse=True)
sorted_trigrams = sorted(trigrams.items(), key=lambda x: x[1], reverse=True)

**Top 8 Unigram**

In [None]:
pd.DataFrame(sorted_unigrams[:8], columns=['unigram', 'count'])

Unnamed: 0,unigram,count
0,"(<UNK>,)",1483
1,"(<s>,)",463
2,"(</s>,)",463
3,"(و,)",366
4,"(از,)",208
5,"(که,)",187
6,"(این,)",163
7,"(به,)",153


**Top 8 Bigram**

In [None]:
pd.DataFrame(sorted_bigrams[:8], columns=['bigram', 'count'])

Unnamed: 0,bigram,count
0,"(</s>, <s>)",462
1,"(<UNK>, <UNK>)",270
2,"(<UNK>, </s>)",80
3,"(و, <UNK>)",80
4,"(<UNK>, و)",71
5,"(<UNK>, که)",48
6,"(<s>, <UNK>)",42
7,"(از, <UNK>)",41


**Top 8 Trigram**

In [None]:
pd.DataFrame(sorted_trigrams[:8], columns=['trigram', 'count'])

Unnamed: 0,trigram,count
0,"(</s>, <s>, <s>)",462
1,"(<UNK>, </s>, <s>)",80
2,"(<UNK>, <UNK>, <UNK>)",56
3,"(<s>, <s>, <UNK>)",42
4,"(<s>, <s>, من)",34
5,"(<UNK>, و, <UNK>)",23
6,"(<s>, <s>, برای)",20
7,"(و, <UNK>, <UNK>)",20


# **Ngrams Probability**

In [None]:
def estimate_probability(word, previous_n_gram, n_gram_counts):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)

    if len(previous_n_gram) == 1:
        # Unigram probability with Laplace smoothing
        prob_dist = nltk.LaplaceProbDist(nltk.FreqDist(n_gram_counts))
        ngram_prob = prob_dist.prob((word,))
    else:
        # Other gram-n probability with SimpleGoodTuring smoothing
        prob_dist = nltk.SimpleGoodTuringProbDist(nltk.FreqDist(n_gram_counts))
        ngram_prob = prob_dist.prob(previous_n_gram + (word,))

    return ngram_prob

In [None]:
def calculate_ngram_probabilities(ngram_counts):
    ngram_probabilities = {}

    # Iterate over each n-gram and its count
    for ngram, count in ngram_counts.items():
        # Check if it's a unigram or higher order n-gram
        if len(ngram) == 1:
            probability = estimate_probability(ngram[0], (), ngram_counts)
        else:
            probability = estimate_probability(ngram[-1], ngram[:-1], ngram_counts)

        # Store the probability in the dictionary
        ngram_probabilities[ngram] = probability

    return ngram_probabilities

In [None]:
unigram_prob = calculate_ngram_probabilities(unigrams)
bigram_prob = calculate_ngram_probabilities(bigrams)
trigram_prob = calculate_ngram_probabilities(trigrams)

print(f"unigrams:\n{unigram_prob}\n\nbigrams:\n{bigram_prob}\n\ntrigrams:\n{trigram_prob}")

unigrams:
{('<s>',): 0.05083912935376905, ('نسبت',): 0.002747043002222248, ('به',): 0.01672093416075611, ('قیمتش',): 0.0018684887298897875, ('ارزش',): 0.001101858726927239, ('خرید',): 0.002966847715077433, ('داره',): 0.007916969937549234, ('<UNK>',): 0.16310058358154794, ('طراحیش',): 0.00014193201410480537, ('قشنگه',): 0.00024184282359126686, ('تنها',): 0.00034578567149146143, ('هست',): 0.00604644204387057, ('که',): 0.020462843164672923, ('باعث',): 0.001101858726927239, ('میشه',): 0.003956380439750571, ('باشه',): 0.002966847715077433, ('</s>',): 0.05083912935376905, ('چند',): 0.0016491333870212207, ('ماهی',): 0.00014193201410480537, ('گرفتمش',): 0.00014193201410480537, ('برای',): 0.008797289984526703, ('برنامه',): 0.0007751033424238826, ('نویسی',): 0.00014193201410480537, ('و',): 0.04016333834191288, ('کارای',): 0.00024184282359126686, ('گرافیکی',): 0.00014193201410480537, ('ازش',): 0.0012111277677633928, ('استفاده',): 0.004176338349341506, ('میکنم',): 0.0026371588651964767, ('واقعا',)

# **Perplexity**

In [None]:
def calculate_perplexity(sentence: list, n_gram_counts: dict, flag=True):
    n = len(list(n_gram_counts.keys())[0])

    if flag:
      sentence = ["<s>"] * n + sentence + ["</s>"]
      words = [sub_sentence.split() for sub_sentence in sentence]
      words = list(chain.from_iterable(words))
    else:
      words = sentence

    words = tuple(words)
    N = len(words)

    product_pi = 1.0
    for t in range(n, N):
        n_gram = words[t - n: t]
        word = words[t]

        probability = estimate_probability(word=word, previous_n_gram=n_gram, n_gram_counts=n_gram_counts)
        product_pi *= 1/probability

    perplexity = product_pi**(1/N)
    return perplexity

In [None]:
sentence_test  = [
    ["این لپ تاپ سخت افزار خیلی قوی داره و از پس هرکاری به راحتی بر میاد"],
    ["این ساعت بسیار زیبا طراحی و ساخته شده"],
    ["یک محصول با کیفیت ایرانی که حقیقتا جای حمایت داره"],
    ["بوش و ماندگاری خوب هست من خیلی دوستش دارم"]
]

for sentence in sentence_test:
    print("\nSentence:", sentence)
    # perplexity unigram
    unigram_perplexity = calculate_perplexity(sentence, unigrams)
    print("Unigram Perplexity:", unigram_perplexity)

    # perplexity bigram
    bigram_perplexity = calculate_perplexity(sentence, bigrams)
    print("Bigram Perplexity:", bigram_perplexity)

    # perplexity trigram
    trigram_perplexity = calculate_perplexity(sentence, trigrams)
    print("Trigram Perplexity:", trigram_perplexity)


Sentence: ['این لپ تاپ سخت افزار خیلی قوی داره و از پس هرکاری به راحتی بر میاد']
Unigram Perplexity: 390.5734340154642
Bigram Perplexity: 1.973494497353576
Trigram Perplexity: 1.2504054780129021

Sentence: ['این ساعت بسیار زیبا طراحی و ساخته شده']
Unigram Perplexity: 196.14585947745513
Bigram Perplexity: 1.8619803457113262
Trigram Perplexity: 1.2179602608167037

Sentence: ['یک محصول با کیفیت ایرانی که حقیقتا جای حمایت داره']
Unigram Perplexity: 247.90061462679748
Bigram Perplexity: 1.9019759019782352
Trigram Perplexity: 1.2294500418365617

Sentence: ['بوش و ماندگاری خوب هست من خیلی دوستش دارم']
Unigram Perplexity: 175.646936278939
Bigram Perplexity: 1.8835390615058878
Trigram Perplexity: 1.2241336589080962


# **Predict word**

In [63]:
import random

def suggest_sentence(sentence, n_gram_models, n_gram_probabilities, num_words=12):
    random.seed(50)
    suggested_sentence = sentence.copy()
    n_gram_order = len(list(n_gram_models.keys())[0])
    suggested_words = set()

    while len(suggested_sentence) < num_words:
        # Determine the context based on the n-gram order
        if n_gram_order == 1:
            context = ()
        elif n_gram_order == 2:
            context = tuple(suggested_sentence[-1:])
        elif n_gram_order == 3:
            context = tuple(suggested_sentence[-2:])

        # Get the probabilities for the next word based on the context
        prob_dist = n_gram_probabilities

        # Get the possible next words and their probabilities
        possible_next_words = [ngram[-1] for ngram in prob_dist.keys() if ngram[:-1] == context and ngram[-1] not in ['<s>', '</s>']]
        next_word_probs = [prob_dist[ngram] for ngram in prob_dist.keys() if ngram[:-1] == context and ngram[-1] not in ['<s>', '</s>']]

        # Choose the next word based on the probabilities
        if next_word_probs:
            # Find the indices of the top 6 probabilities
            top_indices = sorted(range(len(next_word_probs)), key=lambda i: next_word_probs[i], reverse=True)[:6]

            # Randomly select one of the top 6 words
            next_word_index = random.choice(top_indices)
            next_word = possible_next_words[next_word_index]

            # Check if the suggested word has already been suggested
            while next_word in suggested_words:
                top_indices.remove(next_word_index)
                if top_indices:
                    next_word_index = random.choice(top_indices)
                    next_word = possible_next_words[next_word_index]
                else:
                    break

            # Add the suggested word to the suggested words set
            suggested_words.add(next_word)
        else:
            # If there are no words for the given context, break the loop
            break

        # Append the next word to the suggested sentence
        suggested_sentence.append(next_word)

    return suggested_sentence


In [None]:
def convert_oov(sentence):
  words = []
  for word in sentence:
    flag = 0
    for uni, count in unigrams.items():
      if uni[0] == word:
        flag = 1
        words.append(word)
        break
    if flag == 0:
      words.append('<UNK>')
  return words

# **Show Suggests and Perplexity**

In [64]:
sentence_test  = [
    ["کیفیت محصولات چینی زرین"],
    ["از لحاظ جنس جنس خوبی داره"],
    ["حتما پیشنهاد میکنم"],
    ["بعد از چند روز استفاده"]
]

for sentence in sentence_test:
  words_list = [sub_sentence.split() for sub_sentence in sentence]
  words_list = list(chain.from_iterable(words_list))
  words_list = convert_oov(words_list)

  print("\nSentence:", sentence)
  unigram_suggestion = suggest_sentence(words_list, unigrams, unigram_prob)
  print("\nUnigram suggestion:")
  for i, item in enumerate(unigram_suggestion, start=1):
        print(f"{i}. {item}")
  print("Perplexity: ", calculate_perplexity(unigram_suggestion, unigrams, False))


  bigram_suggestion = suggest_sentence(words_list, bigrams, bigram_prob)
  print("\nBigram suggestion:")
  for i, item in enumerate(bigram_suggestion, start=1):
        print(f"{i}. {item}")
  print("Perplexity: ", calculate_perplexity(bigram_suggestion, bigrams, False))

  trigram_suggestion = suggest_sentence(words_list, trigrams, trigram_prob)
  print("\nTrigram suggestion:")
  for i, item in enumerate(trigram_suggestion, start=1):
        print(f"{i}. {item}")
  print("Perplexity: ", calculate_perplexity(trigram_suggestion, trigrams, False))


Sentence: ['کیفیت محصولات چینی زرین']

Unigram suggestion:
1. کیفیت
2. محصولات
3. چینی
4. <UNK>
5. که
6. از
7. و
8. به
9. <UNK>
10. این
11. این
12. به
Perplexity:  54.37351766214306

Bigram suggestion:
1. کیفیت
2. محصولات
3. چینی
4. <UNK>
5. باشه
6. کامل
7. اما
8. وقتی
9. با
10. تنظیمات
11. <UNK>
12. هست
Perplexity:  1.8835390615058878

Trigram suggestion:
1. کیفیت
2. محصولات
3. چینی
4. <UNK>
5. مدل
6. <UNK>
7. رو
8. خرید
9. ک
10. <UNK>
11. باتری
12. داد
Perplexity:  1.2179602608167037

Sentence: ['از لحاظ جنس جنس خوبی داره']

Unigram suggestion:
1. از
2. لحاظ
3. جنس
4. جنس
5. خوبی
6. داره
7. که
8. از
9. و
10. به
11. <UNK>
12. این
Perplexity:  87.29164571022395

Bigram suggestion:
1. از
2. لحاظ
3. جنس
4. جنس
5. خوبی
6. داره
7. دسته
8. همیشه
9. <UNK>
10. جدید
11. اون
12. قیمت
Perplexity:  1.8835390615058878

Trigram suggestion:
1. از
2. لحاظ
3. جنس
4. جنس
5. خوبی
6. داره
7. شبیه
8. پرفیوم
9. <UNK>
10. <UNK>
11. که
12. خیلی
Perplexity:  1.2179602608167037

Sentence: ['حتما پیشنهاد میکنم

# **3. POS tagging**

---



In [None]:
tagger = POSTagger(model='/content/pos_tagger.model')
tagged_dataset = [[tagger.tag(sentence.split()) for sentence in sublist] for sublist in sentences]

for sentence_tags in tagged_dataset:
    print(sentence_tags)

[[('نسبت', 'NOUN'), ('به', 'ADP'), ('قیمتش', 'NOUN'), ('ارزش', 'NOUN,EZ'), ('خرید', 'NOUN'), ('داره', 'VERB'), ('جاداره', 'NOUN,EZ'), ('طراحیش', 'NOUN'), ('قشنگه', 'VERB'), ('تنها', 'ADV'), ('مشکلش', 'NOUN'), ('بندهای', 'NOUN,EZ'), ('ضعیفش', 'ADJ'), ('هست', 'VERB'), ('که', 'SCONJ'), ('باعث', 'ADJ,EZ'), ('میشه', 'NOUN'), ('استحکام', 'NOUN,EZ'), ('چندانی', 'ADJ'), ('نداشنه', 'VERB'), ('باشه', 'VERB')]]
[[('چند', 'DET'), ('ماهی', 'NOUN'), ('میشه', 'VERB'), ('که', 'SCONJ'), ('گرفتمش', 'VERB')], [('برای', 'ADP,EZ'), ('برنامه', 'NOUN,EZ'), ('نویسی', 'NOUN'), ('و', 'CCONJ'), ('کارای', 'NOUN,EZ'), ('گرافیکی', 'ADJ'), ('ازش', 'ADP'), ('استفاده', 'NOUN'), ('میکنم', 'VERB')], [('واقعا', 'ADV'), ('از', 'ADP'), ('هر', 'DET'), ('لحاظ', 'NOUN'), ('بگین', 'NOUN'), ('عالیه', 'ADJ')]]
[[('پراید', 'NOUN,EZ'), ('ستون', 'NOUN,EZ'), ('جدید', 'ADJ')]]
[[('اقا', 'NOUN'), ('همه', 'DET,EZ'), ('چیش', 'NOUN'), ('خوبه', 'VERB'), ('فقط', 'ADV'), ('از', 'ADP'), ('پایین', 'ADV'), ('زیاد', 'ADJ,EZ'), ('حاشیه', 'NOUN')

# **Occurence of tags**

In [None]:
tag_counts = {}

for sentence_tags in tagged_dataset:
    for word_tag_pair in sentence_tags:
        if len(word_tag_pair) > 1:
            tag = word_tag_pair[1]
            if tag in tag_counts:
                tag_counts[tag] += 1
            else:
                tag_counts[tag] = 1

for tag, count in tag_counts.items():
    print(f"Tag: {tag}, Count: {count}")

Tag: ('به', 'ADP'), Count: 15
Tag: ('ماهی', 'NOUN'), Count: 1
Tag: ('برنامه', 'NOUN,EZ'), Count: 3
Tag: ('از', 'ADP'), Count: 13
Tag: ('ستون', 'NOUN,EZ'), Count: 1
Tag: ('همه', 'DET,EZ'), Count: 1
Tag: ('نکته', 'NOUN,EZ'), Count: 1
Tag: ('در', 'ADP'), Count: 2
Tag: ('هو', 'NOUN'), Count: 1
Tag: ('سبک', 'ADJ'), Count: 1
Tag: ('خیلی', 'ADV'), Count: 10
Tag: ('خوبه', 'VERB'), Count: 4
Tag: ('پیشنهاد', 'NOUN'), Count: 8
Tag: ('چند', 'DET'), Count: 3
Tag: ('کارکرد', 'NOUN,EZ'), Count: 1
Tag: ('های', 'NOUN'), Count: 4
Tag: ('و', 'CCONJ'), Count: 8
Tag: ('خوب', 'ADJ,EZ'), Count: 1
Tag: ('ساخت', 'NOUN,EZ'), Count: 1
Tag: ('کل', 'NOUN,EZ'), Count: 3
Tag: ('گردن', 'NOUN'), Count: 1
Tag: ('عالی', 'ADJ'), Count: 2
Tag: ('کفش', 'NOUN,EZ'), Count: 2
Tag: ('عنوان', 'NOUN,EZ'), Count: 1
Tag: ('من', 'PRON'), Count: 9
Tag: ('واقعا', 'ADV'), Count: 1
Tag: ('بهش', 'ADP'), Count: 1
Tag: ('تا', 'NOUN'), Count: 1
Tag: ('با', 'ADP'), Count: 4
Tag: ('بر', 'NOUN'), Count: 1
Tag: ('پیشنهاد', 'NOUN,EZ'), Count: 2

# **Nouns**

In [None]:
noun_counts = {}
for tag, count in tag_counts.items():
    if 'NOUN' in tag or 'NOUN,EZ' in tag:
      noun_counts[tag] = count

sorted_nouns = sorted(noun_counts.items(), key=lambda x: x[1], reverse=True)
pd.DataFrame(sorted_nouns[:10], columns=['tag', 'count'])

Unnamed: 0,tag,count
0,"(پیشنهاد, NOUN)",8
1,"(های, NOUN)",4
2,"(کل, NOUN)",4
3,"(برنامه, NOUN,EZ)",3
4,"(کل, NOUN,EZ)",3
5,"(ساعت, NOUN)",3
6,"(نظرم, NOUN)",3
7,"(گزینه, NOUN,EZ)",3
8,"(کفش, NOUN,EZ)",2
9,"(پیشنهاد, NOUN,EZ)",2
