In [1]:
#All code from stech
import pandas as pd
import re
import string
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize


pd.set_option('display.max_colwidth',100)


In [2]:
#Retrieve Positive comment corpus
Positivedata = pd.read_csv('Positive comments', sep='\t', header = None)
Positivedata.columns = ['msg']

In [3]:
#Retrieve Negative comment corpus
Negativedata = pd.read_csv('Negative comments', sep='\t', header = None)
Negativedata.columns = ['msg']

In [4]:
#function for corpus comment preprocessing
def clean_text(text):
    text = "".join([c for c in text.lower() if c not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([word for word in tokens])
    return text

In [5]:
#Cleaned positive data
Positivedata['msg_clean'] = Positivedata['msg'].apply(lambda x: clean_text(x))
Positivedata.head()

Unnamed: 0,msg,msg_clean
0,me wage watina deyak ratata labadun harshada Silva mathithumani oba thumata vijayakgrahanayak la...,me wage watina deyak ratata labadun harshada silva mathithumani oba thumata vijayakgrahanayak la...
1,Oba thamayi niyama nayakaya daksayi.parinathyi,oba thamayi niyama nayakaya daksayiparinathyi
2,Api pinkrala nha obavgenayakyekugen sewayalabanna gunagaruk budimat naykaykek obatuma ape minisu...,api pinkrala nha obavgenayakyekugen sewayalabanna gunagaruk budimat naykaykek obatuma ape minisu...
3,Oba parliament ekata awasyama mohothai me. Chanda laksa 2.5 k ho dunna minissu wenuwen Enna plz,oba parliament ekata awasyama mohothai me chanda laksa 25 k ho dunna minissu wenuwen enna plz
4,podi minihage duka hadunana ekama nayakaya obathuma pamanai,podi minihage duka hadunana ekama nayakaya obathuma pamanai


In [6]:
#Cleaned negative data
Negativedata['msg_clean'] = Negativedata['msg'].apply(lambda x: clean_text(x))
Negativedata.head()


Unnamed: 0,msg,msg_clean
0,VIP ratha walata janathawa me dakwana wiodaya anduwa kerehi uu yanatha wirodaya saha tharahawa s...,vip ratha walata janathawa me dakwana wiodaya anduwa kerehi uu yanatha wirodaya saha tharahawa s...
1,"Duminda,S.M.Chandrasena, ranjith , uddika premarathna yana mantri waruni, Thalawa eppawala pare ...",dumindasmchandrasena ranjith uddika premarathna yana mantri waruni thalawa eppawala pare deshapa...
2,wathman arbudayata hethuwa asarthaka uu palakainya,wathman arbudayata hethuwa asarthaka uu palakainya
3,rassana ne hemadama pare. chandeta witharak api mathak wena deshapalanaya,rassana ne hemadama pare chandeta witharak api mathak wena deshapalanaya
4,Janathawa biyata path wela adahas dakwanna beri welaaka janawaramatath pitupaala anduwa port cit...,janathawa biyata path wela adahas dakwanna beri welaaka janawaramatath pitupaala anduwa port cit...


In [7]:
#function for user enterd comment preprocessing 
def comment_clean_text(text):
    text = "".join([c for c in text.lower() if c not in string.punctuation])
    tokens = re.split('\W+', text)
    return tokens

In [8]:
#probability calculation
def probability_calcuation(user_comment, comments):
    unigram_list = []
    bigram_list = []

    for row in comments:
        tokens = re.split('\W+', row)
        unigram_list = unigram_list + list(tokens)
        bigrams = nltk.bigrams(tokens)
        bigram_list = bigram_list + list(bigrams)
    
    unigramfreq = FreqDist(unigram_list)
    bigramfreq = FreqDist(bigram_list)
  
    probability = 1.0
    
    for word in range(len(user_comment)):
        if user_comment[word] in bigramfreq:
            bigram_count = bigramfreq[user_comment[word]]
        else:
            bigram_count = 0

        if user_comment[word][0] in unigramfreq:
            unigram_count = unigramfreq[user_comment[word][0]]
        else:
            unigram_count = 0

        V = len(set(unigram_list))
        probability = probability * ((bigram_count + 1) / (unigram_count + V))

    return probability

In [9]:
#perplexity calculation
def perplexity_calculation(probability, n):
    return pow((1/probability),(1/n))

In [10]:
comment = input("Please enter your comment: ")

cleaned_comment = comment_clean_text(comment)
number_of_words = len(cleaned_comment)
user_comment = list(nltk.bigrams(cleaned_comment))

negative_comments = Negativedata['msg_clean']
positive_comments = Positivedata['msg_clean']

positive_probability = probability_calcuation(user_comment, positive_comments)
negative_probability = probability_calcuation(user_comment, negative_comments)

if(positive_probability > negative_probability):  #compare positive and negative probabilities
    perplexity = perplexity_calculation(positive_probability, number_of_words)
    print(" ")
    print("Positive Comment")
    print("Perplexity of comment  = " + str(perplexity))
else:
    perplexity = perplexity_calculation(negative_probability, number_of_words)
    print(" ")
    print("Negative comment")
    print("Perplexity of comment = " + str(perplexity))
    
print("")
print("Positive probability = " , positive_probability)
print("Negative probability =" , negative_probability)

Please enter your comment: dheshapalana jiwithaye madhutu nihathama nayakaya obata jaya niyathayi mewarath ,..
 
Positive Comment
Perplexity of comment  = 324.9100099599818

Positive probability =  7.62726938347061e-26
Negative probability = 8.525370401911896e-27
