# IMPORTS

In [21]:
import re
from nltk.tokenize import word_tokenize
from string import punctuation 
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import punkt
import random as r
import pymysql

# DATA

In [2]:
import csv

input_file = csv.DictReader(open("review_data_sentiment.csv",encoding='cp850'))
data = []
k = 0
for i in input_file:
    data.append(dict(i))

x = round(0.8*(len(data)))
data2 = data[:]

train_data = r.sample(data, x)
for i in train_data:
    data2.remove(i)
test_data = data2[:]

print("TRAIN DATA")
print(len(train_data))
print("TEST DATA")
print(len(test_data))
# type(input_file)

TRAIN DATA
21230
TEST DATA
5308


# MODEL

In [3]:
class PreProcessReviews:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])
        
    def processReviews(self, list_of_reviews):
        processedReviews=[]
            
        for review in list_of_reviews:
            processedReviews.append((self._processReview(review["text"]),review["sentiment"]))
        return processedReviews
    
    def _processReview(self, review):
        review = review.lower() # convert text to lower-case
        review = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', review) # remove URLs
        review = re.sub('@[^\s]+', 'AT_USER', review) # remove usernames
        review = re.sub(r'#([^\s]+)', r'\1', review) # remove the # in #hashtag
        review = word_tokenize(review) # remove repeated characters (helloooooooo into hello)
        return [word for word in review if word not in self._stopwords]
    
reviewProcessor = PreProcessReviews()
preprocessedTrainingSet = reviewProcessor.processReviews(train_data)
print(preprocessedTrainingSet[0])
def buildVocabulary(preprocessedTrainingData):
    all_words = []
    
    for (words, sentiment) in preprocessedTrainingData:
        all_words.extend(words)

    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    
    return word_features
def extract_features(review):
    review_words = set(review)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in review_words)
    return features

word_features = buildVocabulary(preprocessedTrainingSet)
trainingFeatures = nltk.classify.apply_features(extract_features, preprocessedTrainingSet)
print(trainingFeatures[:5])
print(word_features[:5])

(['bought', '3', 'different', 'micro', 'bluetooth', 'speakers', 'one', 'finished', 'close', 'first', 'place', 'get', 'quality', 'standpoint', 'speaker', 'one-quarter', 'price', 'speaker', 'rated', 'number', 'one', 'one', 'lasts', 'long', 'number', 'one', 'rated', 'speaker', 'one', 'far', 'best', 'value'], 'T')


TypeError: 'dict_keys' object is not subscriptable

# SAVE WORD FEATURES

In [73]:
w = list(word_features)
with open('wordfeatures.txt', 'w') as f:
    for item in w:
        f.write("%s\n" % item)

# TRAINING

In [7]:
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
print(NBayesClassifier)

<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x000000523C38AC18>


# SAVE MODEL

In [8]:
import pickle
f = open('NBClassifier_1', 'wb')
pickle.dump(NBayesClassifier, f)
f.close()

# TESTING

In [9]:
preprocessedTestingSet = reviewProcessor.processReviews(test_data)
test = []
result = []
for i in range(len(data)-x):
    t = r.choice(preprocessedTestingSet)
    preprocessedTestingSet.remove(t)
    test.append(t[0])
    result.append(t[1])
# print(test)
# print(result)
k = 0
p = 0
for i in test:
    NB = NBayesClassifier.classify(extract_features(i))
#     print(NB," ",result[k])
    if NB == result[k]:
        p = p + 1
    k = k + 1
accuracy = p*100/k
print("ACCURACY =")
print(accuracy)

ACCURACY =
84.8153730218538


# DATABASE

In [56]:
import pymysql
conn = pymysql.connect(host="localhost",user="root",passwd="",db="fake_feedback")
cur = conn.cursor()
print("Database Connected")

try:
    d = "select * from reviews"
    cur.execute(d)
    review_data = cur.fetchall()
#     print(data)
    neg = "select * from negative_words"
    cur.execute(neg)
    neg_words = cur.fetchall()
#     print(neg_words)
except Exception as e:
    print("ERROR=",e)
    
review = []
ip_addr = []
negatives = []

for i in review_data:
    review.append(i[2])
    ip_addr.append(i[1])

for i in neg_words:
    negatives.append(i[1])

me_list = ['i','me','myself','my','mine','us','we','our']

ln = ['.com','.org','.net','.gov']

# print(ip_addr)
# print(negatives)

Database Connected


# TO REMOVE PUNCTUATIONS

In [57]:
def remove_punc(s):
    # define punctuation
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    
    no_punct = ""
    for char in s:
        if char not in punctuations:
            no_punct = no_punct + char

    # display the unpunctuated string
    return no_punct

# VERIFICATION

In [64]:
def verify_fake(ip,r):
    v = [0 for i in range(4)]
    r1 = remove_punc(r)
    rev1 = r1.split(" ")
    rev = [i.lower() for i in rev1]
    nc = 0
    myc = 0
    linkc = 0
    
    ipc = ip_addr.count(ip)
    for i in negatives:
        nc = nc + rev.count(i)
    myc = 0
    for i in me_list:
        myc = myc + rev.count(i)
    for i in r.split(" "):
        for j in ln:
            if j in i:
                linkc = linkc + 1
    if ipc > 1:
        v[0] = 1
    if nc > 3:
        v[1] = 1
    if myc > 4:
        v[2] = 1
    if linkc > 0:
        v[3] = 1

    return v   

In [65]:
verify = ['Multiple reviews from same IP','Too many Negative Words','Self Promotion','Promotions via Links']

# IP GENERATION (FOR OFFLINE PURPOSES)

In [66]:
import random as r
def ip_generator():
    num = [i for i in range(255)]
    ip = str(r.choice(num))+"."+str(r.choice(num))+"."+str(r.choice(num))+"."+str(r.choice(num))
    return ip

# FAKE REVIEW DETECTION WITH ANALYSIS

In [67]:
import random as r
def review_detection(r_ip,ipr,model):
    result = ['','']
    fake_reasons = []
    
    if ipr == -1:
        return -1

    v = verify_fake(ipr,r_ip)
    if v.count(1)>=2:
        result[0] = 'FAKE'
    else:
        result[0] = 'NOT FAKE'
    for i in range(4):
        if v[i] == 1:
            fake_reasons.append(verify[i])
    
    preprocessedReview = reviewProcessor._processReview(r_ip)
    NB = model.classify(extract_features(preprocessedReview))
    
    if NB == 'T':
        result[1] = 'POSITIVE'
    elif NB == 'F':
        result[1] = 'NEGATIVE'
    else:
        result[1] = 'UNDEFINED'
    
    result.append(fake_reasons)
    
    return result

# LOAD MODEL

In [68]:
import pickle
f = open('NBClassifier_1', 'rb')
model = pickle.load(f)
f.close()

# EXAMPLE

In [69]:
r_ip = input('Enter Review:-')
ip_ip = int(input('0 for repeat ip; 1 for new ip:-'))

if ip_ip == 0:
    ipr = r.choice(ip_addr)
elif ip_ip == 1:
    ipr = ip_generator()
else: 
    print("Invalid option")
    ipr = -1

# print(ipr)

answer = review_detection(r_ip,ipr,model)

if answer == -1:
    print("ERROR")
else:
    print()
    print()
    print("REVIEW CLASSIFICATION:")
    print(answer[1])
    print(answer[0])
    print()
    if answer[0] == 'FAKE':
        print("REASONS:")
        for i in answer[2]:
            print(i)
    if answer[0] == 'NOT FAKE':
        print("RAISED RED FLAGS:")
        for i in answer[2]:
            print(i)
#     print(answer)

Enter Review:-Finger print very poor, can't function well apart from this there is Heating Problems. My phone is my, me I myself. use jhghjag.com
0 for repeat ip; 1 for new ip:-0


REVIEW CLASSIFICATION:
NEGATIVE
FAKE

REASONS:
Multiple reviews from same IP
Self Promotion
Promotions via Links


In [30]:
# Very Nice Phone. Nice colour and Design. 6.5 inch screen makes a great viewing experience for movies and game's specialy PUBG. Super fast Charging. Full Charge in just 1.5 Hrs. But memory need to be 128 GB for this price range. Me me mine my visit aksjdgjksahd.com us we mine.
# Finger print very poor, can't function well apart from this there is Heating Problems
# Very Nice Phone. Nice colour and Design. 6.5 inch screen makes a great viewing experience for movies and game's specialy PUBG. Super fast Charging. Full Charge in just 1.5 Hrs. But memory need to be 128 GB for this price range.