In [1]:
from IPython.display import display
import timeit
from collections import defaultdict
import math
import numpy as np
import pandas as pd
import random
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.dates as md
%matplotlib inline
import operator 

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from bs4 import BeautifulSoup
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import sentiment
from autocorrect import spell # For spelling correction
from urllib import request


In [2]:

url_pos = r'https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt'

url_neg = r'https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt'

pos_list = request.urlopen(url_pos).read().decode('utf-8')[1:]
pos_list = pos_list[pos_list.find("a+"):].split("\n")

neg_list = request.urlopen(url_neg).read().decode('ISO-8859-1')[1:]
neg_list = neg_list[neg_list.find("2-faced"):].split("\n")


In [3]:
# The initial format of he annotated test_set is difficult to read
# as a dataframe, transformation to .csv format is computed first 
# with regular expressions.

test = open('data/annotated_test_set.txt','r', encoding='utf8')
test_file = test.read()
test.close()
test_file[:200]

test_file = re.sub(r"{[^{}]+}", lambda x: x.group(0).replace(",", ";"), test_file)
test_file = test_file.replace(';', "%")
test_file = test_file.replace(',', ";")
test_file = test_file.replace('%', ",")
test_file = test_file.replace('{', "{'")
test_file = test_file.replace(',', ",'")
test_file = test_file.replace(':', "':")
test_file = test_file.replace("},'", "}")

# Once fixed, save and load:
text_file = open("data/annotated_test_set_corrected.csv", "w")
for row in test_file.split(",\n"):
    text_file.write(row)
    text_file.write("\n")
text_file.close()


test = open('data/annotated_test_set_corrected.csv','r', encoding='utf8')
test_file = test.read()
test.close()

test = pd.read_csv('data/annotated_test_set_corrected.csv', delimiter = ";")
test.columns = ['review_id', 'Product', 'Sentiments_test']


In [49]:
df = pd.read_csv('data/Amazon_Unlocked_Mobile.csv', delimiter = ",")
n = len(df)
df.columns = ['Product', 'Brand', 'Price', 'Rating', 'Review', 'Votes']
df['id_col'] = range(0, n)

n_reviews = 500 # Let's get a sample
keep = sorted(random.sample(range(1,n),n_reviews))
keep += list(set(test.review_id)) # this are the reviews annotated for test

df = df[df.id_col.isin(keep)]
n_reviews = len(df)
df['id_new_col'] = range(0, n_reviews)

df.head()


Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,0
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,1
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,2
73,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,GRACIAS ME LLEGO EL PROCTO QUE COMPRE Y LLEVO ...,0.0,73,3
75,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,4,"The keys are a little hard to hit, and I didn'...",0.0,75,4


In [5]:
id_prod = 69

for val in df[df.id_col == id_prod].Review:
    print(val)
    

Nokia Asha 302 Unlocked GSM Phone with 3.2MP Camera, Video, QWERTYDependableTraditional Nokia Menu'sNot Complicated like 'Smart Phones'DurableEasy to use on Straighttalk, Internet, WiFi, Bluetooth.


In [6]:
def get_tokens(df, stem = False, negation = False):
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    reviews = []    
    i = 1
    
    for review in df["Review"]:
        tokenized_review = []      

        review = str(review).lower() # lowercase
        
        # Remove every character except A-Z, a-z,space 
        # and punctuation (we'll need it for negation)
        review = re.sub(r'[^A-Za-z /.]','',review) 
        
        # mark_negation needs punctuation separated by white space.
        review = review.replace(".", " .")   
        
        tokens = word_tokenize(review)
        
        
        for token in tokens:
            # Remove single characters and stop words
            if (len(token)>1 or token == ".") and token not in stop: 
                if stem:
                    tokenized_review.append(stemmer.stem(get_synonym(token)))            
                else:
                    tokenized_review.append(get_synonym(token))
        
        if negation:
            tokenized_review = sentiment.util.mark_negation(tokenized_review)   
        
        # Now we can get rid of punctuation and also let's fix some spellings:
        tokenized_review = [correction(x) for x in tokenized_review if x != "." ]
        
            
        reviews.append(tokenized_review)
        
        if i%100 == 0:
            print('progress: ', (i/len(df["Review"]))*100, "%")
        i = i + 1
        
    return reviews
 

def get_pos(tokenized_reviews):
    tokenized_pos = []
    
    for review in tokenized_reviews:
        tokenized_pos.append(nltk.pos_tag(review))
    
    return tokenized_pos
        
    
def get_frequency(tokens):    
    term_freqs = defaultdict(int)    
    
    for token in tokens:
        term_freqs[token] += 1 
            
    return term_freqs


def get_tdm(tokenized_reviews):
    tdm = []
    
    for tokens in tokenized_reviews:
        tdm.append(get_frequency(tokens))
    
    return tdm

def normalize_tdm(tdm):    
    tdm_normalized = []
        
    for review in tdm:
        den = 0
        review_normalized = defaultdict(int)
        
        for k,v in review.items():
            den += v**2
        den = math.sqrt(den)
    
        for k,v in review.items():
            review_normalized[k] = v/den
        
        tdm_normalized.append(review_normalized)
        
    return tdm_normalized

def get_all_terms(tokenized_reviews):
    all_terms = []
    
    for tokens in tokenized_reviews:
        for token in tokens:
            all_terms.append(token)
            
    return(set(all_terms))
    
def get_all_terms_dft(tokenized_reviews, all_terms):
    terms_dft = defaultdict(int)  
    
    for term in all_terms: 
        for review in tokenized_reviews:
            if term in review:
                terms_dft[term] += 1
                
    return terms_dft


def get_tf_idf_transform(tokenized_reviews, tdm, n_reviews):
    tf_idf = []        
    all_terms = get_all_terms(tokenized_reviews)    
    terms_dft = get_all_terms_dft(tokenized_reviews, all_terms)
    
    for review in tdm:
        review_tf_idf = defaultdict(int)
        for k,v in review.items():
            review_tf_idf[k] = v * math.log(n_reviews / terms_dft[k], 2)
        
        tf_idf.append(review_tf_idf)     
    
    return tf_idf


def get_idf_transform(tokenized_reviews, tdm, n_reviews):
    idf = []    
    terms_dft = defaultdict(int)    
    
    all_terms = get_all_terms(tokenized_reviews)
    
    for term in all_terms: 
        for review in tokenized_reviews:
            if term in review:
                terms_dft[term] += 1
    
    for review in tdm:
        review_idf = defaultdict(int)
        for k,v in review.items():
            review_idf[k] = math.log(n_reviews / terms_dft[k], 2)
        
        idf.append(review_idf)     
    
    return idf


def correction(x):
    ok_words = ["microsd"]
    
    if x.find("_NEG") == -1 and x not in ok_words: # Don't correct if they are negated words or exceptions
        return spell(x)
    else:
        return x

def get_synonym(word):
    synonyms = [["camera","video", "display"], 
                ["phone", "cellphone", "smartphone", "phones"],
               ["setting", "settings"],
               ["feature", "features"],
               ["pictures", "photos"],
               ["speakers", "speaker"]]
    synonyms_parent = ["camera", "phone", "settings", "features", "photos", "speakers"]
    
    for i in range(len(synonyms)):
        if word in synonyms[i]:
            return synonyms_parent[i]
    
    return word


def get_similarity_matrix(similarity, tokenized_reviews):
    similarity_matrix = []
    all_terms = get_all_terms(tokenized_reviews)
    
    for review in similarity:
        similarity_matrix_row = []
        for term in all_terms:
            similarity_matrix_row.append(review[term])
            
        similarity_matrix.append(similarity_matrix_row)
            
    return similarity_matrix
     


In [7]:
# EXECUTE
tic=timeit.default_timer()

tokenized_reviews = get_tokens(df, stem = False, negation = False)
tokenized_pos = get_pos(tokenized_reviews)
tdm = get_tdm(tokenized_reviews)
vsm = normalize_tdm(tdm)
tf_idf = get_tf_idf_transform(tokenized_reviews, tdm, n_reviews)

toc=timeit.default_timer()

print("minutes: ", (toc - tic)/60)


progress:  8.865248226950355 %
progress:  17.73049645390071 %
progress:  26.595744680851062 %
progress:  35.46099290780142 %
progress:  44.32624113475177 %
progress:  53.191489361702125 %
progress:  62.056737588652474 %
progress:  70.92198581560284 %
progress:  79.7872340425532 %
progress:  88.65248226950354 %
progress:  97.51773049645391 %
minutes:  6.4079536137833335


In [8]:

lookup_review = 1
for val in df[df.id_new_col == lookup_review]["Review"]: print(val)
display(tokenized_reviews[lookup_review])
display(tokenized_pos[lookup_review])
display(tdm[lookup_review])
display(tf_idf[lookup_review])


muy buen producto


['may', 'been', 'products']

[('may', 'MD'), ('been', 'VBN'), ('products', 'NNS')]

defaultdict(int, {'may': 1, 'been': 1, 'products': 1})

defaultdict(int,
            {'may': 5.615989396341781,
             'been': 7.554588851677638,
             'products': 6.232660756790275})

In [9]:
def get_product_tokens(df):
    stop = set(stopwords.words('english'))
    products = []
    i = 1
    
    for product in df["Product"]:
        tokenized_product = []      

        product = product.lower() # lowercase
        
        # Remove every character except A-Z, a-z,space 
        # and punctuation (we'll need it for negation)
        product = re.sub(r'[^0-9A-Za-z \.]','',product)    
    
        # Only consider first 10 words of the product names
        tokens = word_tokenize(product)[:11]
        
        for token in tokens:
            # Remove stop words
            if token not in stop:
                tokenized_product.append(token)       
            
        products.append(tokenized_product)
        
        if i%100 == 0:
            print('progress: ', (i/len(df["Product"]))*100, "%")
        i = i + 1
        
    return products


In [10]:

tokenized_products = get_product_tokens(df)
products_tokenized_pos = get_pos(tokenized_products)
products_tdm = get_tdm(tokenized_products)
products_tf_idf = get_tf_idf_transform(tokenized_products, products_tdm, n_reviews)
products_idf = get_idf_transform(tokenized_products, products_tdm, n_reviews)


progress:  8.865248226950355 %
progress:  17.73049645390071 %
progress:  26.595744680851062 %
progress:  35.46099290780142 %
progress:  44.32624113475177 %
progress:  53.191489361702125 %
progress:  62.056737588652474 %
progress:  70.92198581560284 %
progress:  79.7872340425532 %
progress:  88.65248226950354 %
progress:  97.51773049645391 %


In [11]:
lookup_product = 53
display(df[df.id_new_col== lookup_product]["Product"])

# we want to grab those with higher scores (least common terms)
display(sorted(products_idf[lookup_product].items(), 
               key=operator.itemgetter(1), reverse = True)) 

# Unfortunately we can't filter through POS
display(products_tokenized_pos[lookup_product])



9803    Apple iPhone 4 8GB Unlocked- Black
Name: Product, dtype: object

[('4', 5.09515723304034),
 ('8gb', 4.117183539370339),
 ('black', 2.7386719161166093),
 ('apple', 2.319372389983606),
 ('iphone', 2.3003475643018496),
 ('unlocked', 0.4956951626240688)]

[('apple', 'NN'),
 ('iphone', 'NN'),
 ('4', 'CD'),
 ('8gb', 'CD'),
 ('unlocked', 'JJ'),
 ('black', 'JJ')]

In [12]:
colors = ["black", "red", "blue", "white", "gray", "green","yellow", "pink", "gold"]
common_terms = ["smarthphone", "phone", "cellphone", "retail", "warranty", 
                "silver", "bluetooth", "wifi", "wireless", "keyboard", "gps",
               "original", "unlocked", "camera", "certified", "international",
               "actory", "packaging", "us", "usa", "international", "refurbished", 
               "phones", "att", "verizon", "-", "8gb", "16gb", "32gb", "64gb", "contract"]

def standardize_names(products_idf, colors, common_terms):
    standard_names = []
    brands = [str(x).lower() for x in set(df.Brand)]
    
    for product in products_idf:
        
        for k, v in product.items():
            # Remove color and brand words
            if k in colors or k in common_terms or k in brands:
                product[k] = 0
        
        # Grab the first 5 words with highest score
        product = sorted(product.items(), key=operator.itemgetter(1), reverse = True)[:5]
        
        standard_names.append(product)
        
        tokenized_standard_product_names = []
        
    for product in standard_names:
        product_name = []
        for word in product:
            if word[1] > 0:
                product_name.append(word[0])

        tokenized_standard_product_names.append(product_name)
    
    
        
    return tokenized_standard_product_names



In [13]:
standard_product_names = standardize_names(products_idf, colors, common_terms)

product_tdm = get_tdm(standard_product_names)
product_vsm = normalize_tdm(product_tdm)
product_vsm[1]

defaultdict(int,
            {'302': 0.4472135954999579,
             '3.2mp': 0.4472135954999579,
             'asha': 0.4472135954999579,
             'video': 0.4472135954999579,
             'qwerty': 0.4472135954999579})

In [14]:

similarity = product_tdm
product_names_clusters = int(round(n_reviews/2,0))

similarity_matrix = pd.DataFrame(get_similarity_matrix(similarity, standard_product_names), columns = get_all_terms(standard_product_names))

kmeans = KMeans(n_clusters=product_names_clusters, random_state=0).fit(similarity_matrix)
clusters=kmeans.labels_.tolist()

clustered_matrix = similarity_matrix.copy()
clustered_matrix['product_name_cluster'] = clusters
clustered_matrix['id_col'] = range(0, n_reviews)

display(clustered_matrix[:5])

count_clusters = pd.DataFrame(clustered_matrix.product_name_cluster.value_counts())
display(count_clusters[:5])


Unnamed: 0,ace,haweel,stylus,1800,orange,steelwhite,alpha,7.0,gemini,extended,...,kitkat,20mp,d618,curve,wifiinternational,liquid,leather,x2,product_name_cluster,id_col
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,45,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,45,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,45,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,45,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,45,4


Unnamed: 0,product_name_cluster
8,31
18,21
7,16
19,13
45,12


In [15]:
df["cluster_name"] = list(clustered_matrix.product_name_cluster)

def create_standard_name(df):
    new_names = defaultdict(int)
    
    current_names = df.groupby('cluster_name').first().Product
    
    
    for i in set(clusters):
        cluster_name = df[df.cluster_name == i].Product.value_counts().index[0]
        new_name = []
        
        for word in cluster_name.split():
            temp_word= re.sub(r'[^0-9A-Za-z \.\-]','',word).lower()
            if temp_word not in colors and temp_word not in common_terms :
                new_name.append(word)
        new_names[i] = ' '.join(new_name)
    
    new_standard_names = []
    

    for row in df.cluster_name:
        
        new_standard_names.append(new_names[row])
    
    df["Standard_Product_Name"] = new_standard_names
    
    return df

df = create_standard_name(df)         
        
df.head()    

Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name
50,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Shipped quickly and was exactly what I expected!,0.0,50,0,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."
73,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,GRACIAS ME LLEGO EL PROCTO QUE COMPRE Y LLEVO ...,0.0,73,4,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."


In [16]:
df[["Product","Standard_Product_Name"]][df['Product'].str.contains("iPhone")][:8]

Unnamed: 0,Product,Standard_Product_Name
3892,"Apple A1533 Unlocked iPhone 5S Smart Phone, 16...",Apple A1533 iPhone 5S Smart 16 GB
4296,Apple a1549 iPhone 6 64GB T-Mobile (silver),Apple a1549 iPhone 6 T-Mobile
4730,Apple iPhone 3G 8GB (Black) - AT&T,Apple iPhone 3G
5357,Apple iPhone 3GS 16GB (White) - AT&T,Apple iPhone 3GS
5763,Apple iPhone 3GS 8GB Black Factory Unlocked / ...,Apple iPhone 3GS Factory / Not Jailbroken
5822,Apple iPhone 3GS 8GB Black Factory Unlocked / ...,Apple iPhone 3GS Factory / Not Jailbroken
6985,Apple iPhone 4 16GB (Black) - AT&T,Apple iPhone 4
7388,Apple iPhone 4 16GB (Black) - AT&T,Apple iPhone 4


In [17]:
            
                          
def get_all_terms_pos_dft(all_terms, terms_dft):
    all_terms_pos = nltk.pos_tag(all_terms)
    
    i = 0
    for k, v in terms_dft.items():
        all_terms_pos[i] = all_terms_pos[i] + (v,)        
        i+=1
        
    return all_terms_pos

threshold_terms=[]
def get_threshold_terms(all_terms_pos_dft, threshold = 20): 
    for term in all_terms_pos_dft:
        if term[0] in exceptions_to_consider or (term[2] >= threshold and term[1] in ["NN", "NNS", "NNP", "NNPS"] and term[0] not in exceptions_not_to_consider):
                                                              threshold_terms.append(term)
    
    return threshold_terms
            
    
exceptions_to_consider = ["apps", "android", "buttons", "hardware", "wifi",
                         "audio", "speed", "settings", "charger", "design",
                         "price", "look", "trackball", "microsd", "speaker"]

exceptions_not_to_consider = ["phone", "cool", "love", "awesome", "tell",  'tell',
 'feels',
  'works',
 'excelente',
 'item',
 'get',
 'iPhone',
 'dont',
 'lot',
 'let',
 'money',
 'brand',
 'recommend',
 'issues',
 'cant',
 'nothing',
 'number',
 'check',
 'month',
 'husband',
 'need',
 'note',
 'venezuela',
 'give',
 'Samsung',
 'see',
 'turn',
 'pocket',
 'amazing',
 'hands',
 'couldnt',
 'fast',
 'condition',
 'super',                    
 'today',
 'star',
 'life',
 'anyone',
 'storage',
 'speaker',
 'internet',
 'delivery',
 'picture',
 'games',
 'hand',
 'model',
 'glass',
 'case',
 'micro',
 'sound',
 'mp',
 'watch',
 'grm',
 'try',
 'line',
 'thing',
 'isnt',
 'thanks',
 'Verizon',
 'experience',
 'box',
 'scratches',
 'problems',
 'waste',
 'bottom',
 'company',
 'bit',
 'youre',
 'lack',
 'deal',
 'pay',
 'i',
 'reason',
 'issue',
 'couple',
 'option',
 'beautiful',
 'mobile',
 'replacement',
 'wasnt',
 'way',
 'days',
 'loves',
 'trouble',
 'quick',
 'someone',
 'glad',
 'weeks',
 'ones',
 'something',
 'market',
 'galaxy',
 'apple',
 'havent',
 'download',
 'time',
 'lg',
 'send',
 'home',
 'years',
 'product',
 'change',
 'people',
 'review',
 'price',
 'simple',
 'person',
 'lasts',
 'user',
 'hold',
 'please',
 'reviews',
 'work',
 'thats',
 'text',
 'im',
 'end',
 'thank',
 'look',
 'cost',
 'months',
 'buying',
 'point',
 'version',
 'web',
 'times',
 'Nokia',
 'problem',
 'wouldnt',
 'performance',
 'products',
 'minutes',
 'customer',
 'order',
 'guess',
 'things',
 'everything',
 'week',
 'play',
 'daughter',
 'anything',
 'purchase',
 'ok',
 'year',
 'stars',
 'day',
 'wife',
 'son',
 'doesnt',
 'blackberry',
 'hours',
 'return',
 'use']

print("chethan")

chethan


In [18]:
all_terms = get_all_terms(tokenized_reviews)    
terms_dft = get_all_terms_dft(tokenized_reviews, all_terms)
all_terms_pos_dft = get_all_terms_pos_dft(all_terms, terms_dft)
threshold_terms = get_threshold_terms(all_terms_pos_dft, threshold = 0.01 * n_reviews)

threshold_terms[:10]


[('con', 'NN', 13),
 ('att', 'NN', 26),
 ('quality', 'NN', 54),
 ('plan', 'NN', 20),
 ('make', 'NN', 29),
 ('google', 'NN', 12),
 ('hardware', 'NN', 4),
 ('camera', 'NN', 98),
 ('speakers', 'NNS', 17),
 ('wont', 'NN', 18)]

In [19]:
characteristics = [x[0] for x in threshold_terms]

characteristics[:10]

['con',
 'att',
 'quality',
 'plan',
 'make',
 'google',
 'hardware',
 'camera',
 'speakers',
 'wont']

In [20]:
# first import 1000 rows in dataframe
to_prune = [i+1 for i in range(n_reviews)]
ratings = list(df['Rating'])

In [21]:
def get_wordnet_pos(pos):   
    for tag in [('J','ADJ'),('V','VERB'),('N','NOUN'),('R','ADV')]:
        if pos.startswith(tag[0]):
            return getattr(wordnet,tag[1])
    else:
        return 'null'

def get_adj(review):
    with_adj = [tup for tup in review if tup[1] == 'JJ']
    return with_adj

# score for each word
def senti(synset):
    s = swn.senti_synset(synset).pos_score() - swn.senti_synset(synset).neg_score()
    if s>=0:
        return 1
    else:
        return -1

adjs = {x.name().split('.', 1)[0] for x in wn.all_synsets('a')}

### 1. prune reviews without adjectives recognised by wordnet
def prune_adj(tokenized_pos):    
    for k in [i for i in to_prune if i!=0]:
        if not len(get_adj(tokenized_pos[k-1])) or not all(i[0] in adjs for i in get_adj(tokenized_pos[k-1])):
                to_prune[k-1] = 0
    return to_prune

### 2. prune by number of pos and neg adj
# list of scores for each review
def slist(tokenized_pos):
    score = []
    for k in [i for i in to_prune if i!=0]:
        r = get_adj(tokenized_pos[k-1])
        tag = [get_wordnet_pos(tuple[1]) for tuple in r]
        synsets = [r[i][0] + '.' + tag[i] + '.01' for i in range(len(r))] 
        score.append([senti(i) for i in synsets])
    return score

def balance(score_list):
    m=-1
    for k in [i for i in to_prune if i!=0]:
        m+=1
        s = score_list[m]
        if 1 in s and -1 in s and max([s.count(1),s.count(-1)])/min([s.count(1),s.count(-1)]) <= 3:
            to_prune[k-1] = 0
    return to_prune


### 3. prune by average score compared to rating score
def average_score(score_list):
    m = -1
    for k in [i for i in to_prune if i!=0]:
        m += 1
        s = score_list[m]
        #(sum >=0, then rating >=3)
        if sum(s)>=0 and (sum(s)+1)*(ratings[k-1]-2.5)<=0:
            to_prune[k-1] = 0
        elif sum(s)<0 and (sum(s)+1)*(2.5-ratings[k-1])<=0:
            to_prune[k-1] = 0
    return to_prune

In [22]:
# initialise index to_prune = [1,2,3,...,1000]
to_prune = [i+1 for i in range(n_reviews)]
#to_prune = list(set(df.id_col))

to_prune = prune_adj(tokenized_pos)
score_list = slist(tokenized_pos)
to_prune = balance(score_list)
to_prune = average_score(score_list)
# len([i for i in to_prune if i!=0])


In [23]:
# initialise index to_prune = [1,2,3,...,1000]
to_prune = [i+1 for i in range(n_reviews)]
#to_prune = list(set(df.id_col))

to_prune = prune_adj(tokenized_pos)
score_list = slist(tokenized_pos)
to_prune = balance(score_list)
to_prune = average_score(score_list)
# len([i for i in to_prune if i!=0])


In [24]:
to_keep = [i for i in to_prune if i!=0]
to_keep += list(df[df.id_col.isin(list(set(test.review_id)))].id_new_col) # this are the reviews annotated for test
to_keep = list(set(to_keep))

df_filtered = df[df.id_new_col.isin(to_keep)]
df_filtered[:3]

Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W..."


In [25]:
len(list(df_filtered[df_filtered.id_col.isin(list(set(test.review_id)))].id_new_col))

128

In [26]:
to_keep = list(df_filtered.id_new_col)

In [27]:
n_reviews = len(to_keep)

tokenized_reviews = get_tokens(df_filtered, stem = False, negation = False)
tokenized_pos = get_pos(tokenized_reviews)
tdm = get_tdm(tokenized_reviews)
vsm = normalize_tdm(tdm)
tf_idf = get_tf_idf_transform(tokenized_reviews, tdm, n_reviews)


similarity = vsm #vsm # tdm

similarity_matrix = pd.DataFrame(get_similarity_matrix(similarity, tokenized_reviews), columns = get_all_terms(tokenized_reviews))

similarity_matrix[:10]

progress:  25.062656641604008 %
progress:  50.125313283208015 %
progress:  75.18796992481202 %


Unnamed: 0,thats,incoming,comes,saying,publican,decision,specifically,helping,waster,anything,...,afterwale,sapphire,worked,worst,promise,like,owners,wifiinternational,usually,leather
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.204124,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.102062,0.0,0.0,0.0,0.0


In [28]:
kmeans = KMeans(n_clusters=int(round(math.sqrt(n_reviews),0)), random_state=0).fit(similarity_matrix)
clusters=kmeans.labels_.tolist()

# clustered_matrix = pd.DataFrame(tf_idf_matrix, clusters)
clustered_matrix = similarity_matrix.copy()
clustered_matrix['cluster'] = clusters
clustered_matrix['id_col'] = to_keep

display(len(clustered_matrix))
display(clustered_matrix[:5])

top_clusters = pd.DataFrame(clustered_matrix.cluster.value_counts())
display(top_clusters)


399

Unnamed: 0,thats,incoming,comes,saying,publican,decision,specifically,helping,waster,anything,...,worked,worst,promise,like,owners,wifiinternational,usually,leather,cluster,id_col
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.204124,0.0,0.0,0.0,0.0,4,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13,4
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,5


Unnamed: 0,cluster
3,88
10,42
11,33
15,30
2,25
5,21
1,21
4,19
12,17
19,14


In [29]:
limit = top_clusters.cluster.quantile(0.3)
cluster_filter = top_clusters[top_clusters.cluster > limit]

display(cluster_filter)

list(cluster_filter.index)

Unnamed: 0,cluster
3,88
10,42
11,33
15,30
2,25
5,21
1,21
4,19
12,17
19,14


[3, 10, 11, 15, 2, 5, 1, 4, 12, 19, 0, 16, 6, 7]

In [30]:
df_filtered["cluster"] = list(clustered_matrix.cluster)
df_filtered[:3]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name,cluster
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",13
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",4
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",18


In [31]:
to_keep = list(df_filtered[df_filtered.cluster.isin(list(cluster_filter.index))].id_new_col)
to_keep += list(df[df.id_col.isin(list(set(test.review_id)))].id_new_col) # this are the reviews annotated for test
to_keep = list(set(to_keep))

df_filtered = df_filtered[df_filtered.id_new_col.isin(to_keep)]
df_filtered[:3]

Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name,cluster
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",13
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",4
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",18


In [32]:

def filter_with_characteristics(df_filtered, characteristics):
    tokenized_reviews = get_tokens(df_filtered, stem = False, negation = False)
    to_keep_in = []
    j = 0
    
    for i in df_filtered.id_col: 
        for token in tokenized_reviews[j]:
            if token in characteristics:
                to_keep_in.append(i)
                break
        
        j+=1
        
    return to_keep_in
                
to_keep_in = filter_with_characteristics(df_filtered, characteristics)
len(to_keep_in)   
    
    

progress:  26.666666666666668 %
progress:  53.333333333333336 %
progress:  80.0 %


240

In [33]:

to_keep_in += list(set(test.review_id)) # this are the reviews annotated for test
df_filtered = df_filtered[df_filtered.id_col.isin(to_keep_in)]
df_filtered[:3]

Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name,cluster
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",13
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",4
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",18


In [34]:
positive_exceptions = ["high", "surprised"] # wordnet have it as negative, should be positive
negative_exceptions = ["old"] # wordnet have it as positive, should be negative. 
ignore_exceptions = ["old", "new", "unlocked", "normal"]
ignore_exceptions += colors
word_exceptions = ["missing", "broken", "love", "awesome", "cool"] # They are not tagged as JJ sometimes, they should.


def compute_score(word, word_neg):
    if word in ignore_exceptions: 
        return 0
    
    if word in positive_exceptions:
        if word_neg.find("_NEG") == -1:
            return 1
        else:
            return -1
        
    if word in negative_exceptions:
        print(word)
        if word_neg.find("_NEG") == -1:
            return -1
        else:
            return 1
        
    word2 = ''.join([word,".a.01"])
    try:
        pos_score = swn.senti_synset(word2).pos_score()
        neg_score = swn.senti_synset(word2).neg_score()
    except:
        if word in pos_list:
            pos_score = 1
            neg_score = 0
        elif word in neg_list:
            pos_score = 0
            neg_score = 1
        else:
            return 0
    
    if pos_score > neg_score:
        if word_neg.find("_NEG") == -1:
            return 1
        else:
            
            return -1
    elif neg_score > pos_score:
        if word_neg.find("_NEG") == -1:            
            return -1
        else:
            
            return 1   
    else:
        if word in pos_list:
            return 1
        elif word in neg_list:
            return -1
        else:
            return 0

    
def extract_characteristic_opinion_words(review, review_neg, max_opinion_words = 2, max_distance = 5, use_distance = False):
    review_charactetistics_sentiment = defaultdict(list) 
    i = 0
    
    temp_review = []
    for word in review: 
        word = word + ("free",)
        temp_review.append(list(word))
            
    for i in range(len(review)):
        if review[i][0] in characteristics:
            keep_forward = True
            keep_backward = True
            opinion_words = 0
            
            for j in range(1,max_distance+1):
                
                if  i+j >= len(review):
                    keep_forward = False
                if keep_forward:
                    if  review[i+j][0] in characteristics or opinion_words >= max_opinion_words:
                        keep_forward = False

                    elif i+j < len(review) and (review[i+j][1] in ["JJ", "JJR", "JJS"] or review[i+j][0] in word_exceptions) and temp_review[i+j][2] == "free":
                        sentiment = defaultdict(int)
                        score = compute_score(review[i+j][0], review_neg[i+j][0])                   
                        if score == 0: continue

                        if use_distance:
                            distance = j
                        else:
                            distance = 1

                        sentiment[review[i+j][0]] = (score,distance)
                        review_charactetistics_sentiment[review[i][0]].append(sentiment)
                        temp_review[i+j][2] = "used"
                        opinion_words +=1
                
                
                if  i-j < 0:
                    keep_backward = False
                if keep_backward:
                    if  review[i-j][0] in characteristics or opinion_words >= max_opinion_words:
                        keep_backward = False

                    elif i-j > -1 and (review[i-j][1] in ["JJ", "JJR", "JJS"] or review[i-j][0] in word_exceptions) and temp_review[i-j][2] == "free":
                        sentiment = defaultdict(int)
                        score = compute_score(review[i-j][0], review_neg[i-j][0])         

                        if score == 0: continue

                        if use_distance:
                            distance = j
                        else:
                            distance = 1

                        sentiment[review[i-j][0]] = (score,distance)

                        review_charactetistics_sentiment[review[i][0]].append(sentiment)
                        temp_review[i-j][2] = "used"  
                        opinion_words +=1
                
                if not keep_forward and not keep_backward:
                    break
    
    return review_charactetistics_sentiment


def consolidate_score(characteristic_dict):
    num = 0
    den = 0
    
    for opinion in characteristic_dict:
        for k, v in opinion.items():
            num += v[0]/v[1]
            den += 1/v[1]

    return num/den


def compute_sentiment_scores(tokenized_pos, tokenized_pos_neg, max_distance = 5, use_distance = True):
    
    if len(tokenized_pos) != len(tokenized_pos_neg):
        print("FATAL ERROR: Different lenght between tokenized_pos and tokenized_pos_neg")
        return null
    
    else:
        
        reviews_sentiment_scores = []        
        
        for i in range(len(tokenized_pos)):
            review_sentiment_score = defaultdict(int)
            
            review_characteristics_opinion_words = extract_characteristic_opinion_words(tokenized_pos[i], tokenized_pos_neg[i], max_distance = max_distance, use_distance = use_distance)
            
            for k, v in review_characteristics_opinion_words.items():
                review_sentiment_score[k] = consolidate_score(v)
                
            reviews_sentiment_scores.append(review_sentiment_score)
            
        return reviews_sentiment_scores

    
def get_NN_count(tokenized_pos):
    NN_count = []
    
    for review in tokenized_pos:
        review_NN_count = 0
        for token in review: 
            if token[1] in ["NN", "NNS", "NNP"] or token[0] in characteristics:
                review_NN_count += 1
        NN_count.append(review_NN_count)
    
    return NN_count


In [35]:
tokenized_reviews = get_tokens(df_filtered, stem = False, negation = False)
tokenized_pos = get_pos(tokenized_reviews)

tokenized_reviews_neg = get_tokens(df_filtered, stem = False, negation = True)
tokenized_pos_neg = get_pos(tokenized_reviews_neg)

NN_count = get_NN_count(tokenized_pos)

df_filtered['new_id'] = range(0, len(df_filtered))

progress:  35.2112676056338 %
progress:  70.4225352112676 %
progress:  35.2112676056338 %
progress:  70.4225352112676 %


In [36]:

lookup_product_id = 7

for val in df_filtered[df_filtered.new_id == lookup_product_id]["Review"]: print(val)
display(tokenized_pos[lookup_product_id])

review_characteristics_opinion_words = extract_characteristic_opinion_words(tokenized_pos[lookup_product_id], tokenized_pos_neg[lookup_product_id], max_distance = 5, use_distance = True)               
display(review_characteristics_opinion_words)



This phone in an excellent phone at a great price. I was impressed with the features of this phone and would recommend this to anyone.


[('phone', 'NN'),
 ('excellent', 'JJ'),
 ('phone', 'NN'),
 ('great', 'JJ'),
 ('price', 'NN'),
 ('impressed', 'VBD'),
 ('features', 'NNS'),
 ('phone', 'NN'),
 ('would', 'MD'),
 ('recommend', 'VB'),
 ('anyone', 'NN')]

defaultdict(list, {'excellent': [defaultdict(int, {'great': (1, 2)})]})

In [37]:
review_sentiment_scores = compute_sentiment_scores(tokenized_pos, tokenized_pos_neg, max_distance = 5, use_distance = True)
review_sentiment_scores[:6]

df_filtered["Sentiments"] = list(review_sentiment_scores)
df_filtered["NN_count"] = list(NN_count)
df_filtered[:3]


Unnamed: 0,Product,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name,cluster,new_id,Sentiments,NN_count
53,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",13,0,{},1
69,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",4,1,{'WiFi': 1.0},14
71,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.0,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",18,2,{},27


In [38]:
# The initial format of he annotated test_set is difficult to read
# as a dataframe, transformation to .csv format is computed first 
# with regular expressions.

test = open('data/annotated_test_set.txt','r', encoding='utf8')
test_file = test.read()
test.close()
test_file[:200]

test_file = re.sub(r"{[^{}]+}", lambda x: x.group(0).replace(",", ";"), test_file)
test_file = test_file.replace(';', "%")
test_file = test_file.replace(',', ";")
test_file = test_file.replace('%', ",")
test_file = test_file.replace('{', "{'")
test_file = test_file.replace(',', ",'")
test_file = test_file.replace(':', "':")
test_file = test_file.replace("},'", "}")

# Once fixed, save and load:
text_file = open("data/annotated_test_set_corrected.csv", "w")
for row in test_file.split(",\n"):
    text_file.write(row)
    text_file.write("\n")
text_file.close()


test = open('data/annotated_test_set_corrected.csv','r', encoding='utf8')
test_file = test.read()
test.close()

test = pd.read_csv('data/annotated_test_set_corrected.csv', delimiter = ";")
test.columns = ['review_id', 'Product', 'Sentiments_test']


In [39]:
test[:3]

Unnamed: 0,review_id,Product,Sentiments_test
0,1540,BlackBerry Curve,"{'Trackball':-1,'Battery':-1,'Micro-SD':-1}"
1,1554,Acer Liquid E700 TRIO,"{'Camera':-1,'Hardware':-1,'Buttons':-1}"
2,1697,Alcatel OneTouch,"{'Hardware':-1,'Charging Port':-1}"


In [40]:
df_merge = pd.merge(df_filtered, test, left_on='id_col', right_on='review_id', how = "left")
df_merge[df_merge.Sentiments_test.isnull()==False]


Unnamed: 0,Product_x,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name,cluster,new_id,Sentiments,NN_count,review_id,Product_y,Sentiments_test
0,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,muy buen producto,0.0,53,1,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",13,0,{},1,53.0,Asha 302,"{'sound': 1,' smart phone features': 1,' soft..."
1,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,Nokia Asha 302 Unlocked GSM Phone with 3.2MP C...,13.0,69,2,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",4,1,{'WiFi': 1.0},14,69.0,Asha 302,"{'build': 1,' keyboard': 1,'sound': 1,' Xpres..."
2,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,1,"Hola, compramos dos teléfonos y vienieron tota...",2.0,71,3,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",18,2,{},27,71.0,Asha 302,"{'build': 1,' reception': 1,' audio': 1,' key..."
3,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,GRACIAS ME LLEGO EL PROCTO QUE COMPRE Y LLEVO ...,0.0,73,4,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",13,3,{},8,73.0,Asha 302,{'apps': 1}
4,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,4,"The keys are a little hard to hit, and I didn'...",0.0,75,5,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",11,4,{'didnt': -0.24999999999999994},5,75.0,Asha 302,"{'SMS': 1,' rings': 1,' body': 1,' freezes': -1}"
5,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,I bought this phone as a Christmas present for...,3.0,78,6,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",4,5,"{'amazon': -0.14285714285714285, 'features': 0...",57,78.0,Asha 302,{'ring tones': 1}
6,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,4,The Phone is pretty good. I am using it with a...,2.0,79,7,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",11,6,{'plan': -0.20000000000000004},12,79.0,Asha 302,"{'wi-fi': 1,' calendar': 1,' alarm clock': 1,..."
7,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,This phone in an excellent phone at a great pr...,1.0,82,8,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",3,7,{'excellent': 1.0},7,82.0,Asha 302,{'price': 1}
8,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,4,This is a good phone although it seems to have...,1.0,84,9,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",11,8,{'screen': -0.3333333333333333},8,84.0,Asha 302,"{'time': -1,' support': -1,' booklet': -1}"
9,"""Nokia Asha 302 Unlocked GSM Phone with 3.2MP ...",Nokia,299.00,5,I've been a long time user of the iPhone. I fi...,2.0,85,10,45,"""Nokia Asha 302 GSM with 3.2MP Video, QWERTY W...",10,9,"{'fits': 1.0, 'look': 1.0, 'etc': 1.0}",45,85.0,Asha 302,"{'screen': -1,' calling': 1,' messaging': 1,'..."


In [41]:
lookup = 1540

for val in df_merge[df_merge.id_col == lookup].Review:
    print(val)
    
df_merge[df_merge.id_col == lookup]

I recevied the phone with broken trackball, missing micro-sd and missing battery.The seller claimed that it is 100% working. i cannot see how such a phone can be workingwithout the internal sd and battery. It claimed that it is OEM and brand new.My obervations indicated this was a poorly attempted refurbished phone. They must berunnin out of second handed parts.


Unnamed: 0,Product_x,Brand,Price,Rating,Review,Votes,id_col,id_new_col,cluster_name,Standard_Product_Name,cluster,new_id,Sentiments,NN_count,review_id,Product_y,Sentiments_test
18,8330 BlackBerry Curve (US Cellular) Titanium P...,,29.95,1,"I recevied the phone with broken trackball, mi...",4.0,1540,21,402,8330 BlackBerry Curve Cellular) Titanium,3,18,"{'trackball': -1.0, 'microsd': -1.0}",15,1540.0,BlackBerry Curve,"{'Trackball':-1,'Battery':-1,'Micro-SD':-1}"


In [42]:
def characteristics_extraction_performance(NN_count, training, test):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    temp_test = []
    test = eval(test)
    
    for test_characteristic in test.keys():
        test_characteristic = str(test_characteristic).lower()
        test_characteristic = re.sub(r'[^A-Za-z /.]','',test_characteristic)
        temp_test.append(test_characteristic)
            
        if test_characteristic in training.keys():
            TP += 1
        else: 
            FN += 1
            
    TN = NN_count - len(training.keys()) - FN
        
    for train_characteristic in training.keys():
        if train_characteristic not in temp_test:
            FP += 1
    
    return TP, TN, FP, FN
    

def compute_characteristics_extraction_performance(df_merge):
    total_TP = 0
    total_TN = 0
    total_FP = 0
    total_FN = 0
    
    for i in range(len(df_merge)):
        NN_count = df_merge.NN_count[i]
        training = df_merge.Sentiments[i]
        test = df_merge.Sentiments_test[i]
        if pd.isnull(test): continue
        TP, TN, FP, FN = characteristics_extraction_performance(NN_count, training, test)
        
        total_TP += TP
        total_TN += TN
        total_FP += FP
        total_FN += FN
        
    if total_TP + total_FP == 0:
        TPR_RECALL = 0
    else:
        TPR_RECALL =  total_TP / (total_TP + total_FP)
        
    TNR_SPECIFICITY = total_TN / (total_TN + total_FN)
    F1_Score = 2* total_TP / (2*total_TP + total_FP + total_FN)
    Accuracy = (total_TP + total_TN) / (total_TP + total_TN + total_FP + total_FN)
    fpr = total_FP / (total_FN + total_FP)
    
    return TPR_RECALL, TNR_SPECIFICITY, F1_Score, Accuracy, fpr
    
    

In [43]:
Recall, Specificity, F1_Score, Accuracy, fpr= compute_characteristics_extraction_performance(df_merge)

print("Recall: ", Recall)
print("Specificity: ", Specificity)
print("F1_Score: ", F1_Score)
print("Accuracy: ", Accuracy)

Recall:  0.05128205128205128
Specificity:  0.7784810126582279
F1_Score:  0.0273972602739726
Accuracy:  0.723196881091618


In [44]:
def characteristics_sentiment_performance(training, test):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    test = eval(test)
    
    for test_characteristic, test_score in test.items():
        test_characteristic = str(test_characteristic).lower()
        test_characteristic = re.sub(r'[^A-Za-z /.]','',test_characteristic)
                        
        if test_characteristic in training.keys():
            if test_score == training[test_characteristic]:
                if test_score > 0:
                    TP += 1
                else:
                    TN += 1
            else:
                if test_score > 0:
                    FN += 1
                else:
                    FP += 1            
        else: 
            continue    

    return TP, TN, FP, FN


def compute_characteristics_sentiment_performance(df_merge):
    total_TP = 0
    total_TN = 0
    total_FP = 0
    total_FN = 0
    cases = 0
    
    for i in range(len(df_merge)):
        training = df_merge.Sentiments[i]
        test = df_merge.Sentiments_test[i]
        if pd.isnull(test): continue
        TP, TN, FP, FN = characteristics_sentiment_performance(training, test)
        if TP+ TN+ FP+ FN > 0:
            cases+=1            
        
        total_TP += TP
        total_TN += TN
        total_FP += FP
        total_FN += FN
        
    if total_TP + total_FP == 0:
        TPR_RECALL = 0
    else:
        TPR_RECALL =  total_TP / (total_TP + total_FP)
        
    TNR_SPECIFICITY = total_TN / (total_TN + total_FN)
    F1_Score = 2* total_TP / (2*total_TP + total_FP + total_FN)
    Accuracy = (total_TP + total_TN) / (total_TP + total_TN + total_FP + total_FN)
    fpr = total_FP / (total_FN + total_FP)
    
    return TPR_RECALL, TNR_SPECIFICITY, F1_Score, Accuracy, cases
    


In [45]:
Recall, Specificity, F1_Score, Accuracy, cases= compute_characteristics_sentiment_performance(df_merge)

print("Reviews Evaluated: ", cases)
print("Recall: ", Recall)
print("Specificity: ", Specificity)
print("F1_Score: ", F1_Score)
print("Accuracy: ", Accuracy)


Reviews Evaluated:  5
Recall:  0.5
Specificity:  1.0
F1_Score:  0.6666666666666666
Accuracy:  0.8333333333333334
