In [0]:
import csv
import json
import os
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
import nltk
from textblob import TextBlob
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# read data
input_review_file = "Amazon_Unlocked_Filtered.csv"

product_name_with_review_dict = dict()

with open(input_review_file) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_index = 0
    for row in csv_reader:
        if line_index != 0 and len(row) > 0:
            if len(row[0]) > 3:
                if row[0] not in product_name_with_review_dict.keys():
                    product_name_with_review_dict[row[0]] = [row[4]]
                else:
                    product_name_with_review_dict[row[0]].append(row[4])
        line_index += 1

In [0]:
en_stop = set(nltk.corpus.stopwords.words('english'))
custom_stop_words = ['phone', 'work', 'works', 'could', 'would', 'great', 'iphone', 'samsung']
en_stop |= set(custom_stop_words)

In [15]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
def remove_non_ascii(text):
    """

    :param text: takes input as a text
    :return: replace all the non-ascii character from the input text
    """
    return ''.join([i if ord(i) < 128 else ' ' for i in text])


# creating tokens from the given string text
def tokenize(text):
    text = remove_non_ascii(text)
    lda_tokens = word_tokenize(text.lower())
    return lda_tokens


# converting words to their lemmatized form
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma


# converting each sentence to its respective tokens
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return remove_non_ascii(' '.join(tokens))

In [0]:
def topic_modelling_and_sentiment_score(product_name, reviews):
    total_words_count = sum(len(review.split()) for review in reviews)

    print product_name
    print "\n"
    if len(reviews) > 10:
        max_df = 0.5
        min_df = 0.02
    else:
        max_df = 1.0
        min_df = 1
    vec = TfidfVectorizer(max_df=max_df, min_df=min_df).fit(reviews)
    bag_of_words = vec.transform(reviews)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    top_topics_tuple = words_freq[:20]
    
    print top_topics_tuple
    print "\n"

    review_with_sentiment_dict = dict()
    for review in reviews:
        review_with_sentiment_dict[review] = TextBlob(review).sentiment.polarity

    topic_sentiment = dict()
    topic_counts = 0
    for topic, _ in top_topics_tuple:
        for review in reviews:
            if topic in review:
                if topic not in topic_sentiment.keys():
                    topic_sentiment[topic] = review_with_sentiment_dict[review]
                else:
                    topic_sentiment[topic] += review_with_sentiment_dict[review]

            topic_counts += sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(topic), review))

        print topic + " - RATIO in all words %f ", float(topic_counts) / float(total_words_count)
        print topic + " - RATIO in all REVIEWS %f ", float(topic_counts) / float(len(reviews))

        topic_sentiment[topic] = float(topic_sentiment[topic]) / float(len(reviews))
        print topic + " - Sentiment %f ", topic_sentiment[topic]

    print("\n")

In [20]:
product_name_with_reviews = dict()
for key, values in product_name_with_review_dict.iteritems():
    for line in values:
        sentence = prepare_text_for_lda(line)
        if len(sentence) > 2:
            if key not in product_name_with_reviews.keys():
                t_name_with_reviews[key] = [sentence]
            else:
                product_name_with_reviews[key].append(sentence)

    topic_modelling_and_sentiment_score(key, product_name_with_reviews[key])

NameError: ignored