In [10]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from nltk.corpus import stopwords
import nltk
# nltk.download('wordnet')
# nltk.download('sentiwordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
# Lexicon of tokens treated as negation cues. Membership is tested against
# lower-cased tokens elsewhere in this notebook.
NEGATE = {
    # Contractions, each listed without and with the apostrophe.
    "aint", "ain't",
    "arent", "aren't",
    "cant", "can't",
    "couldnt", "couldn't",
    "darent", "daren't",
    "didnt", "didn't",
    "doesnt", "doesn't",
    "dont", "don't",
    "hadnt", "hadn't",
    "hasnt", "hasn't",
    "havent", "haven't",
    "isnt", "isn't",
    "mightnt", "mightn't",
    "mustnt", "mustn't",
    "neednt", "needn't",
    "oughtnt", "oughtn't",
    "shant", "shan't",
    "shouldnt", "shouldn't",
    "wasnt", "wasn't",
    "werent", "weren't",
    "wont", "won't",
    "wouldnt", "wouldn't",
    # Bare clitic and interjections.
    "n't",
    "uhuh", "uh-uh",
    # Stand-alone negating words.
    "cannot",
    "neither",
    "never",
    "none",
    "nope",
    "nor",
    "not",
    "nothing",
    "nowhere",
    "without",
    "rarely",
    "seldom",
    "despite",
}

# Load the positive and negative word lists, one list entry per line of the
# file. The `with` blocks guarantee each handle is closed even if the read
# raises part-way through.
with open('positive-words.txt', 'r') as positive_file:
    POSITIVE = positive_file.read().splitlines()

with open('negative-words.txt', 'r') as negative_file:
    NEGATIVE = negative_file.read().splitlines()



In [6]:
# Read the review dump (lines=True: one JSON record per line), discard rows
# containing missing values, then remove the columns listed below.
# NOTE(review): dropna() runs before the column drop, so a row with a missing
# value in ANY column - including the ones removed right after - is discarded.
# Confirm that is intended. The index is not reset, so row labels may have gaps.
data_frame = (
    pd.read_json(r'reviews_Automotive_5.json', lines=True)
    .dropna()
    .drop(
        columns=[
            'asin',
            'helpful',
            'reviewTime',
            'reviewerID',
            'reviewerName',
            'summary',
            'unixReviewTime',
        ]
    )
)

In [52]:
def negated(input_words, include_nt=True):
    """
    Report whether any token in ``input_words`` signals negation.

    A token counts when its lower-cased form is a member of ``NEGATE``.
    When ``include_nt`` is true, a token that merely contains the
    substring "n't" (after lower-casing) counts as well.

    Returns True on the first match, otherwise False.
    """
    negation_lexicon = NEGATE

    # First pass: exact lexicon membership.
    for token in input_words:
        if token.lower() in negation_lexicon:
            return True

    if not include_nt:
        return False

    # Second pass: any token carrying an "n't" fragment.
    for token in input_words:
        if "n't" in token.lower():
            return True
    return False

def negated_word(word, include_nt=True):
    """
    Report whether a single token is a negation cue.

    The token is lower-cased before comparison so that capitalised forms
    ("Not", "NEVER", "N'T") are recognised, matching the case-insensitive
    behaviour of ``negated``.

    Parameters:
        word: the token to test (a string).
        include_nt: when true, the bare token "n't" is accepted explicitly.

    Returns:
        True if the token is "n't" (with ``include_nt``) or a member of
        ``NEGATE``; False otherwise.
    """
    token = word.lower()
    if include_nt and token == "n't":
        return True
    return token in NEGATE

def get_negation_statistic(data_frame):
    """
    Gather negation counts over every entry of ``data_frame['reviewText']``.

    Each review is tokenised with ``word_tokenize``. A '.' token closes the
    current sentence; negation tokens (per ``negated_word``) seen since the
    previous '.' are attributed to that sentence. The running count restarts
    for every review, so tokens after a review's last '.' are not tallied.

    Returns a 5-tuple:
        (total negation tokens in negated sentences,
         number of sentences containing at least one negation,
         total number of sentences,
         average negations per negated sentence (0.0 when there are none),
         largest negation count seen in a single sentence)
    """
    total_negation_count = 0
    total_negation_sentence_count = 0
    max_negation_count = 0
    total_sentence_count = 0

    for review in data_frame['reviewText'].tolist():
        sentence_negation_count = 0
        for token in word_tokenize(review):
            if negated_word(token):
                sentence_negation_count += 1
            if token != '.':
                continue
            # Sentence boundary reached.
            total_sentence_count += 1
            if sentence_negation_count:
                max_negation_count = max(max_negation_count, sentence_negation_count)
                total_negation_count += sentence_negation_count
                total_negation_sentence_count += 1
                sentence_negation_count = 0

    # Guard the average: with no negated sentence the division would raise.
    if total_negation_sentence_count:
        average_negation_count = total_negation_count / total_negation_sentence_count
    else:
        average_negation_count = 0.0

    return (
        total_negation_count,
        total_negation_sentence_count,
        total_sentence_count,
        average_negation_count,
        max_negation_count,
    )
    
# NLTK's English stop-word list, stored as a set; get_sentiment tests
# lower-cased tokens for membership in it.
stop_words = set(stopwords.words('english'))
def get_sentiment(sentence_token):
    """
    Locate sentiment-bearing tokens in a tokenised sentence.

    Every token is lower-cased. Tokens found in ``stop_words`` or ``NEGATE``
    are skipped. Of the rest, a token in ``NEGATIVE`` is recorded as negative;
    otherwise a token in ``POSITIVE`` is recorded as positive.

    Returns ``(positive_index, negative_index)``: two lists holding the
    positions of the matching tokens within ``sentence_token``.
    """
    positive_hits = []
    negative_hits = []
    for position, raw_token in enumerate(sentence_token):
        token = raw_token.lower()
        if token in stop_words or token in NEGATE:
            continue
        # The negative lexicon is consulted first, so it wins on overlap.
        if token in NEGATIVE:
            negative_hits.append(position)
        elif token in POSITIVE:
            positive_hits.append(position)
    return positive_hits, negative_hits


def get_negation(sentence_token):
    """
    Find the negation tokens of a tokenised sentence.

    Each token is lower-cased and passed to ``negated_word``.

    Returns ``(positions, count)``: the list of positions in
    ``sentence_token`` whose token is a negation, and that list's length.
    """
    negation_positions = [
        position
        for position, token in enumerate(sentence_token)
        if negated_word(token.lower())
    ]
    return negation_positions, len(negation_positions)
    
def is_closest_sentiment_positive(value, pos_list, neg_list):
    """
    Classify the nearest sentiment word that follows position ``value``.

    Parameters:
        value: reference position (e.g. the index of a negation token).
        pos_list: positions of positive sentiment words.
        neg_list: positions of negative sentiment words.

    Returns:
        1  if the closest sentiment position greater than ``value`` is positive,
        0  if it is negative,
        -1 if no sentiment position lies after ``value``.
    """
    # Nearest following position on each side; None when nothing follows.
    next_positive = min((item for item in pos_list if item > value), default=None)
    next_negative = min((item for item in neg_list if item > value), default=None)

    if next_positive is None and next_negative is None:
        return -1
    if next_positive is None:
        # Only negative words follow: this is a negative hit, not "none".
        return 0
    if next_negative is None:
        return 1
    return 0 if next_negative < next_positive else 1

def preprocess_sentence(sentence):
    """
    Turn one sentence into a flat feature list.

    The sentence is tokenised, then scanned for sentiment words
    (``get_sentiment``) and negation words (``get_negation``).

    Returns None when the sentence holds no sentiment word. Otherwise returns
    ``[negation count, sentiment count, positive count, negative count]``
    followed, for every negation token, by its position and the result of
    ``is_closest_sentiment_positive`` for that position.
    """
    tokens = nltk.word_tokenize(sentence)
    positive_positions, negative_positions = get_sentiment(tokens)
    negation_positions, negation_total = get_negation(tokens)

    positive_total = len(positive_positions)
    negative_total = len(negative_positions)
    sentiment_total = positive_total + negative_total
    if sentiment_total == 0:
        return None

    features = [negation_total, sentiment_total, positive_total, negative_total]
    for position in negation_positions:
        polarity = is_closest_sentiment_positive(
            position, positive_positions, negative_positions
        )
        features.extend((position, polarity))
    return features

def preprocess_review(review):
    """
    Build the feature lists for every sentence of a review.

    The review is split with ``sent_tokenize`` and each sentence is passed to
    ``preprocess_sentence``. Sentences for which that returns None are left out.

    Returns the list of per-sentence feature lists, in sentence order.
    """
    candidates = (
        preprocess_sentence(sentence) for sentence in sent_tokenize(review)
    )
    return [vector for vector in candidates if vector is not None]

In [53]:
# Show one raw review, then vectorise a hand-written sample sentence; the
# cell displays the value of the final expression.
# NOTE(review): [1] is resolved against the index left after dropna(), which
# is not reset - presumably label 1; confirm that row survives the filtering.
print(data_frame['reviewText'][1])
preprocess_sentence("Package arrived a bit ragged but it was all there, easy to install, takes less than 5 minutes, works as described")

These long cables work fine for my truck, but the quality seems a little on the shabby side. For the money I was not expecting 200 dollar snap-on jumper cables but these seem more like what you would see at a chinese knock off shop like harbor freight for 30 bucks.


[0, 3, 2, 1]

In [54]:
# Vectorise a whole review: one feature list per sentence that contains a
# sentiment word.
# NOTE(review): [21] is looked up against the post-dropna() index, which is
# not reset - confirm that entry exists.
print(preprocess_review(data_frame['reviewText'][21]))

[[0, 2, 2, 0], [0, 1, 0, 1]]
