# Text data feature extraction

# Data preperation
# TODO modify text description of the steps
# TODO parametrize
# TODO refactor

We will use a dataset consisting of baby product reviews on Amazon.com.

In [270]:
USE_MY_METHOD = True
USE_STOP_WORDS = True
USE_EMOTICONS = True
USE_NEGATION = True
USE_BOOLEAN_REPRESENTATION = True # If set to false, number of occurances of words is calculated

In [271]:
import pandas as pd

In [272]:
products = pd.read_csv("../valt_sa_data/amazon_baby.csv")[['review', 'rating']]

In [273]:
products = products[0:1000]

In [274]:
products

Unnamed: 0,review,rating
0,"These flannel wipes are OK, but in my opinion ...",3
1,it came early and was not disappointed. i love...,5
2,Very soft and comfortable and warmer than it l...,5
3,This is a product well worth the purchase. I ...,5
4,All of my kids have cried non-stop when I trie...,5
5,"When the Binky Fairy came to our house, we did...",5
6,"Lovely book, it's bound tightly so you may not...",4
7,Perfect for new parents. We were able to keep ...,5
8,A friend of mine pinned this product on Pinter...,5
9,This has been an easy way for my nanny to reco...,4


## Build the word count vector for each review

Let us explore a specific example of a baby product.

In [275]:
products.iloc[9]

review    This has been an easy way for my nanny to reco...
rating                                                    4
Name: 9, dtype: object

Now, we will perform 2 simple data transformations:

1. Remove punctuation using Python's built-in string functionality.
2. Transform the reviews into word-counts.

In [276]:
emoticons = [':)', ':))', ':)))', ':(', ':((', ':(((']

def extract_emoticons(text):
    emoticons_in_text = []
    for emoticon in emoticons:
        i = text.find(emoticon)
        if i > -1:
            emoticons_in_text.append(emoticon)
    return emoticons_in_text

In [277]:
punctuation_to_remove = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'

def remove_punctuation(text):
    return text.translate(None, punctuation_to_remove)

pos_dict = {
    'NN': 'n', 'VB': 'v', 'VBD': 'v', 'VBG': 'v', 
    'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
    'JJ': 'a', 'JJR': 'a', 'JJS': 'a', 'JJT': 'a'
}

def get_pos_for_lemmatirzer(brown_post):
    if not brown_post in pos_dict:
        return 'n'
    else:
        return pos_dict[brown_post]



In [278]:
import nltk
from nltk.stem import WordNetLemmatizer

# TODO modify and use this list
# Or just ignore it
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 
 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 
 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 
 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 
 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 
 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 
 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
 'nor', 'not', "n't", 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
 'will', 'just', 'don', 'should', 'now']

my_words_list = []
my_words = set()

def analyze_review(text):
    global my_words
    if USE_EMOTICONS:
        emoticons_features = extract_emoticons(text)
    else:
        emoticons_features = []
    text_without_punctuation = remove_punctuation(text)
    tokens = nltk.word_tokenize(text_without_punctuation)
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    
    tokens_prepared_for_lemmatization = [(t[0], get_pos_for_lemmatirzer(t[1])) for t in tagged_tokens]
    
    new_words = []
    not_count = 0
    n_count = 0
    
    for tpl in tokens_prepared_for_lemmatization:
        current_word = lemmatizer.lemmatize(tpl[0], tpl[1])
        if n_count > 2:
            not_count = 0
            n_count = 0
        if current_word == 'not' or current_word == "n't":
                not_count += 1
        elif (not USE_STOP_WORDS) or (not current_word.lower() in stop_words):
            if USE_NEGATION and not_count % 2 == 1:
                current_word = 'NOT_' + current_word
                n_count += 1
            new_words.append(current_word)
        
    lemmas = new_words
    
    filtered_lemmas = []
    for lemma in lemmas:
        lemma_l = lemma.lower()
        filtered_lemmas.append(lemma_l)
    
    words = filtered_lemmas + emoticons_features
    for word in words:
        if not word in ['review', 'rating']:
            my_words_list.append(word)
            my_words.add(word)
    return words

In [279]:
analyzed_reviews = products['review'].apply(str).apply(analyze_review)

from collections import Counter

count = Counter(my_words_list)
most_common_words = map(lambda x: x[0], count.most_common(100))
print most_common_words

['diaper', 'use', "'s", u'bag', 'one', 'get', 'baby', 'champ', 'love', 'buy', 'easy', 'great', u'month', 'would', 'time', 'old', 'product', 'like', 'smell', 'pail', 'change', 'keep', 'son', u'make', 'put', 'genie', 'work', 'little', u'go', 'first', 'also', 'tub', u'sheet', u'take', 'odor', 'still', 'really', 'year', 'recommend', u'thing', u'good', u'try', 'well', 'toy', 'much', 'child', 'daughter', 'need', u'find', 'open', '2', 'even', 'crib', 'give', 'think', 'two', 'could', 'regular', 'trash', 'hold', u'come', "'ve", 'purchase', 'book', 'back', u'start', 'never', u'week', 'since', u'say', 'problem', u'look', 'way', "'m", 'day', 'every', 'new', 'top', 'cover', 'garbage', 'side', u'hand', 'clean', 'ca', 'another', 'play', 'room', 'big', '3', u'want', 'around', 'plastic', 'hard', u'know', 'inside', 'small', 'sling', 'kitchen', 'fit', 'dirty']


In [280]:
print len(my_words)

5998


In [281]:
if USE_MY_METHOD:
    if USE_EMOTICONS:
        significant_words = most_common_words + emoticons # list(my_words) #[0:500]
    else:
        significant_words = most_common_words
else:
    significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
                         'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
                         'work', 'product', 'money', 'would', 'return']
        
def count_number_of_significant_words(text):
    words = text['review']
    word_dict = {}
    for word in significant_words:
        word_dict[word] = 0
    for word in words:
        if word in significant_words:
            if not word in word_dict:
                word_dict[word] = 1
            else:
                if USE_BOOLEAN_REPRESENTATION:
                    word_dict[word] = 1
                else:
                    word_dict[word] = word_dict[word] + 1
    significant_words_counts = []
    for word in significant_words:
        significant_words_counts.append(word_dict[word]) 
    return pd.Series(significant_words_counts, index=significant_words)


newcols = pd.DataFrame(analyzed_reviews).apply(count_number_of_significant_words, axis=1)
newcols.columns = significant_words

products_with_words = products.join(newcols)

Now, let us explore what the sample example above looks like after these 2 transformations.

In [282]:
products_with_words.iloc[9]

review     This has been an easy way for my nanny to reco...
rating                                                     4
diaper                                                     0
use                                                        0
's                                                         0
bag                                                        0
one                                                        1
get                                                        0
baby                                                       1
champ                                                      0
love                                                       0
buy                                                        0
easy                                                       1
great                                                      0
month                                                      0
would                                                      0
time                    

## Save prepared data into a file

In [283]:
X = products_with_words[significant_words]
y = products_with_words['rating']
X.to_csv('../valt_sa_data/x_m.csv', index=False)
y.to_csv('../valt_sa_data/y_m.csv', index=False)