# Text data feature extraction

# Data preparation
# TODO modify text description of the steps

We will use a dataset consisting of baby product reviews on Amazon.com.

In [25]:
import pandas as pd

In [26]:
# Load the reviews file and keep only the 'review' and 'rating' columns.
products = pd.read_csv("../valt_sa_data/amazon_baby.csv")[['review', 'rating']]

In [27]:
# Work with the first 10000 rows only.
products = products[0:10000]

In [28]:
# products

## Build the word count vector for each review

Let us explore a specific example of a baby product.

In [29]:
# Display the row at position 9 as a sample.
products.iloc[9]

review    This has been an easy way for my nanny to reco...
rating                                                    4
Name: 9, dtype: object

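Now, we will perform 2 data transformations:

1. Clean and normalize the text: extract emoticons, remove punctuation using Python's built-in string functionality, tokenize and lemmatize the text with NLTK, and drop stop words.
2. Transform the reviews into word-presence vectors: for each of the most common words, 1 if the word occurs in the review and 0 otherwise.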

In [30]:
# Emoticon strings that are recognised in the raw review text.
emoticons = [':)', ':))', ':)))', ':(', ':((', ':(((']


def extract_emoticons(text):
    """Return the known emoticons that occur somewhere in ``text``.

    Every entry of ``emoticons`` is tested as a plain substring, so a longer
    emoticon also reports its shorter prefixes (':))' yields ':)' and ':))').
    The result follows the order of ``emoticons`` and lists each entry at
    most once, however often it appears in the text.
    """
    return [emoticon for emoticon in emoticons if emoticon in text]

In [31]:
# Characters stripped from the review text. The apostrophe is not part of
# this set, so contractions keep theirs. Written as a raw string so that the
# backslash is a literal character and not the start of an escape sequence.
punctuation_to_remove = r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'


def remove_punctuation(text):
    """Return ``text`` without any character of ``punctuation_to_remove``.

    The text is filtered character by character instead of using the
    two-argument ``translate(None, chars)`` call: that form exists only for
    Python 2 byte strings and raises ``TypeError`` for unicode text. The
    result has the same string type as the input.
    """
    return ''.join(ch for ch in text if ch not in punctuation_to_remove)

# Translation from POS tags to the one-letter part-of-speech codes handed to
# the lemmatizer ('n', 'v', 'a' -- presumably noun, verb, adjective).
pos_dict = {
    'NN': 'n',
    'VB': 'v',
    'VBD': 'v',
    'VBG': 'v',
    'VBN': 'v',
    'VBP': 'v',
    'VBZ': 'v',
    'JJ': 'a',
    'JJR': 'a',
    'JJS': 'a',
    'JJT': 'a',
}


def get_pos_for_lematirzer(brown_post):
    """Return the lemmatizer POS code for the tag ``brown_post``.

    Tags that are not listed in ``pos_dict`` fall back to 'n'.
    """
    return pos_dict.get(brown_post, 'n')

    
# English stop words; analyze_review drops every lemma found in this list.
# TODO: revise this list, or stop using it.
stop_words = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
    'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can',
    'will', 'just', 'don', 'should', 'now',
]


In [32]:
import nltk
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Sample text for trying analyze_review by hand.
sentence = "I am a big boy:) I'd love to eat ice-cream right now, and my friend goes shopping. By the way, Danylo already went."

# Both collections are filled in place by analyze_review: every recorded word
# in the order it was seen, and the set of distinct recorded words.
my_words_list = []
my_words = set()


def analyze_review(text):
    """Turn one review into its list of feature words.

    Emoticons are looked up in the raw text first. The text is then stripped
    of punctuation, tokenized, POS-tagged and lemmatized; lemmas are
    lower-cased and those found in ``stop_words`` are dropped.

    Side effect: every returned word except 'review' and 'rating' is appended
    to ``my_words_list`` and added to ``my_words`` (presumably to keep those
    two words from colliding with the data frame's column names -- confirm).

    Returns:
        list: the kept lower-cased lemmas followed by the emoticons found.
    """
    found_emoticons = extract_emoticons(text)
    tagged = nltk.pos_tag(nltk.word_tokenize(remove_punctuation(text)))
    # TODO apply Turney algorithm
    wordnet = WordNetLemmatizer()

    # Lower-casing happens after lemmatization, as in the tagging step the
    # original casing is still in place.
    lowered = [
        wordnet.lemmatize(token, get_pos_for_lematirzer(tag)).lower()
        for token, tag in tagged
    ]
    words = [lemma for lemma in lowered if lemma not in stop_words] + found_emoticons

    for word in words:
        if word in ('review', 'rating'):
            continue
        my_words_list.append(word)
        my_words.add(word)
    return words

# analyze_review(sentence)

# Analyze every review; as a side effect this fills my_words_list and my_words.
analyzed_reviews = products['review'].apply(str).apply(analyze_review)

from collections import Counter

count = Counter(my_words_list)
#print("after: len(count) = %s" % len(count))
#print("most_common = %s" % count.most_common(1000))

# Keep the 1000 most frequent words as a real list: it is concatenated with
# another list further down, which a lazy ``map`` object would not support.
most_common_words = [word for word, _freq in count.most_common(1000)]

# Parenthesised single-argument form prints the same on Python 2 and 3.
print(most_common_words)
# TODO apply lower in the right moment

['use', "n't", "'s", 'baby', 'get', 'one', 'bottle', 'would', 'buy', 'love', 'great', 'like', 'time', u'month', 'old', 'easy', 'work', 'son', u'make', u'go', 'diaper', 'product', 'seat', 'little', 'tub', u'good', 'also', u'take', 'much', 'first', 'well', 'daughter', 'put', u'try', 'really', u'keep', 'pump', 'think', 'even', u'find', 'still', 'need', 'recommend', 'could', 'back', 'thing', u'bag', 'problem', 'child', 'purchase', u'come', 'toy', '2', 'year', 'gate', u'look', 'fit', 'give', 'nipple', 'two', u'want', 'sleep', 'big', 'since', 'potty', 'play', 'clean', 'around', 'pillow', "'ve", u'say', 'way', 'water', "'m", 'enough', 'side', 'sit', 'day', 'never', 'hold', 'every', 'small', u'know', u'start', 'leak', '3', 'car', 'change', u'week', 'night', 'best', u'seem', 'without', 'bath', 'new', 'right', 'open', 'room', '4', 'money', 'see', 'long', 'help', 'another', u'lot', 'wash', 'ca', 'hard', 'bottom', 'able', u'hand', u'nice', 'milk', 'avent', 'comfortable', 'size', 'place', 'item', '

In [33]:
# Number of distinct words recorded over all reviews. The parenthesised form
# prints the same on Python 2 and Python 3.
print(len(my_words))

21721


In [34]:
# Hand-picked alternative vocabulary, kept for reference:
#significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
#      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
#      'work', 'product', 'money', 'would', 'return']

# Feature vocabulary: the most frequent words followed by the emoticons.
# analyze_review records emoticons as words too, so one of them may already
# be among the most frequent words; it is skipped here so that every feature
# label stays unique (duplicate labels become duplicated columns later on).
# Other sources noted by the author: list(my_words) #[0:500]
significant_words = most_common_words + [
    emoticon for emoticon in emoticons if emoticon not in most_common_words
]
        
def count_number_of_significant_words(text, vocabulary=None):
    """Build the 0/1 word-presence vector for one analyzed review.

    Despite the name, nothing is counted: an entry is 1 when the word occurs
    at least once in the review and 0 otherwise.

    Args:
        text: Row-like object whose ``'review'`` entry is the iterable of
            words of one review (a data frame row when this function is used
            with ``DataFrame.apply(..., axis=1)``).
        vocabulary: Words to emit one entry for, in output order. Defaults to
            the module-level ``significant_words``.

    Returns:
        pandas.Series: 0/1 integers indexed by ``vocabulary``.
    """
    if vocabulary is None:
        vocabulary = significant_words
    # One set per review gives constant-time lookups, instead of scanning the
    # whole vocabulary list for every word of the review.
    present = set(text['review'])
    flags = [1 if word in present else 0 for word in vocabulary]
    return pd.Series(flags, index=vocabulary)


# Turn every analyzed review into its row of feature values. The word lists
# are wrapped in a one-column frame so that each row handed to the callback
# exposes them by column name (presumably 'review', inherited from the
# source column).
newcols = pd.DataFrame(analyzed_reviews).apply(
    count_number_of_significant_words,
    axis=1,
)
newcols.columns = significant_words

# Attach the feature columns to the original review/rating frame.
products_with_words = products.join(newcols)

Now, let us see what the sample review above looks like after these 2 transformations.

In [35]:
# Display the row at position 9 again, now with the added word columns.
products_with_words.iloc[9]

review        This has been an easy way for my nanny to reco...
rating                                                        4
use                                                           0
n't                                                           1
's                                                            0
baby                                                          1
get                                                           0
one                                                           1
bottle                                                        0
would                                                         1
buy                                                           0
love                                                          0
great                                                         0
like                                                          0
time                                                          0
month                                   

## Save prepared data into a file

In [36]:
# Features: the columns named by significant_words.
X = products_with_words[significant_words]
# Target: the 'rating' column.
y = products_with_words['rating']
# Write both to CSV without the row index.
X.to_csv('../valt_sa_data/x_m.csv', index=False)
y.to_csv('../valt_sa_data/y_m.csv', index=False)