# Text data feature extraction

# Data preparation

We will use a dataset consisting of baby product reviews on Amazon.com.

In [31]:
# Notebook-level switches. Neither flag is read by any cell shown in this notebook.
# NOTE(review): presumably meant to enable the stemming step marked TODO further down — confirm.
use_stemming = True
# NOTE(review): purpose is not evident from this notebook (possibly lemmatization) — confirm or remove.
use_l = True

In [32]:
import pandas as pd

In [33]:
# Load the review dataset and keep only the review text and its rating.
products = pd.read_csv("../valt_sa_data/amazon_baby.csv")[['review', 'rating']]

In [34]:
# Work with the first 100 reviews only (positional slice).
products = products.iloc[:100]

In [35]:
# Preview the first rows instead of rendering the whole frame.
products.head(10)

Unnamed: 0,review,rating
0,"These flannel wipes are OK, but in my opinion ...",3
1,it came early and was not disappointed. i love...,5
2,Very soft and comfortable and warmer than it l...,5
3,This is a product well worth the purchase. I ...,5
4,All of my kids have cried non-stop when I trie...,5
5,"When the Binky Fairy came to our house, we did...",5
6,"Lovely book, it's bound tightly so you may not...",4
7,Perfect for new parents. We were able to keep ...,5
8,A friend of mine pinned this product on Pinter...,5
9,This has been an easy way for my nanny to reco...,4


## Build the word count vector for each review

Let us explore a specific example of a baby product.

In [36]:
# Look at a single review (positional row 9) before any transformation.
products.iloc[9]

review    This has been an easy way for my nanny to reco...
rating                                                    4
Name: 9, dtype: object

Now, we will perform 2 simple data transformations:

1. Remove punctuation using Python's built-in string functionality.
2. Transform the reviews into word-counts.

In [37]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters dropped are exactly those in ``string.punctuation``; all
    other characters (including whitespace and non-ASCII text) are kept in
    their original order.

    Unlike ``text.translate(None, string.punctuation)``, which only exists for
    Python 2 byte strings, this works for byte ``str`` and ``unicode`` on
    Python 2 as well as ``str`` on Python 3.

    Parameters
    ----------
    text : str
        The text to clean. An empty string yields an empty string.

    Returns
    -------
    str
        A string of the same type as ``text`` without punctuation.
    """
    import string
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

# Coerce every review to a string, then strip its punctuation.
# The variable name (including its spelling) is referenced by a later cell, so it is kept as is.
review_without_puctuation = products['review'].apply(str).apply(remove_punctuation)

In [14]:
import nltk
sentence = "I am a big boy:) I'd love to eat ice-cream right now, and my friend goes shopping. By the way, Danylo already went."
# Split the sentence into word and punctuation tokens.
tokens = nltk.word_tokenize(sentence)
print(tokens)
# Bigrams and trigrams over the tokens; uncomment the loops below to inspect them.
my_bigrams = nltk.bigrams(tokens)
my_trigrams = nltk.trigrams(tokens)
#for bigram in my_bigrams:
#    print(bigram)
#for trigram in my_trigrams:
#    print(trigram)
print(type(tokens))

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# lemmatize() accepts an optional part-of-speech argument ("pos").
# When it is omitted, the word is treated as a noun.
for word, options in [
    ("cats", {}),
    ("better", {"pos": "a"}),
    ("best", {"pos": "a"}),
    ("run", {}),
    ("done", {"pos": "v"}),
]:
    print(lemmatizer.lemmatize(word, **options))


# NOTE(review): the leading "..." on the second line looks like a doctest
# continuation prompt pasted into the text; it is tokenized as its own token
# (see the printed output). Left unchanged so the recorded outputs stay valid.
sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""

tokens = nltk.word_tokenize(sentence)
print(tokens)
# Attach a part-of-speech tag to every token.
tagged = nltk.pos_tag(tokens)
print(tagged)


from collections import Counter
# Count how often each token occurs.
count = Counter(tokens)

# Format inside the print call: `print("...") % value` only works as a
# Python 2 print statement and fails when print is a function.
print("len(count) = %s" % len(count))
print("most_common = %s" % (count.most_common(10),))

from nltk.stem import SnowballStemmer 

# Stem a single word with the English Snowball stemmer; as the last expression
# of the cell, the result is shown as the cell output.
SnowballStemmer('english').stem('rabbits')


['I', 'am', 'a', 'big', 'boy', ':', ')', 'I', "'d", 'love', 'to', 'eat', 'ice-cream', 'right', 'now', ',', 'and', 'my', 'friend', 'goes', 'shopping', '.', 'By', 'the', 'way', ',', 'Danylo', 'already', 'went', '.']
<type 'list'>
cat
good
best
run
do
['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', '...', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']
[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'NN'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), ('...', ':'), ('Arthur', 'NNP'), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]
len(count) = 14
most_common = [('on', 1), ('good', 1), ("o'clock", 1), ('did', 1), ('Thursday', 1), ('morning', 1), ("n't", 1), ('feel', 1), ('Arthur', 1), ('eight', 1)]


u'rabbit'

In [38]:
import nltk

# Every distinct token kept by my_split, accumulated across all calls.
my_words = set()

def my_split(text):
    """Tokenize ``text`` and record the kept tokens in ``my_words``.

    The text is tokenized with ``nltk.word_tokenize``. Tokens equal to
    ``'rating'`` are discarded (presumably so a token cannot clash with the
    ``rating`` column later on — confirm). Every kept token is added to the
    module-level ``my_words`` set as a side effect.

    Returns the kept tokens as a list, in their original order and with
    duplicates preserved.
    """
    kept = [token for token in nltk.word_tokenize(text) if token != 'rating']
    my_words.update(kept)
    return kept

#TODO nltk stemming

#review_without_puctuation_new = review_without_puctuation

# Tokenize every cleaned review; my_split also fills my_words as it goes.
review_without_puctuation_new = review_without_puctuation.map(my_split)

# print my_words

# review_without_puctuation = products['review'].apply(str).apply(my_split)

#import nltk
# sentence = "I am a big boy:) I'd love to eat ice-cream right now, and my friend goes shopping. By the way, Danylo already went."
# nltk.word_tokenize
# tokens = nltk.word_tokenize(sentence)
# print tokens
# print type(tokens)

In [39]:
# Alternative: a hand-picked vocabulary.
#significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
#       'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
#       'work', 'product', 'money', 'would', 'return']

# Take the words at positions 1..49 of the collected vocabulary. Set iteration
# order is arbitrary, so which words are selected is not guaranteed to be
# stable across interpreters.
# Words that share a name with an existing `products` column (e.g. 'review')
# are dropped: they would become duplicate column names and make the later
# `products.join(...)` fail with an overlapping-columns error.
significant_words = [
    word for word in list(my_words)[1:50]
    if word not in products.columns
]
        
def count_number_of_significant_words(text, vocabulary=None):
    """Count how often each vocabulary word occurs in one tokenized review.

    Parameters
    ----------
    text : mapping or pandas.Series
        A row whose ``'review'`` entry is an iterable of tokens.
    vocabulary : list of str, optional
        The words to count. Defaults to the module-level
        ``significant_words`` list, looked up at call time.

    Returns
    -------
    pandas.Series
        One count per vocabulary word, indexed by the words in the order
        given. Words that never occur get 0; tokens outside the vocabulary
        are ignored.
    """
    if vocabulary is None:
        vocabulary = significant_words
    # Pre-seed every vocabulary word with 0 so absent words still get a count.
    # The dict also gives O(1) membership tests.
    counts = dict.fromkeys(vocabulary, 0)
    for word in text['review']:
        if word in counts:
            counts[word] += 1
    return pd.Series([counts[word] for word in vocabulary], index=vocabulary)


# Build one row of word counts per tokenized review
# (one column per significant word).
newcols = review_without_puctuation_new.to_frame().apply(
    count_number_of_significant_words, axis=1
)
newcols.columns = significant_words

# Attach the count columns to the original reviews, aligned on the row index.
products_with_words = products.join(newcols)

Now, let us see what the example review above looks like after these 2 transformations.

In [40]:
# Same review as before (positional row 9), now with one count column per significant word.
products_with_words.iloc[9]

review           This has been an easy way for my nanny to reco...
rating                                                           4
6pm                                                              0
saved                                                            0
feeding                                                          0
teaching                                                         0
worth                                                            0
every                                                            0
squares                                                          0
solution                                                         0
clothes                                                          0
enjoy                                                            0
chew                                                             0
quilt                                                            0
tired                                                         

## Save prepared data into a file

In [41]:
# Features: the per-review count columns for the significant words.
X = products_with_words[significant_words]
# Target: the `rating` column.
y = products_with_words['rating']
# Write both to CSV without the row index.
# NOTE(review): whether a header row is written for the Series `y` depends on
# the pandas version's default — confirm that the consumer of y_m.csv expects it.
X.to_csv('../valt_sa_data/x_m.csv', index=False)
y.to_csv('../valt_sa_data/y_m.csv', index=False)