# Text data feature extraction

# Data preparation

We will use a dataset consisting of baby product reviews on Amazon.com.

In [1]:
import pandas as pd

In [2]:
# Read the reviews file and keep only the 'review' and 'rating' columns.
products = pd.read_csv("../valt_sa_data/amazon_baby.csv").loc[:, ['review', 'rating']]

In [3]:
# Work with the first 1000 rows only.
products = products.iloc[:1000]

In [4]:
# Preview the first 10 rows instead of rendering the whole frame.
products.head(10)

Unnamed: 0,review,rating
0,"These flannel wipes are OK, but in my opinion ...",3
1,it came early and was not disappointed. i love...,5
2,Very soft and comfortable and warmer than it l...,5
3,This is a product well worth the purchase. I ...,5
4,All of my kids have cried non-stop when I trie...,5
5,"When the Binky Fairy came to our house, we did...",5
6,"Lovely book, it's bound tightly so you may not...",4
7,Perfect for new parents. We were able to keep ...,5
8,A friend of mine pinned this product on Pinter...,5
9,This has been an easy way for my nanny to reco...,4


## Build the word count vector for each review

Let us explore a specific example of a baby product review.

In [5]:
# Show the row at integer position 9 (its 'review' and 'rating' values).
products.iloc[9]

review    This has been an easy way for my nanny to reco...
rating                                                    4
Name: 9, dtype: object

Now, we will perform 2 simple data transformations:

1. Remove punctuation using Python's built-in string functionality.
2. Transform the reviews into word-counts.

In [6]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The earlier ``text.translate(None, string.punctuation)`` form only works
    on Python 2 byte strings; on Python 3 ``str`` (and Python 2 ``unicode``)
    it raises ``TypeError``. Filtering the characters directly behaves the
    same on every string type.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` without any character listed in ``string.punctuation``.
    """
    import string
    # A frozenset gives O(1) membership checks per character.
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

# Coerce every entry to str first (presumably a guard against non-string
# values in the column -- confirm), then strip punctuation from each review.
review_without_puctuation = (
    products['review']
    .apply(str)
    .apply(remove_punctuation)
)

In [7]:
def my_split(text):
    """Tokenize ``text`` with NLTK and add every token to ``my_words``.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    None
        The function works purely by side effect on the module-level
        ``my_words`` set.
    """
    # Imported locally because the notebook has no active top-level
    # ``import nltk`` (it is commented out), so the bare name raised
    # NameError on the first call.
    # NOTE(review): word_tokenize may need NLTK tokenizer data to be
    # downloaded beforehand -- confirm in the target environment.
    import nltk
    # The set is mutated in place, so no ``global`` declaration is needed.
    my_words.update(nltk.word_tokenize(text))


# Accumulates the distinct tokens seen across all my_split() calls.
my_words = set()

#import nltk
# sentence = "I am a big boy. I'd love to eat ice-cream right now."
# nltk.word_tokenize
# tokens = nltk.word_tokenize(sentence)
# print tokens
# print type(tokens)

In [8]:
# Words whose occurrences are counted in each review. The order of this list
# is also the order of the resulting count columns.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]
        
def count_number_of_significant_words(text, vocabulary=None):
    """Count how often each significant word occurs in one review.

    Parameters
    ----------
    text : mapping or pandas.Series
        A row-like object whose ``'review'`` entry holds the review text
        (this is what ``DataFrame.apply(..., axis=1)`` passes in).
    vocabulary : sequence of str, optional
        Words to count. Defaults to the module-level ``significant_words``
        list, which keeps the original single-argument behaviour.

    Returns
    -------
    pandas.Series
        Counts indexed by the vocabulary words, in vocabulary order.
        Words that never occur get a count of 0.

    Notes
    -----
    Matching is exact and case-sensitive on whitespace-separated tokens,
    so ``'Love'`` is not counted as ``'love'``.
    """
    if vocabulary is None:
        vocabulary = significant_words
    # Seed every word with zero so absent words still appear in the result.
    word_dict = dict.fromkeys(vocabulary, 0)
    for word in text['review'].split():
        # Dict membership replaces the old list scan; the former
        # "word not in word_dict" branch was unreachable after seeding.
        if word in word_dict:
            word_dict[word] += 1
    return pd.Series([word_dict[word] for word in vocabulary], index=vocabulary)

# Helper kept from the original cell; nothing in the visible notebook calls it.
lambdafunc = lambda x: pd.Series(significant_words)

# Row-wise apply: each cleaned review yields one row of significant-word counts.
newcols = pd.DataFrame(review_without_puctuation).apply(
    count_number_of_significant_words,
    axis=1,
)
newcols.columns = significant_words

# Join on the index so the count columns sit next to 'review' and 'rating'.
products_with_words = products.join(newcols)

Now, let us explore what the sample example above looks like after these 2 transformations.

In [9]:
# Show the row at integer position 9, now including the word-count columns.
products_with_words.iloc[9]

review          This has been an easy way for my nanny to reco...
rating                                                          4
love                                                            0
great                                                           0
easy                                                            1
old                                                             0
little                                                          0
perfect                                                         0
loves                                                           0
well                                                            0
able                                                            0
car                                                             0
broke                                                           0
less                                                            0
even                                                            0
waste     

## Save prepared data into a file

In [10]:
# Features: one count column per significant word.
X = products_with_words.loc[:, significant_words]
# Target: the 'rating' column.
y = products_with_words.loc[:, 'rating']

# Write both without the row index.
X.to_csv('../valt_sa_data/x_m.csv', index=False)
y.to_csv('../valt_sa_data/y_m.csv', index=False)