# Text data feature extraction

Below are the program parameters that the user can easily change in order to estimate their impact on performance.

In [109]:
# Vocabulary source: True -> the most frequent tokens found in the analyzed
# reviews (optionally plus emoticons); False -> a fixed, hand-picked word list.
USE_MY_METHOD = True
# True -> words contained in the stop-word list are left out of the features.
USE_STOP_WORDS = False
# True -> emoticons found in the raw review text are added as features.
USE_EMOTICONS = False
# True -> the few words that follow "not" / "n't" are prefixed with NOT_.
USE_NEGATION = True

# True -> a feature is 1 when the word is present in the review, otherwise 0.
# If set to false, number of occurrences of words is calculated
USE_BOOLEAN_REPRESENTATION = True

# Only the first NUMBER_OF_REVIEWS_TO_ANALYZE rows of the dataset are used.
NUMBER_OF_REVIEWS_TO_ANALYZE = 10000
# How many of the most frequent tokens are kept as the feature vocabulary.
NUMBER_OF_POPULAR_WORDS_TO_USE = 1000

We will use a dataset consisting of baby product reviews on Amazon.com.

In [110]:
import pandas as pd

In [111]:
# Read the full dataset, then keep only the review text and the rating for
# the first NUMBER_OF_REVIEWS_TO_ANALYZE rows.
products_raw = pd.read_csv("../valt_sa_data/amazon_baby.csv")
columns_of_interest = ['review', 'rating']
products = products_raw[columns_of_interest].iloc[:NUMBER_OF_REVIEWS_TO_ANALYZE]

Let us see what the data looks like:

In [112]:
# Preview only the first rows instead of rendering the whole frame.
products.head(10)

Unnamed: 0,review,rating
0,"These flannel wipes are OK, but in my opinion ...",3
1,it came early and was not disappointed. i love...,5
2,Very soft and comfortable and warmer than it l...,5
3,This is a product well worth the purchase. I ...,5
4,All of my kids have cried non-stop when I trie...,5
5,"When the Binky Fairy came to our house, we did...",5
6,"Lovely book, it's bound tightly so you may not...",4
7,Perfect for new parents. We were able to keep ...,5
8,A friend of mine pinned this product on Pinter...,5
9,This has been an easy way for my nanny to reco...,4


Let us explore a specific example of a baby product.

In [113]:
# Select the row at position 9 (the tenth review); the result is a Series.
products.iloc[9]

review    This has been an easy way for my nanny to reco...
rating                                                    4
Name: 9, dtype: object

Let us define an emoticons extraction function.

In [114]:
# Emoticons that are searched for in the review text.
emoticons = [
    ':)', ':))', ':)))',
    ':(', ':((', ':(((',
    '=)', '=(', '=))', '=((',
]


def extract_emoticons(text):
    """Return the known emoticons that occur in ``text``.

    Matching is a plain substring test, so a longer emoticon such as ':))'
    also reports the shorter ':)' it contains. Each emoticon is reported at
    most once, in the order of the ``emoticons`` list.
    """
    return [candidate for candidate in emoticons if candidate in text]

The helper functions below are also useful.

In [115]:
# Characters stripped from the review text. The apostrophe is not in the list,
# so contractions such as "don't" are left intact (analyze_review looks for the
# "n't" token). The backslash is escaped explicitly; the string value is the
# same as before.
punctuation_to_remove = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'


def remove_punctuation(text):
    """Return ``text`` with every character of ``punctuation_to_remove`` deleted.

    The two-argument ``text.translate(None, chars)`` form exists only for
    Python 2 byte strings; it raises TypeError for ``unicode`` objects and on
    Python 3. Filtering character by character behaves the same for every
    string type on both Python versions.
    """
    return ''.join(ch for ch in text if ch not in punctuation_to_remove)

# Maps part-of-speech tags to the single-letter POS codes handed to the
# lemmatizer: 'n' for nouns, 'v' for verbs, 'a' for adjectives.
pos_dict = {
    'NN': 'n',
    'VB': 'v',
    'VBD': 'v',
    'VBG': 'v',
    'VBN': 'v',
    'VBP': 'v',
    'VBZ': 'v',
    'JJ': 'a',
    'JJR': 'a',
    'JJS': 'a',
    'JJT': 'a',
}


def get_pos_for_lemmatirzer(brown_post):
    """Translate a POS tag into a lemmatizer POS code.

    Tags that are not listed in ``pos_dict`` are treated as nouns ('n').
    """
    return pos_dict.get(brown_post, 'n')

Now let us define a more sophisticated function for review analysis.

First the punctuation is removed.

Then every word is tagged with its part of speech (POS) to prepare for lemmatization.

After that lemmatization is performed to find the root form of each word.

All the stop words are removed if the corresponding program parameter is set.
Also if set, emoticons are extracted and processed.

In [116]:
import nltk
from nltk.stem import WordNetLemmatizer

# Words that analyze_review leaves out of the features when USE_STOP_WORDS is
# True. The list is all lower case and includes the negation markers 'not'
# and "n't".
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 
 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 
 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 
 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 
 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 
 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 
 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 
 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 
 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
 'nor', 'not', "n't", 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 
 'will', 'just', 'don', 'should', 'now']

def analyze_review(text):
    """Turn one raw review into a list of feature tokens.

    Steps: optionally collect emoticons from the raw text, strip punctuation,
    tokenize, POS-tag, lemmatize, then apply negation marking and stop-word
    filtering as selected by the USE_* flags.

    Args:
        text: the review as a string.

    Returns:
        A list of strings: one 'F_<lemma>' (or 'F_NOT_<lemma>') entry per kept
        word, in text order, followed by the emoticons found in the text when
        USE_EMOTICONS is set.
    """
    emoticons_features = extract_emoticons(text) if USE_EMOTICONS else []

    text_without_punctuation = remove_punctuation(text)
    tokens = nltk.word_tokenize(text_without_punctuation)
    # Tagging runs on the original-case tokens; only the lemmatizer input is
    # lower-cased below.
    tagged_tokens = nltk.pos_tag(tokens)

    lemmatizer = WordNetLemmatizer()
    lemmas = []

    not_count = 0              # negation markers seen in the current window
    words_after_not_count = 0  # words already marked with NOT_ in the window
    for token, tag in tagged_tokens:
        # Lower-case BEFORE lemmatizing: the WordNet lookup is case-sensitive,
        # so a capitalised token (e.g. "Wipes" at the start of a sentence)
        # would otherwise come back unchanged and only then be lower-cased,
        # yielding "wipes" instead of "wipe".
        current_word = lemmatizer.lemmatize(token.lower(), get_pos_for_lemmatirzer(tag))

        # Once three words have been marked, the negation window is closed.
        if words_after_not_count > 2:
            not_count = 0
            words_after_not_count = 0

        # "not" / "n't" act as negation markers only when negation handling is
        # enabled; otherwise they are ordinary words and must not be silently
        # dropped from the features.
        if USE_NEGATION and current_word in ('not', "n't"):
            not_count += 1
        elif (not USE_STOP_WORDS) or (current_word not in stop_words):
            # An odd number of markers means the word is negated; an even
            # number (double negation) leaves it unmarked.
            if USE_NEGATION and not_count % 2 == 1:
                current_word = 'NOT_' + current_word
                words_after_not_count += 1
            lemmas.append('F_' + current_word)  # F - meaning feature

    return lemmas + emoticons_features

Now, we will perform text analysis.
We will also find and print the most common words and the total number of distinct words in the dictionary.

In [117]:
from collections import Counter

# Missing reviews are NaN in the frame; replace them with an empty string so
# that they produce no tokens instead of the literal word "nan".
analyzed_reviews = products['review'].fillna('').apply(str).apply(analyze_review)

# Contains duplicates, so that the count of each word can be calculated.
review_words_list = [word for w_l in analyzed_reviews for word in w_l]
review_counter = Counter(review_words_list)
# The distinct tokens are exactly the keys of the counter.
review_dictionary = set(review_counter)

# Build a real list: a lazy map object (Python 3) could neither be
# concatenated with another list nor iterated more than once.
most_common_words = [
    word for word, count in review_counter.most_common(NUMBER_OF_POPULAR_WORDS_TO_USE)
]
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(most_common_words)
print(len(review_dictionary))

['F_the', u'F_be', 'F_i', 'F_it', 'F_and', 'F_to', 'F_a', u'F_have', 'F_this', 'F_for', 'F_of', 'F_my', 'F_in', 'F_that', u'F_do', 'F_with', 'F_we', 'F_on', 'F_use', 'F_you', 'F_but', 'F_so', "F_'s", 'F_baby', 'F_one', 'F_get', 'F_when', 'F_they', 'F_he', 'F_bottle', 'F_would', 'F_she', 'F_at', 'F_up', 'F_just', 'F_can', 'F_love', 'F_out', 'F_buy', 'F_very', 'F_all', 'F_great', 'F_our', u'F_month', 'F_time', 'F_these', 'F_if', 'F_her', 'F_them', 'F_old', 'F_like', 'F_or', 'F_easy', 'F_son', 'F_from', 'F_because', u'F_make', 'F_diaper', 'F_only', u'F_go', 'F_work', 'F_now', 'F_product', 'F_seat', 'F_will', 'F_little', 'F_tub', 'F_about', 'F_no', 'F_also', 'F_more', 'F_first', 'F_other', 'F_your', u'F_good', 'F_daughter', u'F_take', 'F_me', u'F_try', 'F_after', 'F_well', 'F_there', 'F_put', 'F_NOT_to', 'F_keep', 'F_too', 'F_NOT_the', 'F_much', 'F_really', 'F_than', 'F_pump', 'F_still', 'F_his', 'F_an', 'F_could', 'F_think', 'F_back', u'F_find', 'F_which', 'F_need', u'F_thing', 'F_recomme

We perform feature extraction on the analyzed text. The matrix for machine learning is formed. Only features stored in the variable `significant_words` are included.

In [118]:
if USE_MY_METHOD:
    # Copy into a real list so the vocabulary can be extended and iterated
    # repeatedly, whatever kind of iterable most_common_words is.
    significant_words = list(most_common_words)
    if USE_EMOTICONS:
        # Emoticons found in the reviews may already be among the most common
        # tokens; add only the missing ones so no feature column is duplicated.
        significant_words += [e for e in emoticons if e not in significant_words]
else:
    baseline_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
                      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed',
                      'work', 'product', 'money', 'would', 'return']
    # Every word token produced by analyze_review carries the 'F_' prefix, so
    # the baseline words need the same prefix; without it no token ever
    # matches and all features stay 0.
    # NOTE(review): tokens are also lemmatized, so inflected entries such as
    # 'loves' or 'broke' may still never occur in that exact form - confirm
    # whether the list should hold lemmas instead.
    significant_words = ['F_' + word for word in baseline_words]
        
def count_number_of_significant_words(text, vocabulary=None, use_boolean=None):
    """Build the feature vector of one review.

    Args:
        text: a mapping-like row whose 'review' entry holds the list of
            tokens of the review.
        vocabulary: the feature tokens to report, in output order. Defaults
            to the module-level ``significant_words``.
        use_boolean: True -> each feature is 1 if the token occurs, else 0;
            False -> each feature is the number of occurrences. Defaults to
            the module-level ``USE_BOOLEAN_REPRESENTATION``.

    Returns:
        A pandas Series indexed by the vocabulary, holding one value per
        vocabulary token.
    """
    # Resolve the defaults at call time so later changes of the globals are
    # picked up.
    if vocabulary is None:
        vocabulary = significant_words
    if use_boolean is None:
        use_boolean = USE_BOOLEAN_REPRESENTATION
    vocabulary = list(vocabulary)

    # Every vocabulary token starts at 0, which makes the dict itself a
    # constant-time membership test (instead of scanning the list per token).
    word_dict = dict.fromkeys(vocabulary, 0)
    for word in text['review']:
        if word in word_dict:
            if use_boolean:
                word_dict[word] = 1
            else:
                word_dict[word] += 1

    return pd.Series([word_dict[word] for word in vocabulary], index=vocabulary)

# Wrap the token lists in a one-column frame so that every row passed to the
# counting function carries its tokens as a column entry.
review_tokens_df = pd.DataFrame(analyzed_reviews)
word_counts_df = review_tokens_df.apply(
    count_number_of_significant_words,
    axis=1,
)
word_counts_df.columns = significant_words

# Joined on the row index: the original columns followed by one column per
# feature.
products_with_words = products.join(word_counts_df)

Now, let us explore what the sample looks like after all the transformations.

The resulting matrix is very sparse, as was expected.

In [119]:
# Row at position 9 again, this time including its feature columns.
products_with_words.iloc[9]

review           This has been an easy way for my nanny to reco...
rating                                                           4
F_the                                                            1
F_be                                                             1
F_i                                                              1
F_it                                                             1
F_and                                                            0
F_to                                                             1
F_a                                                              0
F_have                                                           1
F_this                                                           1
F_for                                                            1
F_of                                                             1
F_my                                                             1
F_in                                                          

## Save prepared data into a file

In [120]:
# Feature matrix: one column per entry of significant_words.
X = products_with_words[significant_words]
# Target values: the rating of each review.
y = products_with_words['rating']
# index=False keeps the row index out of the written files.
X.to_csv('../valt_sa_data/x_m.csv', index=False)
# NOTE(review): whether a Series is written with a header line depends on the
# pandas version - confirm that the reader of y_m.csv expects the same layout.
y.to_csv('../valt_sa_data/y_m.csv', index=False)