# Download and extract Amazon Reviews (1/4).
The Amazon reviews dataset used in this project comes from the following source:
http://jmcauley.ucsd.edu/data/amazon

This site hosts Amazon reviews grouped by category; data from some of these categories were chosen to train and test the Logistic Regression model. We've chosen the following categories:
- Babies: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Baby_5.json.gz
- Pets: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Pet_Supplies_5.json.gz

NOTE: These files must be uncompressed into this same folder before launching this script.



# Generation of the Amazon reviews file babies+pets_reviews.csv by merging reviews_Baby_5.csv and reviews_Pet_Supplies_5.csv (2/4)


###  Required functions in this step
Because the data files are in JSON format, they need to be converted to CSV files.
There are also functions for lemmatization and for removing punctuation and stop words. 
These functions are used in order to get clean text required for sentiment analysis:<br/>
- Stopwords and punctuation removal<br/>
- Text Lemmatization. Libraries used: TextBlob, NLTK<br/>

In [2]:
import json
import pandas as pd
from glob import glob


def convert(x):
    """Parse one JSON-encoded line into a flat dict usable as a DataFrame row.

    List values are collapsed into a single comma-separated string and
    nested dicts are flattened one level deep into ``<key>_<subkey>``
    entries; every other value is kept exactly as parsed.

    Args:
        x: A string holding one JSON object.

    Returns:
        The parsed dict with list and dict values flattened as described.

    Raises:
        ValueError: If ``x`` is not valid JSON (raised by ``json.loads``).
    """
    ob = json.loads(x)
    # Iterate over a snapshot: keys are added and removed while flattening,
    # which is not allowed on a live dict view.
    for k, v in list(ob.items()):
        if isinstance(v, list):
            # Stringify each element; str(v) would join the characters of
            # the list's repr instead of its items.
            ob[k] = ','.join(str(item) for item in v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob


In [3]:
from textblob import TextBlob
from textblob import Word
import nltk
# nltk.download() must be run once beforehand to fetch all the required corpora
from nltk.corpus import stopwords
# English stop-word list, loaded once and reused by the lemmatization helpers.
cachedStopWords = stopwords.words("english")

# Display the list in the notebook output.
cachedStopWords

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all

In [4]:
import string
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Characters are filtered one by one instead of using the two-argument
    ``str.translate(None, chars)`` form, which only exists for Python 2
    byte strings and raises ``TypeError`` on unicode / Python 3 text.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character from ``string.punctuation``;
        all other characters, including case and whitespace, are untouched.
    """
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)

def lemmatizeReviewText(row):
    """Lemmatize a review text and drop its stop words.

    Args:
        row: The review text as a single string.

    Returns:
        The lemmatized words that are not in ``cachedStopWords``, joined
        with single spaces.
    """
    lemmas = TextBlob(row).words.lemmatize()
    kept = [lemma for lemma in lemmas if lemma not in cachedStopWords]
    return ' '.join(kept)


### The code of this step is the following

In [5]:
# Process the Baby reviews dataset: each line of the input file is one JSON object.
# The file is opened with a context manager so the handle is always closed
# (the Python-2-only file() builtin left it open).
with open('reviews_Baby_5.json') as reviews_file:
    babyDf = pd.DataFrame([convert(line) for line in reviews_file])
# Force the reviewText column to str so remove_punctuation receives a string for every row
babyDf['reviewText'] = babyDf['reviewText'].astype(str)
# Cleaning order: strip punctuation, lower-case, then lemmatize and drop stop words.
babyDf['review_clean'] = babyDf['reviewText'].apply(remove_punctuation)
babyDf['review_clean'] = babyDf['review_clean'].apply(lambda x: x.lower())
babyDf['review_clean'] = babyDf['review_clean'].apply(lemmatizeReviewText)

# quoting=1 is csv.QUOTE_ALL: every field is written inside quotes.
babyDf.to_csv('reviews_Baby_5.csv', header=True, quoting=1)


In [6]:
# Process the Pet Supplies reviews dataset: each line of the input file is one JSON object.
# The file is opened with a context manager so the handle is always closed
# (the Python-2-only file() builtin left it open).
with open('reviews_Pet_Supplies_5.json') as reviews_file:
    petsDf = pd.DataFrame([convert(line) for line in reviews_file])
# Force the reviewText column to str so remove_punctuation receives a string for every row
petsDf['reviewText'] = petsDf['reviewText'].astype(str)
# Cleaning order: strip punctuation, lower-case, then lemmatize and drop stop words.
petsDf['review_clean'] = petsDf['reviewText'].apply(remove_punctuation)
petsDf['review_clean'] = petsDf['review_clean'].apply(lambda x: x.lower())
petsDf['review_clean'] = petsDf['review_clean'].apply(lemmatizeReviewText)
# quoting=1 is csv.QUOTE_ALL: every field is written inside quotes.
petsDf.to_csv('reviews_Pet_Supplies_5.csv', header=True, quoting=1)

In [7]:
# Stack the baby and pet review frames into a single frame (row indexes are kept as-is).
principalDf = [babyDf, petsDf]
amazonReviewsDf = pd.concat(principalDf)
# Rename the 'overall' column to 'review_overall'; popping and re-assigning
# leaves the renamed column as the last one.
amazonReviewsDf['review_overall'] = amazonReviewsDf.pop('overall')
# quoting=1 is csv.QUOTE_ALL: every field is written inside quotes.
amazonReviewsDf.to_csv('babies+pets_reviews.csv', header=True, quoting=1)
len(amazonReviewsDf)

318628

### Overall reviews Histogram 

In [18]:
# Bar chart of how many reviews carry each 'review_overall' value.
# Note: `plt` is bound to whatever set_title() returns, not to matplotlib.pyplot.
rating_counts = amazonReviewsDf['review_overall'].value_counts()
plt = rating_counts.plot(kind='bar').set_title('Reviews Overall')

len(amazonReviewsDf['review_overall'])

318628

# Important words list generation (3/4)

###  Required functions in this step
We need to build a dictionary {'word': frequency} that counts how often each word occurs.<br/>
Initially we'll need 1500 words in our process.

In [9]:

from collections import Counter

import numpy as np

# How many of the most frequent words to keep.
num_important_words = 1500

# Count every whitespace-separated token across all cleaned reviews.
# Counter.update() replaces the manual "if word not in dict" bookkeeping.
word_counts = Counter()
for linea in amazonReviewsDf['review_clean'].astype(str).values:
    word_counts.update(linea.split())
# Expose the result as a plain {word: frequency} dict, as before.
cnt = dict(word_counts)
            


### The code of this step is the following

In [10]:
# Series of word -> count, keeping the dictionary's own key order.
wordsSerie = pd.Series(cnt, index=list(cnt.keys()))
# Discard words that were seen only once.
wordsSerie = wordsSerie[wordsSerie.values > 1]
# Column 'a' holds the word, column 'b' its frequency.
data = {'a': wordsSerie.index, 'b': wordsSerie.values}
wordsDf = pd.DataFrame(data=data, index=np.arange(len(wordsSerie)))
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
wordsDf = wordsDf.sort_values('b', ascending=False).head(num_important_words)

# Serialise with json.dumps so quoting and escaping are always valid JSON
# (rewriting the quotes of a Python list repr breaks on words that contain quotes).
important_words = json.dumps([str(s) for s in wordsDf['a']])
# The context manager closes the file even if the write fails.
with open('important_words_1500.json', 'w') as f:
    f.write(important_words)
len(wordsDf)

1500

# Twitter files management (4/4)


###  Required functions in this step
The following functions are required for lemmatization and for the removal of punctuation and stop words. 
These functions are used in order to get clean text required for sentiment analysis:<br/>
- Stopwords and punctuation removal<br/>
- Text Lemmatization. Libraries used: TextBlob, NLTK<br/>text_clean

In [11]:
# Used to reject text that contains non-ASCII characters
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# Returns a DataFrame built from a JSON file of tweet search results
def getTweetsDF(json_filename):
    """Load the geotagged tweets of a JSON results file into a DataFrame.

    The file must contain one JSON object with a ``results`` list. Entries
    whose ``geo`` field is empty are skipped. For the others, the tweet text
    and the place name are kept only when they are pure ASCII (otherwise an
    empty string is stored) so that non-ASCII characters do not reach the
    later NLP steps.

    Args:
        json_filename: Path of the input file; the last five characters
            (expected to be ".json") are replaced by ".csv" in the progress
            message only. No CSV is written here.

    Returns:
        A DataFrame with columns ``tweet_id``, ``tweet_text``,
        ``tweet_created_at``, ``tweet_geo_lat``, ``tweet_geo_lon`` and
        ``tweet_city``, one row per kept tweet, indexed 0..n-1.
    """
    columns = ['tweet_id', 'tweet_text', 'tweet_created_at',
               'tweet_geo_lat', 'tweet_geo_lon', 'tweet_city']
    csv_filename = '%s.csv' % json_filename[:-5]
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('Converting %s to %s' % (json_filename, csv_filename))

    # Context manager so the input file is closed once parsed.
    with open(json_filename) as json_file:
        results = json.load(json_file)['results']
    print(len(results))

    # Collect plain dicts and build the frame once; appending to a DataFrame
    # row by row copies the whole frame on every iteration.
    rows = []
    for tweet in results:
        geo = tweet['geo']
        if not geo:
            continue
        # Only ASCII text is kept, to avoid problematic unicode characters in NLP.
        text = tweet['text'] if is_ascii(tweet['text']) else ''
        # Guard against a null or missing 'place' instead of crashing on it.
        place = tweet.get('place') or {}
        city = place.get('full_name') or ''
        if not is_ascii(city):
            city = ''
        rows.append({
            'tweet_id': tweet['id_str'],
            'tweet_text': text,
            'tweet_created_at': tweet['created_at'],
            'tweet_geo_lat': geo['coordinates'][0],
            'tweet_geo_lon': geo['coordinates'][1],
            'tweet_city': city,
        })
    return pd.DataFrame(rows, columns=columns)

# Lemmatizes one row of the tweets dataset and removes its stop words
def lemmatizeReviewTextForTweets(row):
    """Return the lemmatized, stop-word-free text of one tweet row.

    Args:
        row: A mapping or DataFrame row exposing a ``tweet_text`` entry.

    Returns:
        The cleaned text produced by ``lemmatizeReviewText``, or an empty
        string when ``tweet_text`` is not a string (e.g. None or NaN).
    """
    text = row['tweet_text']
    # The previous check `type(text == str)` was always truthy, so the empty
    # fallback could never run. type(u'') also accepts Python 2 unicode text,
    # which is what json.loads returns there.
    if isinstance(text, (str, type(u''))):
        return lemmatizeReviewText(text)
    return ''



### The code of this step is the following

In [12]:
# USA-car-tweets.json processing
dfUSACarTweets = getTweetsDF('USA-car-tweets.json')
# Build text_clean in one chain: lemmatize each row, force str, strip punctuation, lower-case.
# NOTE(review): stop words are removed before lower-casing here, whereas the Amazon
# review cells lower-case first, so capitalised stop words survive - confirm this is intended.
dfUSACarTweets['text_clean'] = (
    dfUSACarTweets.apply(lemmatizeReviewTextForTweets, axis=1)
    .astype(str)
    .apply(remove_punctuation)
    .apply(lambda x: x.lower())
)
# Keep only the tweets whose cleaned text is not empty.
has_text = dfUSACarTweets['text_clean'] != ''
dfUSACarTweets = dfUSACarTweets[has_text]
# Write the CSV (quoting=1 is csv.QUOTE_ALL)
dfUSACarTweets.to_csv('USA-car-tweets.csv', header=True, quoting=1)

Converting USA-car-tweets.json to USA-car-tweets.csv
326


# Process USA-flight-tweets.json to generate the CSV with the text_clean column

In [13]:
# USA-flight-tweets.json processing
dfUSAFlightTweets = getTweetsDF('USA-flight-tweets.json')
# Build text_clean in one chain: lemmatize each row, force str, strip punctuation, lower-case.
# NOTE(review): stop words are removed before lower-casing here, whereas the Amazon
# review cells lower-case first, so capitalised stop words survive - confirm this is intended.
dfUSAFlightTweets['text_clean'] = (
    dfUSAFlightTweets.apply(lemmatizeReviewTextForTweets, axis=1)
    .astype(str)
    .apply(remove_punctuation)
    .apply(lambda x: x.lower())
)
# Keep only the tweets whose cleaned text is not empty.
has_text = dfUSAFlightTweets['text_clean'] != ''
dfUSAFlightTweets = dfUSAFlightTweets[has_text]
# Write the CSV (quoting=1 is csv.QUOTE_ALL)
dfUSAFlightTweets.to_csv('USA-flight-tweets.csv', header=True, quoting=1)

Converting USA-flight-tweets.json to USA-flight-tweets.csv
264
