In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import datetime
import multiprocessing as mp

In [2]:
from os import listdir
from os.path import isfile, join
mypath='/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/'
onlyfiles = [ join(mypath,f) for f in listdir(mypath) if isfile(join(mypath,f)) ]
print onlyfiles

['/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/25.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/24.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/9.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/32.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/18.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/1.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/19.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/31.json', '/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_review_subsets_50000/14.

In [3]:
def test_fcn(infile):
    print infile
    return infile

def read_file_do_sentiment(infile):
    start_read_time = datetime.datetime.now()
    # read the entire file into a python array
    print '  Reading file: %s'%(infile)
    with open(infile, 'rb') as f:
        data = f.readlines()

    # remove the trailing "\n" from each line
    data = map(lambda x: x.rstrip(), data)

    # each element of 'data' is an individual JSON object.
    # i want to convert it into an *array* of JSON objects
    # which, in and of itself, is one large JSON object
    # basically... add square brackets to the beginning
    # and end, and have all the individual business JSON objects
    # separated by a comma
    data_json_str = "[" + ','.join(data) + "]"

    temp_df = pd.read_json(data_json_str)
    
    end_read_time = datetime.datetime.now()
    read_delta = end_read_time - start_read_time
    read_sec = read_delta.total_seconds() #seconds
    print '    read duration(s) = %.3f'%(read_sec)
    
    start_sentiment_time = datetime.datetime.now()
    
    polarity_list = [0 for x in range(len(temp_df))]
    subjectivity_list = [0 for x in range(len(temp_df))]

    for j in range(len(temp_df)):
        blob = TextBlob(temp_df['text'].iloc[j])
        polarity_list[j] = blob.sentiment.polarity
        subjectivity_list[j] = blob.sentiment.subjectivity
    
    polarity_series = pd.Series(polarity_list,index=temp_df.index)
    subjectivity_series = pd.Series(subjectivity_list,index=temp_df.index)

    temp_df['polarity'] = polarity_series
    temp_df['subjectivity'] = subjectivity_series
    
    temp_df.drop(['review_id','type','text','user_id',],axis=1,inplace=True)

    
    end_sentiment_time = datetime.datetime.now()
    sentiment_delta = end_sentiment_time - start_sentiment_time
    sentiment_sec = sentiment_delta.total_seconds() #seconds
    print '    sentiment duration(s) = %.3f'%(sentiment_sec)
    
    return temp_df

In [4]:
start_time = datetime.datetime.now()

pool = mp.Pool(processes=mp.cpu_count()-1)
results = [pool.apply_async(read_file_do_sentiment, args=(infile,)) for infile in onlyfiles]
output = [p.get() for p in results]
print output

df = pd.concat(output,ignore_index=True)
df.to_csv("/home/danielj/insight/project/data/yelp_dataset_challenge_academic_dataset/yelp_reviews_sentiment.csv")

end_time = datetime.datetime.now()
delta = end_time - start_time
sec = delta.total_seconds() # seconds
print 'Total loop duration(s) = %.3f'%sec
print len(df)
print df.head(5)
print df.tail(5)

[                  business_id       date  stars  \
0      jtzhY-P4H6WSYpv5rWhxtw 2014-08-28      5   
1      jtzhY-P4H6WSYpv5rWhxtw 2014-09-05      2   
2      jtzhY-P4H6WSYpv5rWhxtw 2014-09-07      3   
3      jtzhY-P4H6WSYpv5rWhxtw 2014-09-09      5   
4      jtzhY-P4H6WSYpv5rWhxtw 2014-09-24      5   
5      jtzhY-P4H6WSYpv5rWhxtw 2014-09-24      3   
6      jtzhY-P4H6WSYpv5rWhxtw 2014-09-29      2   
7      jtzhY-P4H6WSYpv5rWhxtw 2014-09-30      4   
8      jtzhY-P4H6WSYpv5rWhxtw 2014-10-05      4   
9      jtzhY-P4H6WSYpv5rWhxtw 2014-10-06      2   
10     jtzhY-P4H6WSYpv5rWhxtw 2014-10-06      4   
11     jtzhY-P4H6WSYpv5rWhxtw 2014-10-07      1   
12     jtzhY-P4H6WSYpv5rWhxtw 2014-10-11      4   
13     jtzhY-P4H6WSYpv5rWhxtw 2014-10-13      4   
14     jtzhY-P4H6WSYpv5rWhxtw 2014-10-16      2   
15     jtzhY-P4H6WSYpv5rWhxtw 2014-10-17      3   
16     jtzhY-P4H6WSYpv5rWhxtw 2014-10-22      4   
17     jtzhY-P4H6WSYpv5rWhxtw 2014-10-23      4   
18     jtzhY-P4H6WSYpv5rWhxtw 