In [1]:
import pandas as pd
from textblob import TextBlob

In [2]:
#read in the data, remove NaNs, reset index
data = pd.read_csv("review_final_academic.csv")
data = data[pd.notnull(data['text'])]
data = data.reset_index(drop=True)

In [3]:
#decode review text to prevent decoding errors when breaking it into sentences
def decodeArray(text):
    try:
        return text.decode('utf-8','replace')
    except:
        return 0
    
data.text = data.text.apply(decodeArray)

In [4]:
#break review text into a list of sentences
#this step took about 2 hours to run on my mac
for i in range(0,len(data)):
    data.text[i] = TextBlob(data.text[i]).raw_sentences
    if i%10000 == 0:
        print("Processing the %dth review, %d more to go... Please be patient..." % (i,len(data)-i))

Processing the 0th review, 122641 more to go... Please be patient...
Processing the 10000th review, 112641 more to go... Please be patient...
Processing the 20000th review, 102641 more to go... Please be patient...
Processing the 30000th review, 92641 more to go... Please be patient...
Processing the 40000th review, 82641 more to go... Please be patient...
Processing the 50000th review, 72641 more to go... Please be patient...
Processing the 60000th review, 62641 more to go... Please be patient...
Processing the 70000th review, 52641 more to go... Please be patient...
Processing the 80000th review, 42641 more to go... Please be patient...
Processing the 90000th review, 32641 more to go... Please be patient...
Processing the 100000th review, 22641 more to go... Please be patient...
Processing the 110000th review, 12641 more to go... Please be patient...
Processing the 120000th review, 2641 more to go... Please be patient...


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
#convert each sentence into a dataframe row preserving all other column information
data_sentences = pd.concat([pd.DataFrame(dict(zip(data.columns,data.ix[i]))) for i in range(len(data))], ignore_index=True)

In [6]:
#1,119,635 sentences from 122,641 reviews
len(data_sentences)

1119635

In [7]:
data_sentences.head()

Unnamed: 0.1,Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,0,qHmamQPCAKkia9X0uryA8g,2006-09-23,M8G9Rs21i4euIo3T5gyGOg,4,Are you drunk?,review,Xsp1amevfceAqAMjKhZkgA,"{u'funny': 0, u'useful': 1, u'cool': 0}"
1,0,qHmamQPCAKkia9X0uryA8g,2006-09-23,M8G9Rs21i4euIo3T5gyGOg,4,Is it around 3am?,review,Xsp1amevfceAqAMjKhZkgA,"{u'funny': 0, u'useful': 1, u'cool': 0}"
2,0,qHmamQPCAKkia9X0uryA8g,2006-09-23,M8G9Rs21i4euIo3T5gyGOg,4,Are you in downtown Berkeley?,review,Xsp1amevfceAqAMjKhZkgA,"{u'funny': 0, u'useful': 1, u'cool': 0}"
3,0,qHmamQPCAKkia9X0uryA8g,2006-09-23,M8G9Rs21i4euIo3T5gyGOg,4,"Well then, you better head down to Top Dog, pu...",review,Xsp1amevfceAqAMjKhZkgA,"{u'funny': 0, u'useful': 1, u'cool': 0}"
4,0,qHmamQPCAKkia9X0uryA8g,2006-09-23,M8G9Rs21i4euIo3T5gyGOg,4,"They have around, what, 25, 30 different kinds...",review,Xsp1amevfceAqAMjKhZkgA,"{u'funny': 0, u'useful': 1, u'cool': 0}"


In [9]:
#save as csv file
data_sentences.to_csv('review_sentences_final_academic.csv', encoding='utf-8')

In [510]:
#service words selected from:
#1) http://www.vladsandulescu.com/topic-prediction-lda-user-reviews/
#2) Doug's NMF run
#3) Sudeep's slides: http://www.slideshare.net/SudeepDasPhD/chicago-june19sudeepdas

service_words = ['management', 'manager', 'waitress', 'waiter', 'staff', 'employee', 'cashier', 'server', 'guy', 'customer', #people nouns
                 'service', 'counter', 'minute', 'attention', 'knowledge', 'nerve', #other nouns
                 'greet', 'seated', 'taking', 'treated', 'waited', 'forgot', 'apologize', #verbs
                 'friendly', 'professional', 'prompt', 'attentive', 'helpful', 'slow', 'rude', 'untrained', 'unattended'] #adjectives

In [511]:
len(service_words) 

32

In [512]:
#select sentences containing only the service words
all_data = []
for i in service_words:
    all_data.append(data_sentences[data_sentences['text'].str.contains(i)])

In [513]:
#concaternate all dataframes, sort, drop duplicate rows and reset index
test_data = pd.concat(all_data)
test_data = test_data.sort().drop_duplicates()
test_data = test_data.reset_index(drop=True)

In [514]:
#number of service-related sentences
len(test_data)

113278

In [515]:
#save as csv file
test_data.to_csv('review_sentences_final_academic_service_words_NEW.csv', encoding='utf-8')

In [516]:
servicesentences = pd.read_csv('review_sentences_final_academic_service_words_NEW.csv')

In [517]:
servicesentences.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,date,review_id,stars,text,type,user_id,votes
0,0,5,qHmamQPCAKkia9X0uryA8g,2012-06-15,hIrFN-5jhCo04AvhsNtimg,5,You order in 3 seconds and you're out in 5 min...,review,P-xcy872BvGcClkNpNlPqQ,"{u'funny': 0, u'useful': 1, u'cool': 1}"
1,1,7,qHmamQPCAKkia9X0uryA8g,2010-12-10,C3XPzVWPoqK_FpgItJFtjg,4,The guy behind the counter was very friendly a...,review,S6BXOUedzPH58K0rYY1loQ,"{u'funny': 0, u'useful': 0, u'cool': 0}"
2,2,8,qHmamQPCAKkia9X0uryA8g,2008-05-05,q39g4U6OFLaDF0gekuOHBg,3,The service is terrible.,review,3Rn993btYslL-3D902Bxlw,"{u'funny': 0, u'useful': 0, u'cool': 0}"
3,3,8,qHmamQPCAKkia9X0uryA8g,2008-05-05,q39g4U6OFLaDF0gekuOHBg,3,"The other customers, loud and rowdy and someti...",review,3Rn993btYslL-3D902Bxlw,"{u'funny': 0, u'useful': 0, u'cool': 0}"
4,4,9,qHmamQPCAKkia9X0uryA8g,2008-01-16,yG5AdbbKaszHbIbwbuvR3A,4,For the budget-conscious student price of $2.7...,review,m07sy7eLtOjVdZ8oN9JKag,"{u'funny': 11, u'useful': 12, u'cool': 14}"
