In [2]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import heapq
import string
from nltk.corpus import stopwords

In [3]:
def readGz(path):
    """Yield one Python object per line of a gzip-compressed text file.

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file; it is opened in text mode (``'rt'``).

    Yields
    ------
    object
        The value produced by ``eval`` for each line of the file.
    """
    # SECURITY(review): eval() executes whatever expression a line contains,
    # so this must only be pointed at trusted files. ast.literal_eval would be
    # a safer drop-in if every line is a plain literal -- TODO confirm.
    # The `with` block closes the handle as soon as the generator is exhausted
    # or closed, rather than whenever the garbage collector gets to it.
    with gzip.open(path, 'rt') as lines:
        for l in lines:
            yield eval(l)

In [4]:
def parseData(fname):
    """Yield one Python object per line of a gzip-compressed file.

    Parameters
    ----------
    fname : str
        Location of the ``.gz`` file; it is opened with gzip's default
        (binary) mode, so each line reaches ``eval`` as ``bytes``.

    Yields
    ------
    object
        The value produced by ``eval`` for each line of the file.
    """
    # SECURITY(review): eval() executes whatever expression a line contains,
    # so this must only be pointed at trusted files. ast.literal_eval would be
    # a safer drop-in if every line is a plain literal -- TODO confirm.
    # The `with` block closes the handle as soon as the generator is exhausted
    # or closed, rather than whenever the garbage collector gets to it.
    with gzip.open(fname) as lines:
        for l in lines:
            d = eval(l)
            yield d

In [5]:
# Read every record of the CA_5 file into memory up front.
data = [*parseData("CA_5.json.gz")]

In [6]:
# Read every record of the places file into memory up front.
places = [*parseData('places_CA.json.gz')]

In [7]:
# Read every record of the reviews file into memory up front.
reviews = [*parseData('reviews_CA.json.gz')]

In [8]:
# Peek at the first parsed element of `data`.
# NOTE(review): the recorded output below is a *list* of review dicts, while
# later cells index each element of `data` with d['reviewText'] as if it were
# a single dict -- confirm the structure of CA_5.json.gz on a fresh kernel.
data[0]

[{'rating': 4.0,
  'reviewerName': 'Mary Gainza',
  'reviewText': 'Gap always has a jean that fits awesomely, i wish their prices were a bit competitive in comparison to their outlets, also the amount of the pieces they have in store sometimes fly so quick that is hard to find an specific size with their best sellers, but in general i really like this store and most of their products.',
  'categories': ['Clothing Store',
   "Women's Clothing Store",
   "Children's Clothing Store"],
  'gPlusPlaceId': '100556368174926958612',
  'unixReviewTime': 1355436757,
  'reviewTime': 'Dec 13, 2012',
  'gPlusUserId': '100000715097692381911'},
 {'rating': 4.0,
  'reviewerName': 'Mary Gainza',
  'reviewText': 'Madewell girls are always nice and smiley, i really love the clothing at the madewell store, specially when they have sales item they are super nice, their dresses and also the little details they have for little gifts, like necklaces and rings.',
  'categories': ["Women's Clothing Store"],
  'g

In [9]:
# Show one record (index 9) from `places` to see which fields it carries.
places[9]

{'name': 'Nail Perfection',
 'price': None,
 'address': ['556 Las Posas Rd', 'Camarillo, CA 93010'],
 'hours': None,
 'phone': '(805) 987-0992',
 'closed': False,
 'gPlusPlaceId': '100068746766818502566',
 'gps': [34.218397, -119.069823]}

In [10]:
# Show the first record from `reviews` to see which fields it carries.
reviews[0]

{'rating': 4.0,
 'reviewerName': 'william spindler',
 'reviewText': 'Best War Wanton soup in Red Bluff',
 'categories': ['Asian Restaurant', 'Chinese Restaurant'],
 'gPlusPlaceId': '106591714648856494903',
 'unixReviewTime': 1394669496,
 'reviewTime': 'Mar 12, 2014',
 'gPlusUserId': '100000032416892623125'}

In [11]:
# First pass: tally raw whitespace-separated tokens exactly as written
# (case and punctuation untouched). Records whose text is not a str are skipped.
wordCount = defaultdict(int)
for d in data:
    if type(d['reviewText']) != str:
        continue
    for w in d['reviewText'].split():
        wordCount[w] += 1

In [12]:
# Second pass: tally tokens after lower-casing and stripping ASCII punctuation.
# Records whose text is not a str are skipped.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    if type(d['reviewText']) != str:
        continue
    r = ''.join(c for c in d['reviewText'].lower() if c not in punctuation)
    for w in r.split():
        wordCount[w] += 1

# (count, word) pairs, most frequent first; ties fall back to reverse
# alphabetical order because the tuples are compared element by element.
counts = sorted(((n, w) for w, n in wordCount.items()), reverse=True)

In [13]:
# Vocabulary: the 1000 most frequent words, in descending-frequency order.
words = [word for _, word in counts[:1000]]

['the',
 'and',
 'a',
 'to',
 'i',
 'is',
 'of',
 'for',
 'in',
 'was',
 'it',
 'but',
 'you',
 'food',
 'good',
 'with',
 'great',
 'this',
 'place',
 'are',
 'they',
 'that',
 'on',
 'my',
 'have',
 'not',
 'very',
 'its',
 'here',
 'at',
 'service',
 'we',
 'so',
 'as',
 'be',
 'had',
 'if',
 'their',
 'like',
 'get',
 'were',
 'there',
 'best',
 'all',
 'one',
 'really',
 'go',
 'can',
 'just',
 'nice',
 'out',
 'or',
 'from',
 'your',
 'love',
 'an',
 'some',
 'too',
 'also',
 'time',
 'always',
 'me',
 'up',
 'when',
 'delicious',
 'restaurant',
 'more',
 'which',
 'dont',
 'well',
 'about',
 'has',
 'back',
 'pretty',
 'would',
 'little',
 'what',
 'friendly',
 'by',
 'staff',
 'only',
 'excellent',
 'our',
 'will',
 'been',
 'amazing',
 'ive',
 'menu',
 'try',
 'bar',
 'than',
 'other',
 'wait',
 'pizza',
 'bit',
 'chicken',
 'much',
 'no',
 'people',
 'better',
 'definitely',
 'come',
 'make',
 'even',
 'coffee',
 'worth',
 'do',
 'atmosphere',
 'though',
 'fresh',
 'favorite'

In [15]:
# Removing stop words: first make sure NLTK's 'stopwords' corpus is available
# locally (the download is skipped when the package is already up to date).
# NOTE(review): this import sits mid-notebook; it would normally live in the
# first cell next to `from nltk.corpus import stopwords`.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mgvasque/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# English stop words as a set, so membership tests are constant-time.
stop_words = {w for w in stopwords.words('english')}

In [17]:
# Display the full stop-word set (long output).
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [18]:
# Third pass: same normalisation as before (lower-case, ASCII punctuation
# stripped), but tokens found in `stop_words` are not counted.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    if type(d['reviewText']) != str:
        continue
    r = ''.join(c for c in d['reviewText'].lower() if c not in punctuation)
    for w in r.split():
        if w in stop_words:
            continue
        wordCount[w] += 1

# (count, word) pairs, most frequent first.
counts = sorted(((n, w) for w, n in wordCount.items()), reverse=True)

In [19]:
# EDA: the ten most frequent words once stop words are excluded,
# shown as (count, word) pairs in descending order.
counts[:10]

[(28827, 'food'),
 (28762, 'good'),
 (27991, 'great'),
 (23042, 'place'),
 (14069, 'service'),
 (11499, 'like'),
 (11397, 'get'),
 (10658, 'best'),
 (10246, 'one'),
 (10117, 'really')]

In [20]:
# Sentiment analysis

In [21]:
# Map each vocabulary word to its column index in the feature vector, and keep
# a set of the same words.
# NOTE(review): `words` was built from the counts computed *before* stop words
# were filtered out, so the vocabulary still contains stop words -- confirm
# that this is intended.
wordId = {word: index for index, word in enumerate(words)}
wordSet = set(words)

In [22]:
def feature(d):
    """Build the count-based feature vector for one review.

    The review text is lower-cased and stripped of the characters in the
    module-level ``punctuation`` set, then split on whitespace. Every
    unigram and every 2- to 5-gram (tokens joined by a single space) that
    appears in the module-level ``wordId`` mapping increments the matching
    position of the vector.

    Parameters
    ----------
    d : dict
        A review record; ``d['reviewText']`` is used when it is a string and
        ignored otherwise.

    Returns
    -------
    list
        ``len(words)`` counts followed by a constant ``1`` (offset term).
    """
    feat = [0] * len(words)
    text = d['reviewText']
    if isinstance(text, str):
        r = ''.join(c for c in text.lower() if c not in punctuation)
        ws = r.split()
        candidates = list(ws)
        # zip() stops at the shortest slice, so this yields exactly the
        # len(ws) - n + 1 consecutive n-grams for n = 2..5.
        # NOTE(review): n-grams contain a space, so they can only match if the
        # vocabulary holds multi-word entries; a vocabulary made of
        # whitespace-split tokens never will.
        for n in range(2, 6):
            candidates.extend(
                ' '.join(gram) for gram in zip(*(ws[i:] for i in range(n)))
            )
        for w in candidates:
            # Dict lookup instead of scanning the `words` list for every token.
            idx = wordId.get(w)
            if idx is not None:
                feat[idx] += 1
    feat.append(1)
    return feat

In [23]:
# Design matrix and targets from the first 60000 records of `data`.
X = list(map(feature, data[:60000]))
y = [review['rating'] for review in data[:60000]]

In [24]:
# Ridge regression with regularisation strength 1.0. No separate intercept is
# fitted because feature() already appends a constant 1 to every vector.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X, y)
theta = clf.coef_
# In-sample predictions: these are made on the same X the model was fitted on.
predictions = clf.predict(X)

In [25]:
# Display the predicted ratings for the reviews the model was fitted on.
predictions

array([4.41584228, 4.33374715, 3.92004417, ..., 4.22580846, 3.92381399,
       3.97584104])