## Data Understanding

### from YELP Dataset on Kaggle


#### Builiding an initial database with json files

There are several json files on data set from Yelp on Kaggle,
upload to mongodb with

>mongoimport

And build some indexs on the mongodb with

> db.review.createIndex({business_id:1})

In [365]:
#import libraries
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
import pymongo

In [3]:
mc = pymongo.MongoClient()
db = mc['yelp']

Shape of 'Business' and 'Review' Collection

In [4]:
business_coll = db['business']
review_coll = db['review']

In [5]:
business_coll.count_documents({})

385218

In [6]:
cur = business_coll.find({})
next(cur)

{'_id': ObjectId('5cc1e600c171cff98cbbbfc2'),
 'business_id': '1SWheh84yJXfytovILXOAQ',
 'name': 'Arizona Biltmore Golf Club',
 'address': '2818 E Camino Acequia Drive',
 'city': 'Phoenix',
 'state': 'AZ',
 'postal_code': '85016',
 'latitude': 33.5221425,
 'longitude': -112.0184807,
 'stars': 3.0,
 'review_count': 5,
 'is_open': 0,
 'attributes': {'GoodForKids': 'False'},
 'categories': 'Golf, Active Life',
 'hours': None}

In [7]:
cities = business_coll.find({}).distinct("city")

In [8]:
len(cities)

1204

In [9]:
states = business_coll.find({}).distinct("state")

In [10]:
print(states)

['AZ', 'ON', 'NC', 'AB', 'NV', 'OH', 'PA', 'QC', 'WI', 'IL', 'NY', 'SC', 'TX', 'UT', 'NM', 'FL', 'CA', 'VA', 'BAS', 'NE', 'AK', 'XGM', 'WA', 'XWY', 'CON', 'BC', 'GA', 'VT', 'CT', 'AL', 'DUR', 'TN', 'NJ', 'AR', 'XGL', 'DOW']


In [11]:
vegas = business_coll.find({'city':'Las Vegas'})

In [12]:
len(list(vegas))

58740

In [13]:
cur = business_coll.find({})

In [14]:
restuants_vegas = business_coll.find({'categories':{"$regex": u"Restaurants"}, 'city':'Las Vegas'})
df = pd.DataFrame(list(restuants_vegas))

In [15]:
df.head()

Unnamed: 0,_id,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,5cc1e601c171cff98cbbbfd3,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,,0,36.100016,-115.128529,Carluccio's Tivoli Gardens,89119,40,4.0,NV
1,5cc1e601c171cff98cbbbfdb,6055 E Lake Mead Blvd,"{'BikeParking': 'True', 'BusinessParking': '{'...",tstimHoMcYbkSC4eBA1wEg,"Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,"{'Monday': '11:0-21:0', 'Tuesday': '10:0-21:0'...",1,36.195615,-115.040529,Maria's Mexican Restaurant & Bakery,89156,184,4.5,NV
2,5cc1e601c171cff98cbbc00d,6125 Spring Mountain Rd,"{'RestaurantsPriceRange2': '1', 'Ambience': '{...",kANF0dbeoW34s2vwh6Umfw,"Fast Food, Food, Restaurants, Ice Cream & Froz...",Las Vegas,,0,36.125031,-115.22562,Dairy Queen,89146,33,2.0,NV
3,5cc1e601c171cff98cbbc031,4343 N Rancho Dr,,X8mtoSxY8whtmbDlj0D3Aw,"Restaurants, Chinese",Las Vegas,,1,36.238596,-115.233331,Imperial Asian Buffet,89030,4,2.0,NV
4,5cc1e601c171cff98cbbc049,"Artisan Hotel, 1501 W Sahara Ave","{'RestaurantsAttire': ''dressy'', 'Corkage': '...",bJP4l_BGq2CudEu0m-wNjg,"Restaurants, Pizza, Italian, American (New)",Las Vegas,"{'Monday': '16:0-0:0', 'Tuesday': '16:0-0:0', ...",0,36.143672,-115.169792,Artisan Fine Dining Room,89102,3,2.0,NV


In [16]:
review_coll = db['review']

In [17]:
# 7 million reviews
review_coll.count_documents({})

6685900

In [18]:
cur = review_coll.find({})
next(cur)

{'_id': ObjectId('5ceee5ad92ffa1b8f2a3567a'),
 'review_id': 'yi0R0Ugj_xUx_Nek0-_Qig',
 'user_id': 'dacAIZ6fTM6mqwW5uxkskg',
 'business_id': 'ikCg8xy5JIg_NGPx-MSIDA',
 'stars': 5.0,
 'useful': 0,
 'funny': 0,
 'cool': 0,
 'text': "Went in for a lunch. Steak sandwich was delicious, and the Caesar salad had an absolutely delicious dressing, with a perfect amount of dressing, and distributed perfectly across each leaf. I know I'm going on about the salad ... But it was perfect.\n\nDrink prices were pretty good.\n\nThe Server, Dawn, was friendly and accommodating. Very happy with her.\n\nIn summation, a great pub experience. Would go again!",
 'date': '2018-01-09 20:56:38'}

Make an review_dataframe

In [19]:
nv_cur = business_coll.find({'categories':{"$regex": u"Restaurants"}, 'city':'Las Vegas'})

review_list = []
count = 0
for doc in nv_cur:
    review_cur = review_coll.find({
        'business_id': doc['business_id']
    })
    count +=1
    
    for review in review_cur:
        review_list.append({**doc, **review})
        
    # if count == 10:
    #   break

In [20]:
review_df = pd.DataFrame(review_list)
review_df[['name', 'review_count', 'is_open', 'stars', 'text', 'date']].head()

Unnamed: 0,name,review_count,is_open,stars,text,date
0,Carluccio's Tivoli Gardens,40,0,5.0,"We went there for dinner the other night, bein...",2011-06-29 02:55:07
1,Carluccio's Tivoli Gardens,40,0,4.0,i had the best Chicken Marcela ever. The spagh...,2010-10-06 18:20:13
2,Carluccio's Tivoli Gardens,40,0,5.0,Basically the best Italian in town for the pri...,2010-01-13 00:35:45
3,Carluccio's Tivoli Gardens,40,0,3.0,Mmmmm delicious food and a little history. Mr....,2008-08-23 20:30:33
4,Carluccio's Tivoli Gardens,40,0,3.0,"This is old Vegas, this atmosphere is old scho...",2009-06-01 20:02:55


In [247]:
review_df.head(5)

Unnamed: 0,_id,address,attributes,business_id,categories,city,cool,date,funny,hours,...,longitude,name,postal_code,review_count,review_id,stars,state,text,useful,user_id
0,5ceee5ae92ffa1b8f2a35d63,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,0,2011-06-29 02:55:07,0,,...,-115.128529,Carluccio's Tivoli Gardens,89119,40,klcF45wKIOpJW_BhJslOJg,5.0,NV,"We went there for dinner the other night, bein...",1,-Yz2wIcsdJxUOFMbTgoKQA
1,5ceee5ae92ffa1b8f2a373e1,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,0,2010-10-06 18:20:13,0,,...,-115.128529,Carluccio's Tivoli Gardens,89119,40,Li-pQG6A7p5gbgZHTMeDSQ,4.0,NV,i had the best Chicken Marcela ever. The spagh...,1,jYcf_e5p0UG0S-9gJq_tNA
2,5ceee5af92ffa1b8f2a3e850,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,0,2010-01-13 00:35:45,0,,...,-115.128529,Carluccio's Tivoli Gardens,89119,40,iRLX3dJ3ONvncIxPnXy1cw,5.0,NV,Basically the best Italian in town for the pri...,1,nQC0JiPIk_jCooRDxpuw5A
3,5ceee5b092ffa1b8f2a4786f,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,0,2008-08-23 20:30:33,0,,...,-115.128529,Carluccio's Tivoli Gardens,89119,40,rklteWf9xnTU3fAtMFBRRw,3.0,NV,Mmmmm delicious food and a little history. Mr....,1,Gv_-mtOKhWFtCjn9xFe0SQ
4,5ceee5b192ffa1b8f2a4a7f2,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...",PZ-LZzSlhSe9utkQYU8pFg,"Restaurants, Italian",Las Vegas,0,2009-06-01 20:02:55,0,,...,-115.128529,Carluccio's Tivoli Gardens,89119,40,UfRqM0RGdZa86hFcFEAnjw,3.0,NV,"This is old Vegas, this atmosphere is old scho...",1,pabMYegF28KjHQ5hybAJ0A


In [145]:
# pickle / unpickle raw dataframe
review_df.to_pickle('review_df.pkl')
# df_raw = pd.read_pickle('df_raw.pkl')

In [252]:
review_df_sub = review_df.loc[:, ['text', 'stars']]

In [256]:
# 2.5 million reviews
review_df_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2485422 entries, 0 to 2485421
Data columns (total 2 columns):
text     object
stars    float64
dtypes: float64(1), object(1)
memory usage: 37.9+ MB


In [458]:
review_df_sub.head(5)

Unnamed: 0,text,stars
0,"We went there for dinner the other night, bein...",5.0
1,i had the best Chicken Marcela ever. The spagh...,4.0
2,Basically the best Italian in town for the pri...,5.0
3,Mmmmm delicious food and a little history. Mr....,3.0
4,"This is old Vegas, this atmosphere is old scho...",3.0


#### Let's check the stars

In [22]:
# stars = review_coll.find({}).distinct("stars")
# sorted(stars)

[1.0, 2.0, 3.0, 4.0, 5.0]

In [None]:
# df.hist('stars');

In [23]:
# # validates idea that restaurant rating was an aggregate score across all reviews
# cur = business_coll.find({
#     'city': 'Las Vegas'
# })

# for i, doc in enumerate(cur):
#     review_cur = review_coll.find({
#         'business_id': doc['business_id']
#     })
#     stars = 0
#     counts = 0
#     for review in review_cur:
#         stars += review['stars']
#         counts += 1

#     print(doc['stars'], stars/counts)
    
#     if i == 10:
#         break

3.5 3.6666666666666665
4.0 4.097560975609756
5.0 4.761904761904762
4.5 4.421052631578948
4.0 3.890909090909091
5.0 4.75
4.5 4.301587301587301
4.5 4.37037037037037
3.5 3.289473684210526
1.5 1.4736842105263157
5.0 5.0


#### Partition data

In [481]:
# set train & test variables
train_features_raw, test_features_raw, train_labels_raw, test_labels_raw = train_test_split(review_df_sub['text'], 
                                                                            review_df_sub['stars'], 
                                                                            test_size=0.2, 
                                                                            random_state=42)


In [482]:
len(train_features_raw), len(train_labels_raw), len(test_features_raw), len(test_labels_raw)

(1988337, 1988337, 497085, 497085)

In [None]:
# pipe = Pipeline([
#     ('count_vect', TfidfVectorizer(**kwargs_vect)),
#     ('logreg_model', LogisticRegression())
# ])

#### Create pipeline

In [468]:
kwargs_vect = {
            'ngram_range': (1, 1),
            'stop_words': 'english',
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': 'word',
            'min_df': 2,
}

In [469]:
# text_columns = review_df_sub['text']
text_transformer = Pipeline(steps=[
    ('token_vect', TfidfVectorizer(**kwargs_vect))])

In [478]:
#
pipe_preprocessing = ColumnTransformer(
    transformers=[
        ('text_transformer', text_transformer, ['text'])
        ], remainder='passthrough')

In [475]:
#
pipe_1 = Pipeline(steps=[('pipe_preprocessing', pipe_preprocessing),
                         ('logreg_model', LogisticRegression())])

In [476]:
pipe_1.fit(train_features_raw, train_labels_raw)

IndexError: tuple index out of range

In [457]:
text_transformer.fit(train_features_raw)

Pipeline(memory=None,
     steps=[('token_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='replace',
        dtype='int32', encoding='utf-8', input='content', lowercase=True,
        max_df=1.0, max_features=None, min_df=2, ngram_range=(1, 1),
        norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))])

In [467]:
pipe_preprocessing.fit(train_features_raw)

IndexError: tuple index out of range

In [403]:
pipe_1.named_steps

{'pipe_preprocessing': ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
          transformer_weights=None,
          transformers=[('text_transformer', Pipeline(memory=None,
      steps=[('token_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='replace',
         dtype='int32', encoding='utf-8', input='content', lowercase=True,
         max_df=1.0, max_features=None, min_df=2, ngram_range=(1, 1),
   ...n_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None))]), ['text'])],
          verbose=False),
 'logreg_model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

In [405]:
pipe_1.fit_transform(train_features_raw, train_labels)

IndexError: tuple index out of range

In [404]:
pipe_1.named_steps['pipe_preprocessing'].transform(test_features_raw)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

#### first_model

In [483]:
kwargs_vect = {
            'ngram_range': (1, 1),
            'stop_words': 'english',
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': 'word',
            'min_df': 2,
}

In [484]:
vectorizer = TfidfVectorizer(**kwargs)

In [486]:
x_train = vectorizer.fit_transform(train_features_raw)

In [105]:
#
x_train.shape

(1988337, 194175)

In [58]:
model = LogisticRegression()

In [59]:
model.fit(x_train, train_labels_raw)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
x_test = vectorizer.transform(test_features_raw)

In [62]:
y_preds = model.predict(x_test)
y_preds.shape

(497085,)

In [63]:
# calculate probability estimates, returned estimates for all classes, ordered by label of classes.
# if multi_class set to “multinomial” the softmax function is used to find the predicted probability of each class. Else use a one-vs-rest approach, i.e calculate the probability of each class assuming it to be positive using the logistic function. and normalize these values across all the classes.
y_probas = model.predict_proba(x_test)
y_probas.shape

(497085, 5)

In [64]:
# calculate subset accuracy
accuracy = accuracy_score(test_labels, y_preds)
accuracy

0.6616353339972036

In [65]:
# calculate number of correct predictions
accuracy2 = accuracy_score(test_labels, y_preds, normalize=False)
accuracy2

328889

In [66]:
# calculate loss function using log loss (cross-entropy loss), the negative 
# log-likelihood of true labels given a probabilistic classifier's predictions

logloss = log_loss(test_labels, y_probas)
logloss

0.8151836878756975

In [None]:
# count = 0
# total = 0

# for i, value in enumerate(test_labels):
#     if value == pred[i]:
#         count += 1
#     total = i+1
        
# print(count / total)

In [None]:
# test_df = pd.DataFrame(pred)

In [None]:
# test_df.hist()

In [None]:
# len(test_df[test_df[0] == 5]) / len(test_df)

In [None]:
# len(review_df[review_df['stars'] == 4]) / len(review_df)

In [None]:
# review_df.hist('stars')

#### feature selection from 2MM to 20K and run same logistical regression model

Measure features importance using ANOVA F-Value and reduce features from 2 million to 20,000
Drop tokens.
Measure feature importance (how much each token contributes to label predictions),
and only include the most informative tokens.

Take features and corresponding labels and output the feature importance score. 
Two commonly used functions are f_classif and chi2, which perform well per Google experiments

In [120]:
# computes ANOVA F-value for provided sample

In [150]:
NGRAM_RANGE = (1, 1)
STOP_WORDS = 'english'
TOKEN_MODE = 'word'
MIN_DOCUMENT_FREQUENCY = 2

In [151]:
kwargs = {
            'ngram_range': NGRAM_RANGE,
            'stop_words': STOP_WORDS,
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'min_df': MIN_DOCUMENT_FREQUENCY,
}

In [152]:
vectorizer_ksel = TfidfVectorizer(**kwargs)

In [153]:
TOP_K = 20000

In [154]:
# Select top 'k' of the vectorized features.
selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    

In [155]:
selector.fit(x_train, train_labels)
    

SelectKBest(k=20000, score_func=<function f_classif at 0x1bf079c80>)

In [156]:
x_train_ksel = selector.transform(x_train).astype('float32')

In [157]:
x_train_ksel.shape

(1988337, 20000)

In [158]:
# ratio of features to observations = 

In [159]:
x_val_ksel = selector.transform(x_test).astype('float32')

In [160]:
x_val_ksel.shape

(497085, 20000)

In [161]:
model_ksel = LogisticRegression()

In [162]:
model_ksel.fit(x_train_ksel, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [143]:
model_ksel.classes_

array([1., 2., 3., 4., 5.])

In [None]:
model_

In [163]:
y_preds_ksel = model_ksel.predict(x_val_ksel)
y_preds_ksel.shape

(497085,)

In [164]:
y_probas_ksel = model_ksel.predict_proba(x_val_ksel)
y_probas_ksel.shape

(497085, 5)

In [241]:
accuracy_ksel = accuracy_score(test_labels, y_preds_ksel)
accuracy_ksel

0.6557912630636611

In [242]:
accuracy2_ksel = accuracy_score(test_labels, y_preds_ksel, normalize=False)
accuracy2_ksel

325984

In [243]:
logloss_ksel = log_loss(test_labels, y_probas_ksel)
logloss_ksel

0.8248980531024912

### n-gram variations models

In [195]:
NGRAM_RANGE = (2, 2)

TOP_K = 20000
STOP_WORDS = 'english'
TOKEN_MODE = 'word'
MIN_DOCUMENT_FREQUENCY = 2

In [196]:
kwargs = {
            'ngram_range': NGRAM_RANGE,
            'stop_words': STOP_WORDS,
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,
            'min_df': MIN_DOCUMENT_FREQUENCY,
}

In [197]:
vectorizer_bi = TfidfVectorizer(**kwargs)

In [198]:
x_train = vectorizer_bi.fit_transform(train_features)

In [199]:
# Select top 'k' of the vectorized features.
selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))

In [200]:
selector.fit(x_train, train_labels)

SelectKBest(k=20000, score_func=<function f_classif at 0x1bf079c80>)

In [201]:
x_train = selector.transform(x_train).astype('float32')

In [202]:
x_train.shape

(1988337, 20000)

In [204]:
x_test = vectorizer_bi.transform(test_features)

In [210]:
x_val = selector.transform(x_test).astype('float32')

In [211]:
x_val.shape

(497085, 20000)

In [205]:
model = LogisticRegression()

In [206]:
model.fit(x_train, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [212]:
# TODO (Lee) question for Miles - without having donw selector - feature number mismatch ?
y_preds_bi = model.predict(x_val)
y_preds_bi.shape

(497085,)

In [213]:
y_preds_bi = model.predict(x_val)
y_preds_bi.shape

(497085,)

In [214]:
y_probas_bi = model.predict_proba(x_val)
y_probas_bi.shape

(497085, 5)

In [239]:
accuracy_bi = accuracy_score(test_labels, y_preds_bi)
accuracy_bi

0.6279549775189354

In [240]:
logloss_bi = log_loss(test_labels, y_probas_bi)
logloss_bi

0.9108587796559224

In [245]:
logloss, logloss_bi, logloss_ksel

(0.8151836878756975, 0.9108587796559224, 0.8248980531024912)

In [246]:
accuracy, accuracy_bi, accuracy_ksel

(0.6616353339972036, 0.6279549775189354, 0.6557912630636611)

In [None]:
# stemming and lemmatization

In [217]:
y_probas_bi[0]

array([0.03306378, 0.02442959, 0.03171863, 0.04111088, 0.86967713])

In [218]:
y_probas[0]

array([0.05474705, 0.00567109, 0.00933909, 0.01362086, 0.9166219 ])

In [219]:
y_probas_ksel[0]

array([0.0856819 , 0.00453427, 0.00444504, 0.02894287, 0.87639592])

In [223]:
x_val[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [230]:
x_test[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [231]:
y_preds_bi[0]

5.0

In [232]:
y_preds[0]

5.0

In [233]:
y_preds_ksel[0]

5.0

In [234]:
y_preds[0:10]

array([5., 5., 5., 3., 1., 5., 5., 5., 5., 5.])

In [235]:
y_preds_bi[0:10]

array([5., 1., 5., 3., 1., 5., 5., 4., 5., 5.])

In [236]:
y_preds_ksel[0:10]

array([5., 1., 5., 3., 1., 5., 5., 5., 5., 5.])