In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
from sklearn.model_selection import train_test_split

## Load Full Restaurant Reviews Dataset

In [2]:
# Location of the raw review dump, read as JSON Lines (one review object per line).
#   Alex's checkout:  ../../full_dataset/restaurant_reviews.json   (active)
#   Aaron's checkout: ../../dataset/restaurant_reviews.json
reviews_path = "../../full_dataset/restaurant_reviews.json"
reviews_df = pd.read_json(reviews_path, lines=True)

In [3]:
# Peek at the first five rows to confirm the load and see which columns exist.
reviews_df.head(n=5)

Unnamed: 0,business_id,cool,date,funny,stars,text,useful,user_id
0,--6MefnULPED_I942VcFNA,0,2017-08-17,0,4,This is one of my top 3 places to get BBQ pork...,2,FEg8v92qx3kK4Hu4TF28Fg
1,--6MefnULPED_I942VcFNA,0,2017-05-31,0,3,This restaurant is famous for their BBQ dishes...,0,HPtjvIrhzAUkKsiVkeT4MA
2,--6MefnULPED_I942VcFNA,0,2016-10-23,0,2,Roasted pork is one of my favorite things... A...,1,MpvqV7lQcl15rflTBEUhXA
3,--6MefnULPED_I942VcFNA,0,2017-07-30,0,2,I walked by the restaurant more than 5 years a...,1,x-Gbs8sVid3yhJIoHD6Gfw
4,--6MefnULPED_I942VcFNA,0,2017-02-07,1,2,I came here to order a roast duck over rice to...,0,7Dykd1HolQx8mKPYhYDYSg


## Keep Only Users with More Than 5 Reviews

In [4]:
# Number of reviews per user. The column is selected *before* aggregating so
# pandas counts a single column instead of every column of the frame. The
# result is the same object as before: a Series (despite the `_df` suffix)
# indexed by user_id and named 'business_id', which is what the later join
# relies on when it drops 'business_id_r'.
review_counts_df = reviews_df.groupby('user_id')['business_id'].count()

In [5]:
# Restrict the per-user counts to users with strictly more than 5 reviews,
# then report how many users remain.
has_enough_reviews = review_counts_df.gt(5)
review_counts_df = review_counts_df.loc[has_enough_reviews]
review_counts_df.shape

(111200,)

In [6]:
# Inner-join the reviews (re-indexed by user_id) against the filtered counts so
# that only reviews written by the retained users survive. The counts Series
# is itself named 'business_id', so it arrives suffixed as 'business_id_r' and
# is dropped immediately; it was only needed as a membership filter.
final_reviews_df = (
    reviews_df
    .set_index('user_id')
    .join(review_counts_df, how='inner', rsuffix="_r")
    .drop('business_id_r', axis='columns')
)

In [7]:
# Inspect the joined frame; user_id is now the index rather than a column.
final_reviews_df.head(n=5)

Unnamed: 0_level_0,business_id,cool,date,funny,stars,text,useful
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,0,2008-11-11,0,4,So when you go to a restaurant like this pleas...,0
---1lKK3aKOuomHnwAkAow,1JgaRBX0oiRsvEhHF3ZMjw,1,2011-02-16,7,1,Food was 30 mins late and the pizza guy thinks...,5
---1lKK3aKOuomHnwAkAow,2BbFeotL85cIaBjSq1SWiA,1,2010-10-17,1,1,When you say your a vegetarian don't recomend ...,2
---1lKK3aKOuomHnwAkAow,5aeR9KcboZmhDZlFscnYRA,0,2013-01-16,0,5,So Fresh Mama let us host a Homeschool Board G...,1
---1lKK3aKOuomHnwAkAow,5cbsjFtrntUAeUx51FaFTg,0,2010-11-05,0,1,So I was coming here once a month or so maybe ...,1


In [8]:
# The join left user_id only in the index. Mirror it into a regular column so
# the cells below can use `final_reviews_df.user_id` and so the value is part
# of each record when the frame is exported with orient='records'.
user_id_values = final_reviews_df.index
final_reviews_df['user_id'] = user_id_values
final_reviews_df.head(n=5)

Unnamed: 0_level_0,business_id,cool,date,funny,stars,text,useful,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,0,2008-11-11,0,4,So when you go to a restaurant like this pleas...,0,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,1JgaRBX0oiRsvEhHF3ZMjw,1,2011-02-16,7,1,Food was 30 mins late and the pizza guy thinks...,5,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,2BbFeotL85cIaBjSq1SWiA,1,2010-10-17,1,1,When you say your a vegetarian don't recomend ...,2,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,5aeR9KcboZmhDZlFscnYRA,0,2013-01-16,0,5,So Fresh Mama let us host a Homeschool Board G...,1,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,5cbsjFtrntUAeUx51FaFTg,0,2010-11-05,0,1,So I was coming here once a month or so maybe ...,1,---1lKK3aKOuomHnwAkAow


In [9]:
# Persist the filtered reviews as JSON Lines, one record per row.
#   Alex's checkout:  ../../full_dataset/restaurant_reviews_final.json   (active)
#   Aaron's checkout: ../../dataset/restaurant_reviews_final.json
final_reviews_path = '../../full_dataset/restaurant_reviews_final.json'
final_reviews_df.to_json(final_reviews_path, lines=True, orient='records')

## Split Final Reviews Dataset into Train/Test Sets by User

In [10]:
# Split at the user level rather than the review level: 20% of the distinct
# user IDs are held out, with a fixed seed so the split is repeatable.
# The distinct IDs are computed once and reused for both positional arguments.
# y_train / y_test are not used here; they exist only because the same array
# is passed as both X and y. Only X_train / X_test (the train and test user
# IDs) are needed.
unique_user_ids = final_reviews_df.user_id.unique()
X_train, X_test, y_train, y_test = train_test_split(unique_user_ids,
                                                    unique_user_ids,
                                                    test_size=0.2,
                                                    random_state=123)

In [11]:
# Training reviews: every row whose author is one of the training user IDs.
is_train_user = final_reviews_df['user_id'].isin(X_train)
final_reviews_train_df = final_reviews_df[is_train_user]
print(final_reviews_train_df.shape)
final_reviews_train_df.head(n=5)

(1500464, 8)


Unnamed: 0_level_0,business_id,cool,date,funny,stars,text,useful,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,0,2008-11-11,0,4,So when you go to a restaurant like this pleas...,0,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,1JgaRBX0oiRsvEhHF3ZMjw,1,2011-02-16,7,1,Food was 30 mins late and the pizza guy thinks...,5,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,2BbFeotL85cIaBjSq1SWiA,1,2010-10-17,1,1,When you say your a vegetarian don't recomend ...,2,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,5aeR9KcboZmhDZlFscnYRA,0,2013-01-16,0,5,So Fresh Mama let us host a Homeschool Board G...,1,---1lKK3aKOuomHnwAkAow
---1lKK3aKOuomHnwAkAow,5cbsjFtrntUAeUx51FaFTg,0,2010-11-05,0,1,So I was coming here once a month or so maybe ...,1,---1lKK3aKOuomHnwAkAow


In [12]:
# Test reviews: every row whose author is one of the held-out user IDs.
is_test_user = final_reviews_df['user_id'].isin(X_test)
final_reviews_test_df = final_reviews_df[is_test_user]
print(final_reviews_test_df.shape)

# Sanity check: the train and test subsets must together account for every
# review exactly once, i.e. no row was lost or assigned to both sides.
assert len(final_reviews_train_df) + len(final_reviews_test_df) == len(final_reviews_df), \
    "train/test subsets do not partition final_reviews_df"

final_reviews_test_df.head(n=5)

(373155, 8)


Unnamed: 0_level_0,business_id,cool,date,funny,stars,text,useful,user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
--2vR0DIsmQ6WfcSzKWigw,7dHYudt6OOIjiaxkSvv3lQ,197,2017-11-08,179,5,Auf unserer Rundreise haben wir häufig die Res...,203,--2vR0DIsmQ6WfcSzKWigw
--2vR0DIsmQ6WfcSzKWigw,9edPSkfXKsJmkZYIaOmA7Q,194,2017-11-06,176,4,Nachdem wir die Las Vegas North Premium Outlet...,199,--2vR0DIsmQ6WfcSzKWigw
--2vR0DIsmQ6WfcSzKWigw,El4FC8jcawUVgw_0EIcbaQ,122,2015-10-13,109,4,Das MGM Grand ist mit seinen 6.853 Zimmern ein...,130,--2vR0DIsmQ6WfcSzKWigw
--2vR0DIsmQ6WfcSzKWigw,GHS1rVjO-RMcRB6WJLpCDQ,108,2015-09-16,94,3,In Las Vegas kann man zwischen zwei verschiede...,112,--2vR0DIsmQ6WfcSzKWigw
--2vR0DIsmQ6WfcSzKWigw,IhlKa2x5J4vr47hjDY8Jnw,11,2013-06-02,7,5,Bei einer meiner letzten Besuche in Stuttgart ...,15,--2vR0DIsmQ6WfcSzKWigw


In [14]:
# Persist the training reviews as JSON Lines, one record per row.
#   Alex's checkout:  ../../full_dataset/restaurant_reviews_final_train.json   (active)
#   Aaron's checkout: ../../dataset/restaurant_reviews_final_train.json
train_reviews_path = '../../full_dataset/restaurant_reviews_final_train.json'
final_reviews_train_df.to_json(train_reviews_path, lines=True, orient='records')

In [15]:
# Persist the held-out test reviews as JSON Lines, one record per row.
#   Alex's checkout:  ../../full_dataset/restaurant_reviews_final_test.json   (active)
#   Aaron's checkout: ../../dataset/restaurant_reviews_final_test.json
test_reviews_path = '../../full_dataset/restaurant_reviews_final_test.json'
final_reviews_test_df.to_json(test_reviews_path, lines=True, orient='records')