In [3]:
import pandas as pd
from sklearn.utils import shuffle
import re

In [4]:
# Read raw data
train_reviews = pd.read_csv("../../data/yelp/raw_train.csv", names=['rating','review'])

In [5]:
# Split the dataset based on rating values
train_reviews_1 = train_reviews.loc[train_reviews['rating'] == 1]
train_reviews_2 = train_reviews.loc[train_reviews['rating'] == 2]

In [6]:
# shuffle those two datasets, and subset 5% datapoints from each of the two datasets
train_reviews_1 = shuffle(train_reviews_1)
train_reviews_1.reset_index(inplace=True,drop=True)
train_reviews_2 = shuffle(train_reviews_2)
train_reviews_2.reset_index(inplace=True,drop=True)
num = int(len(train_reviews) * 0.05)
train_reviews_1 = train_reviews_1.head(num)
train_reviews_2 = train_reviews_2.head(num)

In [7]:
# concatenate two dfs into one, and suffle the new df
review_subset = pd.concat([train_reviews_1,train_reviews_2])
review_subset = shuffle(review_subset)
review_subset.head()

Unnamed: 0,rating,review
1885,2,"I've been here twice now, and the food is cons..."
5104,2,So my refrigerator broke down and I found Mike...
20628,2,We love the Living Room! My boyfriend and I fr...
80,2,Not at all the stereotypical sleazy bus statio...
16535,2,"Nice place for a decent drink. No sports tv, n..."


In [8]:
review_subset.rating.value_counts()

2    28000
1    28000
Name: rating, dtype: int64

In [9]:
# Splitting the subset by rating to create our new train, val, and test splits
n_total = len(review_subset)
n_train = int(0.7*n_total)
n_val = int(0.15*n_total)
n_test = int(0.15*n_total)

ls = ['train']*int(n_train/2) + ['test']*int(n_test/2)+['val']*int(n_val/2)
ls = shuffle(ls)

review_subset_1 = review_subset.loc[review_subset['rating'] ==1]
review_subset_1['split']= ls
review_subset_2 = review_subset.loc[review_subset['rating'] ==2]
review_subset_2['split']= ls



In [10]:
# combine dfs and shuffle it
final_reviews = pd.concat([review_subset_1,review_subset_2])
final_reviews = shuffle(final_reviews)

In [11]:
final_reviews.head()

Unnamed: 0,rating,review,split
26267,2,Come here for the MEAT. Fresh Cuts and Carne A...,train
23330,2,I tried Boardhouse for the first time today du...,train
19334,2,Always good service,train
11852,1,Mario should be ashamed!\n\nThere is no questi...,train
11066,1,This has been my worst experience at any boba ...,train


In [12]:
final_reviews.split.value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

In [13]:
final_reviews.groupby(['rating','split']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,review
rating,split,Unnamed: 2_level_1
1,test,4200
1,train,19600
1,val,4200
2,test,4200
2,train,19600
2,val,4200


In [14]:
# Preprocess the reviews
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [15]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)

In [16]:
final_reviews.head()

Unnamed: 0,rating,review,split
26267,positive,come here for the meat . fresh cuts and carne ...,train
23330,positive,i tried boardhouse for the first time today du...,train
19334,positive,always good service,train
11852,negative,mario should be ashamed ! n nthere is no quest...,train
11066,negative,this has been my worst experience at any boba ...,train


In [19]:
final_reviews.to_csv("../../data/output_munged_csv", index=False)