In [4]:
import pandas as pd
import os.path

In [7]:
# Input locations. DATA_FOLDER is the root directory; the two *_PATH values
# are relative to it and get joined onto it when the files are loaded.
DATA_FOLDER = './datasets/'
REVIEWS_PATH = 'BeerAdvocate/reviews.pkl'        # pickled reviews table
ABSA_RESULTS_PATH = 'results/absa_results.csv'   # ABSA results, CSV

In [8]:
# Load the two inputs: the ABSA results (CSV) and the reviews table (pickle).
absa_results_file = os.path.join(DATA_FOLDER, ABSA_RESULTS_PATH)
reviews_file = os.path.join(DATA_FOLDER, REVIEWS_PATH)

absa_results = pd.read_csv(absa_results_file)
# NOTE(review): unpickling can execute arbitrary code - only load a
# reviews.pkl that comes from a trusted source.
reviews = pd.read_pickle(reviews_file)

In [9]:
# Drop the 'Text' column from the ABSA results: `reviews` carries its own
# 'Text' column, which is the one kept by the later merge on 'Row Index'.
# errors='ignore' makes this cell safe to re-run: once the column has been
# removed, a second execution is a no-op instead of raising KeyError.
absa_results = absa_results.drop(columns='Text', errors='ignore')

In [10]:
# Copy the index labels of `reviews` into a regular 'Row Index' column.
# The ABSA results are joined back onto the reviews using this column
# (merge on 'Row Index' below). Note that this adds the column to `reviews`
# in place.
reviews['Row Index'] = reviews.index

In [11]:
# Keep only the columns used by the matching pipeline: identifiers, the
# matching keys, the date, the six rating columns, the review text and the
# 'Row Index' join key.
columns_to_keep = [
    'Beer Id', 'Brewery Id', 'Style', 'Abv', 'Date', 'User Id',
    'Appearance', 'Aroma', 'Palate', 'Taste', 'Overall', 'Rating',
    'Text', 'Row Index',
]
reviews_clean = reviews.loc[:, columns_to_keep]

In [12]:
# Attach the ABSA columns to each review via the shared 'Row Index' key.
# This is an inner join (the pandas default), so a review without a matching
# ABSA row is dropped from reviews_clean.
reviews_clean = pd.merge(
    reviews_clean,
    absa_results,
    how='inner',
    on='Row Index',
)

In [13]:
# Self-join: pair every review with every review that shares the same user,
# beer style and Abv. Columns of the left review get suffix '1', those of the
# right review suffix '2'; the key columns appear once, unsuffixed.
# The result holds ordered pairs, including each review paired with itself.
# NOTE(review): pandas matches null join keys against each other, so reviews
# with a missing Abv would be paired together - confirm Abv has no NaNs.
pair_keys = ['User Id', 'Style', 'Abv']
reviews_cartesian = pd.merge(
    reviews_clean,
    reviews_clean,
    how='inner',
    on=pair_keys,
    suffixes=('1', '2'),
)

In [14]:
# Keep only the pairs in which review 1 scores strictly higher than review 2
# on every one of the six rating columns. Because the comparison is strict,
# self-pairs and pairs tied on any aspect are removed as well.
rating_aspects = ['Rating', 'Overall', 'Aroma', 'Appearance', 'Taste', 'Palate']
aspect_wins = [
    reviews_cartesian[aspect + '1'] > reviews_cartesian[aspect + '2']
    for aspect in rating_aspects
]

# AND the per-aspect masks together, left to right.
first_strictly_better = aspect_wins[0]
for aspect_win in aspect_wins[1:]:
    first_strictly_better = first_strictly_better & aspect_win

# .copy() so that new columns can be added to reviews_pairs afterwards.
reviews_pairs = reviews_cartesian[first_strictly_better].copy()

In [15]:
# For both sides of each pair, derive the month of the review date and the
# len() of the review text. Two separate loops keep the new columns in the
# order Month1, Month2, Text Length1, Text Length2.
pair_sides = ('1', '2')
for side in pair_sides:
    reviews_pairs['Month' + side] = reviews_pairs['Date' + side].apply(
        lambda review_date: review_date.month
    )
for side in pair_sides:
    reviews_pairs['Text Length' + side] = reviews_pairs['Text' + side].apply(len)

In [16]:
# Month numbers belonging to each "season". As defined here the seasons are
# the four calendar quarters (Jan-Mar, Apr-Jun, Jul-Sep, Oct-Dec).
winter_months = list(range(1, 4))
spring_months = list(range(4, 7))
summer_months = list(range(7, 10))
autumn_months = list(range(10, 13))

In [17]:
# Keep only the pairs whose two reviews fall in the same season, i.e. both
# months belong to the same season list. Only the month is compared, so the
# two reviews may come from different years.
season_month_lists = (winter_months, spring_months, summer_months, autumn_months)
both_in_season = [
    reviews_pairs['Month1'].isin(months) & reviews_pairs['Month2'].isin(months)
    for months in season_month_lists
]

# OR the per-season masks together, left to right.
same_season = both_in_season[0]
for season_mask in both_in_season[1:]:
    same_season = same_season | season_mask

seasonal_pairs = reviews_pairs[same_season]

In [18]:
# Thresholds for matching pairs on text length (lengths are len() of 'Text').
# A pair is dropped when either text is longer than maximum_text_size ...
maximum_text_size = 2000
# ... and kept only when the two lengths differ by less than this amount.
matched_text_difference = 100

In [19]:
# Keep the pairs whose two texts have similar length: the absolute difference
# must be strictly below matched_text_difference.
text_length_gap = (seasonal_pairs['Text Length1'] - seasonal_pairs['Text Length2']).abs()
matched_len_reviews_pairs = seasonal_pairs[text_length_gap < matched_text_difference]

In [20]:
# Collect the pairs in which at least one of the two texts is strictly longer
# than maximum_text_size; these are the rows to be excluded.
first_too_long = matched_len_reviews_pairs['Text Length1'] > maximum_text_size
second_too_long = matched_len_reviews_pairs['Text Length2'] > maximum_text_size
long_reviews = matched_len_reviews_pairs[first_too_long | second_too_long]

In [21]:
# Remove the over-long pairs (identified by their index labels) to obtain the
# final set of matched pairs, then report its shape.
matched_pairs = matched_len_reviews_pairs.drop(long_reviews.index, axis=0)
print('Matched review pairs shape after dropping too long texts:', matched_pairs.shape)

Matched review pairs shape after dropping too long texts: (61957, 53)


In [22]:
# Split the matched pairs into one independent frame per season. Testing
# 'Month1' alone is enough: seasonal_pairs only kept pairs whose two months
# fall in the same season list.
season_months = (
    ('winter', winter_months),
    ('spring', spring_months),
    ('summer', summer_months),
    ('autumn', autumn_months),
)
winter_pairs, spring_pairs, summer_pairs, autumn_pairs = (
    matched_pairs[matched_pairs['Month1'].isin(months)].copy()
    for _, months in season_months
)

# Report the size of each seasonal subset.
pairs_per_season = (winter_pairs, spring_pairs, summer_pairs, autumn_pairs)
for (season_name, _), season_frame in zip(season_months, pairs_per_season):
    print('Matched {} review pairs shape: {}'.format(season_name, season_frame.shape))

Matched winter review pairs shape: (14757, 53)
Matched spring review pairs shape: (19234, 53)
Matched summer review pairs shape: (14375, 53)
Matched autumn review pairs shape: (13591, 53)
