# Matching pairs for deeper text analysis

In [1]:
# Useful imports
import pandas as pd
import seaborn as sns
import numpy as np
import random
import math
import os.path
from pathlib import Path

In [2]:
# Read the pickled BeerAdvocate reviews into a DataFrame and report its dimensions.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
reviews_pickle_path = './datasets/BeerAdvocate/reviews.pkl'
reviews = pd.read_pickle(reviews_pickle_path)
print(reviews.shape)

(2589587, 17)


In [3]:
# Expose the frame's index as a regular 'Row Index' column so that each review
# can still be identified after merges that rebuild the index.
reviews['Row Index'] = reviews.index.to_series()

In [4]:
# Restrict the frame to the identifiers, scores, text and row index used for pairing.
needed_columns = [
    'Beer Id', 'Brewery Id', 'Style', 'Abv', 'Date', 'User Id',
    'Appearance', 'Aroma', 'Palate', 'Taste', 'Overall', 'Rating',
    'Text', 'Row Index',
]
reviews_clean = reviews[needed_columns]

In [5]:
# Self-join the reviews: every output row pairs two reviews that share the same
# user, style and ABV. Non-key columns of the two sides get the suffixes '1' and '2'.
match_keys = ['User Id', 'Style', 'Abv']
reviews_cartesian = pd.merge(reviews_clean, reviews_clean, on=match_keys, suffixes=('1', '2'))

In [6]:
# Display (rows, columns) of the self-merged frame, i.e. how many candidate pairs the merge produced.
reviews_cartesian.shape

(8255601, 25)

In [7]:
# Keep only the pairs in which review 1 scores strictly higher than review 2
# on every one of the six score columns.
score_columns = ['Rating', 'Overall', 'Aroma', 'Appearance', 'Taste', 'Palate']
first_is_better = pd.Series(True, index=reviews_cartesian.index)
for score in score_columns:
    first_is_better &= reviews_cartesian[score + '1'].gt(reviews_cartesian[score + '2'])
# copy so that columns can be added later without touching reviews_cartesian
reviews_pairs = reviews_cartesian.loc[first_is_better].copy()

In [8]:
# Display (rows, columns) of the pairs that survived the "first review is strictly better" filter.
reviews_pairs.shape

(599517, 25)

In [9]:
# Derive, for both reviews of every pair, the calendar month of the review date
# and the length of the review text.
for suffix in ('1', '2'):
    reviews_pairs['Month' + suffix] = reviews_pairs['Date' + suffix].map(lambda stamp: stamp.month)
for suffix in ('1', '2'):
    reviews_pairs['Text Length' + suffix] = reviews_pairs['Text' + suffix].map(len)
# Optional visual check of rating by month (disabled):
# sns.lineplot(data=reviews_pairs, y='Rating1', x='Month1')
# sns.lineplot(data=reviews_pairs, y='Rating2', x='Month2')

In [10]:
# Keep the pairs whose two reviews both fall in months 1-3 ("winter") or both in
# months 7-9 ("summer"). Series.between includes both end points by default.
first_month = reviews_pairs['Month1']
second_month = reviews_pairs['Month2']
winter_review_pairs = reviews_pairs[first_month.between(1, 3) & second_month.between(1, 3)]
summer_review_pairs = reviews_pairs[first_month.between(7, 9) & second_month.between(7, 9)]

In [11]:
# Report how many pairs each season retained.
for message, season_pairs in (
    ('Winter review pairs shape: {}', winter_review_pairs),
    ('Summer review pairs shape: {}', summer_review_pairs),
):
    print(message.format(season_pairs.shape))

Winter review pairs shape: (41478, 29)
Summer review pairs shape: (39424, 29)


In [12]:
# Match on text length: keep the pairs whose two text lengths differ by less than 100.
winter_length_gap = winter_review_pairs['Text Length1'].sub(winter_review_pairs['Text Length2']).abs()
matched_winter_pairs = winter_review_pairs[winter_length_gap.lt(100)]

summer_length_gap = summer_review_pairs['Text Length1'].sub(summer_review_pairs['Text Length2']).abs()
matched_summer_pairs = summer_review_pairs[summer_length_gap.lt(100)]

In [13]:
# Report how many pairs remain after matching on text length.
for message, season_pairs in (
    ('Matched winter review pairs shape: {}', matched_winter_pairs),
    ('Matched summer review pairs shape: {}', matched_summer_pairs),
):
    print(message.format(season_pairs.shape))

Matched winter review pairs shape: (14803, 29)
Matched summer review pairs shape: (14426, 29)


In [14]:
# Collect the pairs in which at least one of the two texts is longer than 2000.
length_columns = ['Text Length1', 'Text Length2']
long_winter = matched_winter_pairs[matched_winter_pairs[length_columns].gt(2000).any(axis=1)]
long_summer = matched_summer_pairs[matched_summer_pairs[length_columns].gt(2000).any(axis=1)]
for message, long_pairs in (
    ('Too long winter review pairs shape: {}', long_winter),
    ('Too long summer review pairs shape: {}', long_summer),
):
    print(message.format(long_pairs.shape))

Too long winter review pairs shape: (45, 29)
Too long summer review pairs shape: (48, 29)


We will drop the overly long reviews since they are too big for the semantic analyser. As we can see, they make up only $93/29229 \approx 0.32\%$ of the pairs.

In [15]:
# Remove the pairs flagged as too long from the matched sets.
# This cell rebinds matched_*_pairs, so on a re-run the labels in long_*.index are
# already gone; errors='ignore' turns the second run into a no-op instead of a KeyError.
matched_winter_pairs = matched_winter_pairs.drop(index=long_winter.index, errors='ignore')
matched_summer_pairs = matched_summer_pairs.drop(index=long_summer.index, errors='ignore')
print('Matched winter review pairs shape after dropping too long texts: {}'.format(matched_winter_pairs.shape))
print('Matched summer review pairs shape after dropping too long texts: {}'.format(matched_summer_pairs.shape))

Matched winter review pairs shape after dropping too long texts: (14758, 29)
Matched summer review pairs shape after dropping too long texts: (14378, 29)


In [17]:
# joining 'better' and 'worse' reviews into same column for sentiment analysis
winter_row_indices = pd.concat([matched_winter_pairs['Row Index1'], matched_winter_pairs['Row Index2']]).unique()
summer_row_indices = pd.concat([matched_summer_pairs['Row Index1'], matched_summer_pairs['Row Index2']]).unique()

# all rows for analysis
all_row_indices_for_analysis = np.union1d(winter_row_indices, summer_row_indices)

In [18]:
# Report how many distinct reviews go to sentiment analysis, per season and in total.
for message, row_indices in (
    ('Number of winter reviews for sentiment analysis: {}', winter_row_indices),
    ('Number of summer reviews for sentiment analysis: {}', summer_row_indices),
    ('Total number of reviews for sentiment analysis: {}', all_row_indices_for_analysis),
):
    print(message.format(row_indices.size))

Number of winter reviews for sentiment analysis: 22379
Number of summer reviews for sentiment analysis: 21780
Total number of reviews for sentiment analysis: 44159


### Time and size estimation

1 review ~ 10 seconds <br>
6 reviews ~ 1 minute <br>
360 reviews ~ 1 hour <br>

Therefore, I will separate the row indices into batches of size 360. There will be $\lceil 44159/360 \rceil = 123$ files.

In [19]:
# Batch configuration: number of reviews per file, number of files, output folder.
batch_size = 360
batch_folder = './datasets/batches/'
# ceiling division: one extra file when the last batch is only partially filled
full_batches, leftover_rows = divmod(all_row_indices_for_analysis.size, batch_size)
file_number = full_batches + (1 if leftover_rows else 0)

In [20]:
# Cut the row indices into consecutive chunks of batch_size elements;
# the last chunk holds whatever remains.
split_points = list(range(batch_size, all_row_indices_for_analysis.size, batch_size))
matrix = np.split(all_row_indices_for_analysis, split_points)

In [21]:
# make sure the output folder exists (no error if it is already there)
Path(batch_folder).mkdir(parents=True, exist_ok=True)

# write one CSV per batch, holding the row index and the text of each review in it
columns_to_export = ['Row Index', 'Text']
for i in range(file_number):
    in_batch = reviews['Row Index'].isin(matrix[i])
    batch = reviews.loc[in_batch, columns_to_export]
    batch_path = os.path.join(batch_folder, '{}.csv'.format(i))
    batch.to_csv(batch_path, index=False)

In [22]:
# check import export and create a small test sample
# Batch files are numbered 0 .. file_number - 1. random.randint includes its upper
# bound, so randint(0, file_number) could request a file that was never written;
# randrange excludes the upper bound.
test_batch_number = random.randrange(file_number)
test = pd.read_csv(os.path.join(batch_folder, '{}.csv'.format(test_batch_number)))
# a batch can hold fewer than 50 rows (e.g. the last one); cap the sample size so
# that sample() does not raise
test = test.sample(min(50, len(test)))
test.to_csv(os.path.join(batch_folder, 'test.csv'), index=False)

### Code for sentiment analysis (used in Google Colab)

We are doing the sentiment analysis in Google Colab, so there is a separate Jupyter notebook, **absa_script**, for it. Feel free to check it out!