In [1]:
import pandas as pd
import sklearn 

## Objective
This notebook downsamples the raw data file to mitigate its severe class imbalance.  
We first read the raw data file, which is not included in the GitHub repo due to its size.  
We then standardise the target label (converting it to a binary 0/1 flag) and downsample the majority class to achieve a better class composition.

In [3]:
# Load the raw reviews file ('Reviews CSV.csv', ~800k rows) from the local data folder.
raw_reviews_path = 'data/Reviews CSV.csv'
df = pd.read_csv(raw_reviews_path)

In [4]:
# Show the column names of the loaded frame.
df.columns

Index(['date', 'reviewID', 'reviewerID', 'reviewContent', 'rating',
       'usefulCount', 'coolCount', 'funnyCount', 'flagged', 'restaurantID'],
      dtype='object')

In [5]:
# Count how often each raw value of the 'flagged' column occurs.
df['flagged'].value_counts()

flagged
NR    402774
YR    318678
N      58716
Y       8303
Name: count, dtype: int64

In [7]:
# Binarise the target label: the raw value 'Y' becomes 1; every other value
# ('N', 'NR', 'YR') and any missing entry becomes 0.
# NOTE(review): 'YR' is therefore counted as class 0 — confirm this is the
# intended meaning of that label before relying on the class balance below.
default_label = 'NR'  # missing labels are filled with this value, which maps to 0

# Guard on dtype so the cell is safe to re-run: once the column already holds
# integers, comparing against 'Y' again would silently reset every label to 0.
if not pd.api.types.is_integer_dtype(df['flagged']):
    df['flagged'] = (
        df['flagged']
        .fillna(default_label)
        .eq('Y')            # vectorised equivalent of apply(lambda x: x == 'Y')
        .astype('int64')
    )
df['flagged'].unique()

array([0, 1], dtype=int64)

In [8]:
# Count the rows in each class of the binary 'flagged' column.
df['flagged'].value_counts()

flagged
0    780168
1      8303
Name: count, dtype: int64

In [9]:
# Downsample the majority class so that the combined frame has target_total rows
# while keeping every minority-class row.
minority = df[df['flagged'] == 1]   # minority class
majority = df[df['flagged'] == 0]   # majority class

N_min = len(minority)
target_total = 50000

# number of majority samples needed
N_maj_sample = target_total - N_min

if N_maj_sample < 0:
    raise ValueError(
        f"Minority class alone exceeds {target_total} rows — cannot downsample to {target_total}."
    )
# Fail with a clear message instead of pandas' generic "larger sample than
# population" error when there are not enough majority rows to draw from.
if N_maj_sample > len(majority):
    raise ValueError(
        f"Need {N_maj_sample} majority rows to reach {target_total}, "
        f"but only {len(majority)} are available."
    )

# downsample majority class (fixed seed so the draw is reproducible)
majority_down = majority.sample(n=N_maj_sample, random_state=42)

# combine, then shuffle the rows and rebuild a clean 0..n-1 index
df_downsampled = (
    pd.concat([minority, majority_down])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [10]:
# Count the rows in each class of the downsampled frame.
df_downsampled['flagged'].value_counts()

flagged
0    41697
1     8303
Name: count, dtype: int64

In [11]:
# Write the downsampled frame to disk.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets an
# extra unnamed leading column holding 0..n-1; consider index=False if no
# downstream reader relies on that column — confirm before changing.
df_downsampled.to_csv('data/Smaller_Reviews.csv')