# Subset the data to only include potential rows that could have discrepancies

Our method to subset the data so we don't need to look through all the rows to find the potential false positives is two-fold:

1. We filter by rows contains a set of keywords from the honors thesis.

2. We filter by rows that have sentiment score of 4 or lower (5 being the most positive sentiment score).

In [14]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

## Define filters

In [19]:
def keyword_filter(row: pd.Series) -> bool:
    """Return True if the row contains a keyword in the list of keywords"""
    keywords = [
        'deceiving', 
        "decieving", # I bet there's typos
        'disappointing', 
        'horrible',
        'terrible',
        'awful',
        'bad',
        "misleading",
        "inaccurate",
        "incorrect",
        "not as pictured",
        "photoshopped"]
    if not isinstance(row['comments'], str):
        return False
    return any([keyword in row['comments'] for keyword in keywords])

def sentiment_filter(row: pd.Series) -> bool:
    comment = row["comments"]
    tokens = tokenizer.encode(comment, return_tensors='pt')
    result = model(tokens)
    pred = int(torch.argmax(result.logits))+1

    return pred <= 4

def apply_filters(raw_fp: str, filtered_fp: str) -> None:
    """Apply the filters to the raw data and save the result to a new file"""

    print("Reading data from", raw_fp)
    data = pd.read_csv(raw_fp)

    original_len = len(data)

    print("Number of rows:", original_len)

    print("Applying keyword filter...")
    data = data[data.apply(keyword_filter, axis=1)]
    print("Number of remaining rows:", len(data))

    print("Applying sentiment filter...")
    data = data[data.apply(sentiment_filter, axis=1)]
    print("Number of remaining rows:", len(data))

    print("Data to look through reduced by", (1 - (len(data) / original_len)) * 100, "%")

    print("Saving filtered data to", filtered_fp)
    data.to_csv(filtered_fp, index=False)

## Apply filters and save results to /data/filtered

In [20]:
apply_filters("../data/raw/texas_reviews.csv", "../data/filtered/texas_reviews.csv")

Reading data from ../data/raw/texas_reviews.csv
Number of rows: 332098
Applying keyword filter...
Number of remaining rows: 2554
Applying sentiment filter...


Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The expanded size of the tensor (627) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 627].  Tensor sizes: [1, 512]

In [17]:
apply_filters("../data/raw/florida_reviews.csv", "../data/filtered/florida_reviews.csv")

Reading data from ../data/raw/florida_reviews.csv
Number of rows: 195857
Applying keyword filter...
Number of remaining rows: 2000
Data to look through reduced by 98.97884681170446 %
Saving filtered data to ../data/filtered/florida_reviews.csv


In [18]:
apply_filters("../data/raw/california_reviews.csv", "../data/filtered/california_reviews.csv")

Reading data from ../data/raw/california_reviews.csv
Number of rows: 366643
Applying keyword filter...
Number of remaining rows: 3178
Data to look through reduced by 99.13321678035582 %
Saving filtered data to ../data/filtered/california_reviews.csv
