# Subset the data to only include potential reviews that could indicate misleading property listings

Our method to subset the data so we don't need to look through all the rows to find the potential false positives is two-fold:

1. We keep only the rows whose comments contain at least one keyword from a list drawn from the honors thesis.

2. We keep only the rows whose sentiment score is lower than 5 (5 being the most positive sentiment score).

In [1]:
import pandas as pd

## Define filters

In [2]:
def keyword_filter(row: pd.Series) -> bool:
    """Return True if the row's comment contains any of the complaint keywords.

    The match is a plain, case-sensitive substring test against
    ``row['comments']``. Rows whose comment is not a string (for example a
    missing value) never match.

    Args:
        row: A row of the reviews table; must provide a ``'comments'`` entry.

    Returns:
        True when at least one keyword occurs in the comment, else False.
    """
    # A tuple of literals is built once as a constant instead of per call.
    keywords = (
        "deceiving",
        "decieving",  # I bet there's typos
        "disappointing",
        "horrible",
        "terrible",
        "awful",
        "bad",
        "misleading",
        "inaccurate",
        "incorrect",
        "missing",
        "not as described",
        "not there",
        "wrong",
        "not as pictured",
        "lied",
        "lie",
        "liar",
        "lying",
        "fraud",
        "fraudulent",
        "scam",
        "scammer",
        "scamming",
        "scammed",
        "unsatisfactory",
        "unacceptable",
        "wasn't there",
        "was not there",
        # The comma after the next entry matters: without it Python joins the
        # two adjacent literals into one keyword that can never match.
        "wasnt there",
        "photoshopped",
    )
    comment = row["comments"]
    if not isinstance(comment, str):
        return False
    # Generator form lets any() stop at the first matching keyword.
    return any(keyword in comment for keyword in keywords)

def sentiment_filter(row: pd.Series, threshold=5) -> bool:
    """Return True if the row's ``"sentiment"`` value is strictly below *threshold*."""
    return row["sentiment"] < threshold

def _row_mask(data: pd.DataFrame, predicate) -> pd.Series:
    """Evaluate *predicate* on each row of *data* and return the per-row results."""
    if len(data) == 0:
        # Row-wise apply on a frame with no rows may not produce a boolean
        # Series, so build the (empty) mask explicitly.
        return pd.Series(False, index=data.index, dtype=bool)
    return data.apply(predicate, axis=1)


def _reduction_pcnt(remaining: int, original: int) -> float:
    """Return the percentage of rows removed, or 0.0 when there were no rows."""
    if original == 0:
        return 0.0
    return (1 - (remaining / original)) * 100


def apply_filters(
    raw_fp: str | pd.DataFrame,
    filtered_fp: str,
    sentiment_threshold=5,
) -> None:
    """Apply the filters to the raw data and save the result to a new file.

    Rows are first reduced with ``sentiment_filter`` and then with
    ``keyword_filter``; progress and row counts are printed along the way.
    Before saving, every occurrence of ``"_x000D_"`` is removed from the
    remaining comments.

    Args:
        raw_fp: Path of a CSV file to read, or an already loaded DataFrame.
            The data must provide ``"sentiment"`` and ``"comments"`` columns.
        filtered_fp: Path of the CSV file to write (without the index).
        sentiment_threshold: Rows are kept only if their sentiment is
            strictly below this value. Defaults to 5.
    """
    if isinstance(raw_fp, pd.DataFrame):
        data = raw_fp
    else:
        print("Reading data from", raw_fp)
        data = pd.read_csv(raw_fp)

    original_len = len(data)

    print("Number of rows:", original_len)

    print(f"Applying sentiment filter (sentiment threshold = {sentiment_threshold})...")
    data = data[_row_mask(data, lambda row: sentiment_filter(row, sentiment_threshold))]
    reduction_pcnt = _reduction_pcnt(len(data), original_len)
    print(f"Number of remaining rows: {len(data)} ({reduction_pcnt:.2f}% reduction)")

    print("Applying keyword filter...")
    # Take an explicit copy so the column assignment below writes to an
    # independent frame rather than to a slice of the previous one.
    data = data[_row_mask(data, keyword_filter)].copy()
    reduction_pcnt = _reduction_pcnt(len(data), original_len)
    print(f"Number of remaining rows: {len(data)} ({reduction_pcnt:.2f}% reduction)")

    print("Saving filtered data to", filtered_fp)

    # Every surviving comment is a string (keyword_filter rejects anything
    # else), so str.replace is safe here.
    data["comments"] = data["comments"].apply(lambda text: text.replace("_x000D_", ""))

    data.to_csv(filtered_fp, index=False)

## Apply filters and save results to /data/filtered

In [4]:
# Filter the Texas reviews and write the surviving rows to the filtered folder.
# NOTE(review): this cell uses "../../data/..." while the other cells use
# "../data/...", and the recorded output names "texas_reviews_filtered2.csv";
# confirm the paths before re-running.
apply_filters("../../data/sentiment/texas_w_sentiment.csv", "../../data/filtered/texas_reviews_filtered.csv")

Reading data from ../../data/sentiment/texas_w_sentiment.csv
Number of rows: 332098
Applying sentiment filter (sentiment threshold = 5)...
Number of remaining rows: 72776 (78.09% reduction)
Applying keyword filter...
Number of remaining rows: 5629 (98.31% reduction)
Saving filtered data to ../../data/filtered/texas_reviews_filtered2.csv


In [9]:
# Filter the Florida reviews and write the surviving rows to the filtered folder.
apply_filters("../data/sentiment/florida_w_sentiment.csv", "../data/filtered/florida_reviews_filtered.csv")

Reading data from ../data/sentiment/florida_w_sentiment.csv
Number of rows: 195857
Applying sentiment filter (sentiment threshold = 5)...
Number of remaining rows: 47843 (75.57% reduction)
Applying keyword filter...
Number of remaining rows: 4416 (97.75% reduction)
Saving filtered data to ../data/filtered/florida_reviews_filtered.csv


In [9]:
# Toggle: when True, build the reviews-with-sentiment table here from the raw
# reviews and a separate sentiment file; when False, read the already joined CSV.
JOIN_CALI = False
if JOIN_CALI:
    california_reviews = pd.read_csv("../data/raw/california_reviews.csv")
    # The sentiment file is read without a header row, so name its two columns explicitly.
    california_sentiment = pd.read_csv("../data/sentiment/california_sentiment_126287.csv", header=None)
    california_sentiment.columns = ["idx", "sentiment"]

    # Inner join: the reviews' row index is matched against the "idx" column of
    # the sentiment table, so only reviews with a sentiment entry are kept.
    california_w_sentiment = pd.merge(california_reviews, california_sentiment, how="inner", left_index=True, right_on="idx")

    # Pass the joined DataFrame directly instead of a file path.
    apply_filters(california_w_sentiment, "../data/filtered/california_reviews_filtered_126287.csv")

else:
    apply_filters("../data/sentiment/california_w_sentiment.csv", "../data/filtered/california_reviews_filtered.csv")

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,idx,sentiment
0,958,5977,2009-07-23,15695,Edmund C,"Our experience was, without a doubt, a five st...",0,5.0
1,958,6660,2009-08-03,26145,Simon,Returning to San Francisco is a rejuvenating t...,1,4.0
2,958,11519,2009-09-27,25839,Denis,We were very pleased with the accommodations a...,2,3.5
3,958,16282,2009-11-05,33750,Anna,We highly recommend this accomodation and agre...,3,5.0
4,958,26008,2010-02-13,15416,Venetia,Holly's place was great. It was exactly what I...,4,5.0
