# EDA
In this notebook we take a deliberately crude, keyword-based approach to filtering the review data down to the reviews that may indicate misleading listings.

In [1]:
import ast

import pandas as pd

Load data

In [2]:
# Raw inputs, one listings file and one reviews file per state.
# The listings files are decoded with the "unicode_escape" codec; the reviews
# files are read with the pandas default encoding.
florida_l = pd.read_csv(
    "../data/raw/florida_listings.csv",
    encoding="unicode_escape",
)
florida_r = pd.read_csv("../data/raw/florida_reviews.csv")

texas_l = pd.read_csv(
    "../data/raw/texas_listings.csv",
    encoding="unicode_escape",
)
texas_r = pd.read_csv("../data/raw/texas_reviews.csv")

  florida_l = pd.read_csv("../data/raw/florida_listings.csv", encoding="unicode_escape")
  texas_l = pd.read_csv("../data/raw/texas_listings.csv", encoding="unicode_escape")


In [3]:
def parse_amenities(amenities):
    """Split a raw amenities string into a list of amenity names.

    The wrapping delimiters (``{...}`` or ``[...]``) and double quotes are
    removed, the remainder is split on commas, and each item is trimmed.
    Empty items are dropped, so ``"{}"`` yields ``[]``.

    Args:
        amenities: Raw amenities value, e.g. ``'{TV,"Cable TV",Wifi}'``.

    Returns:
        list[str]: The amenity names. Non-string input (such as a missing
        value) returns an empty list.
    """
    if not isinstance(amenities, str):
        return []

    cleaned = amenities
    # Strip both the opening and the closing delimiter of either style, so the
    # last item does not keep a trailing "}".
    for char in '{}[]"':
        cleaned = cleaned.replace(char, "")

    return [item.strip() for item in cleaned.split(",") if item.strip()]

# Attach each review to its listing. Columns present in both inputs get the
# "_listing" / "_review" suffixes (e.g. "id" -> "id_listing" / "id_review").
merge_options = dict(
    left_on="id",
    right_on="listing_id",
    suffixes=("_listing", "_review"),
)
florida = florida_l.merge(florida_r, **merge_options)
texas = texas_l.merge(texas_r, **merge_options)

# Turn the raw amenities strings into lists of amenity names.
florida["amenities"] = florida["amenities"].apply(parse_amenities)
texas["amenities"] = texas["amenities"].apply(parse_amenities)

Apply preprocessing to the data

In [29]:
# import necessary libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# Tokens are maximal runs of word characters; punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Start from NLTK's English stop-word list, held as a mutable set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Words that must survive stop-word removal: negations plus the intensifier
# "very". Entries that are not in the stop-word set are simply ignored.
negative_words = [
    "no", "not", "nor", "neither", "never", "none",
    "doesnt", "couldnt", "shouldnt", "wouldnt",
    "cant", "cannot", "wont",
    "isnt", "arent", "wasnt", "werent",
    "hasnt", "havent", "hadnt",
    "dont", "didnt", "neednt",
    "very",
]
stop_words.difference_update(negative_words)

# Domain words that appear in most texts and carry no signal here.
additional_stopwords = ["airbnb", "austin", "texas", "home", "house"]
stop_words.update(additional_stopwords)

# Regular expressions for phrases to delete from the (lowercased) text.
# The placeholder pattern only allows non-parenthesis characters inside the
# brackets. A greedy ".*" would start at the FIRST "(" on the line and delete
# everything up to the last placeholder, including unrelated text.
specific_phrases = [
    r"\([^()]* hidden by airbnb\)",
]


# download the WordNet data and create the lemmatizer instance that
# remove_stopwords_and_lemmatize() uses
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(tokens) -> list:
    """Filter out stop words and lemmatize the remaining tokens.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: Lemmatized tokens in their original order, with every token
        found in the module-level ``stop_words`` set removed. The stop-word
        check is made on the token before lemmatization.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

def preprocess_text(row, col) -> list:
    """Turn the text stored in ``row[col]`` into a list of cleaned tokens.

    Steps: lowercase, delete every pattern in ``specific_phrases``, tokenize,
    then remove stop words and lemmatize.

    Args:
        row: Mapping-like record (e.g. a DataFrame row).
        col: Key of the text field to process.

    Returns:
        list: Processed tokens, or ``[]`` when the field is not a string
        (for example a missing value).
    """
    raw_value = row[col]
    if not isinstance(raw_value, str):
        return []

    text: str = raw_value.lower()

    # Patterns are applied after lowercasing, so they are written in lowercase.
    for pattern in specific_phrases:
        text = re.sub(pattern, "", text)

    return remove_stopwords_and_lemmatize(tokenizer.tokenize(text))

def preprocess_row(row):
    """Build the processed record for one merged listing/review row.

    Args:
        row: A row of the merged listings/reviews frame. It must provide
            "description", "comments", "amenities", "id_listing" and
            "id_review".

    Returns:
        dict: Token lists for "description" and "comments", the amenities
        value passed through unchanged, and the listing / review ids.
    """
    # The per-row debug print of row.name was removed: it emitted one output
    # line for every review in the dataset.
    return {
        "description": preprocess_text(row, "description"),
        "comments": preprocess_text(row, "comments"),
        "amenities": row["amenities"],
        "listing_id": row["id_listing"],
        "review_id": row["id_review"],
    }

def preprocess_pipeline(df):
    """Apply ``preprocess_row`` to every row and collect the results.

    Args:
        df: Merged listings/reviews DataFrame.

    Returns:
        pandas.DataFrame: One row per input row, sharing ``df``'s index, with
        one column per key returned by ``preprocess_row``. An empty input
        gives an empty frame with no columns.
    """
    if df.empty:
        return pd.DataFrame(index=df.index)

    processed = df.apply(preprocess_row, axis=1)

    # Build the frame from the records in one step. This avoids looking up
    # index label 0 (which need not exist) and avoids one extra pass over the
    # data per output column.
    return pd.DataFrame(processed.tolist(), index=processed.index)

# Set to True to recompute the processed data (slow); False reuses the cached
# CSV files written by a previous run.
PROCESS = False

if PROCESS:
    preprocess_pipeline(florida).to_csv("../data/raw/florida_raw_processed.csv")
    preprocess_pipeline(texas).to_csv("../data/raw/texas_raw_processed.csv")

# Always load from the CSV files, including right after recomputing. The list
# columns are stored as strings in CSV, and the following cells parse those
# strings; loading from disk on both paths keeps the frames identical
# regardless of PROCESS.
florida_processed = pd.read_csv("../data/raw/florida_raw_processed.csv")
texas_processed = pd.read_csv("../data/raw/texas_raw_processed.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Filter the data: first keep the reviews that mention an amenity, then look within those for reviews containing negation words and misleading-listing keywords.

In [68]:
# Negation words (plus the intensifier "very") searched for in the tokenized
# review comments.
negative_words = [
    "no", "not", "nor", "neither", "never", "none",
    "doesnt", "couldnt", "shouldnt", "wouldnt",
    "cant", "cannot", "wont",
    "isnt", "arent", "wasnt", "werent",
    "hasnt", "havent", "hadnt",
    "dont", "didnt", "neednt",
    "very",
]
# Words and phrases that suggest a review is calling the listing misleading.
# NOTE(review): the filtering cell below tests each entry for membership in a
# set of single-word tokens, so the multi-word and apostrophe entries cannot
# match as written -- confirm whether phrase matching on raw text was intended.
keywords = [
    "deceiving",
    "decieving",  # I bet there's typos
    "disappointing",
    "horrible",
    "terrible",
    "awful",
    "bad",
    "misleading",
    "inaccurate",
    "incorrect",
    "missing",
    "not as described",
    "not there",
    "wrong",
    "not as pictured",
    "lied",
    "lie",
    "liar",
    "lying",
    "fraud",
    "fraudulent",
    "scam",
    "scammer",
    "scamming",
    "scammed",
    "unsatisfactory",
    "unacceptable",
    "wasn't there",
    "was not there",
    "wasnt there",  # comma was missing here, fusing this entry with the next
    "photoshopped",
]

In [30]:
# The cached CSV stores each token/amenity list as its string representation.
# Parse it back with ast.literal_eval, which only accepts Python literals,
# instead of eval, which would execute arbitrary code found in the file.
# Values that are already parsed are accepted too, so re-running this cell
# does not fail.
florida_processed.comments = florida_processed.comments.apply(
    lambda x: set(ast.literal_eval(x)) if isinstance(x, str) else set(x)
)
florida_processed.amenities = florida_processed.amenities.apply(
    lambda x: set(ast.literal_eval(x)) if isinstance(x, str) else set(x)
)

In [38]:
# same for texas
# Same for Texas: parse the stringified lists back into sets. ast.literal_eval
# only accepts Python literals, unlike eval, which would execute arbitrary
# code found in the file. Already-parsed values are accepted so the cell can
# be re-run.
texas_processed.comments = texas_processed.comments.apply(
    lambda x: set(ast.literal_eval(x)) if isinstance(x, str) else set(x)
)
texas_processed.amenities = texas_processed.amenities.apply(
    lambda x: set(ast.literal_eval(x)) if isinstance(x, str) else set(x)
)

In [39]:
def compare_comments_with(row, col):
    """Report whether a row's comment tokens overlap the values in ``row[col]``.

    Args:
        row: Record exposing a ``comments`` set and an iterable of strings
            under ``col``.
        col: Name of the field whose values are compared with the comments.

    Returns:
        bool: True when at least one lowercased value of ``row[col]`` is an
        element of ``row.comments``. The comparison is exact string
        membership.
    """
    candidates = {value.lower() for value in row[col]}
    return not row.comments.isdisjoint(candidates)

# Row-wise flag: does the review mention at least one of its listing's amenities?
florida_mentions_amenity = florida_processed.apply(
    compare_comments_with, axis=1, args=("amenities",)
)
texas_mentions_amenity = texas_processed.apply(
    compare_comments_with, axis=1, args=("amenities",)
)

# Keep the comment sets of the flagged reviews.
florida_with_amenity = florida_processed.comments[florida_mentions_amenity]
texas_with_amenity = texas_processed.comments[texas_mentions_amenity]

In [42]:
# Full rows of the reviews that mention an amenity. The *_with_amenity index
# holds labels, so select with .loc. Using .iloc with labels is only correct
# while the frame has a default 0..n-1 index, where both give the same rows.
tx_working = texas_processed.loc[texas_with_amenity.index]
fl_working = florida_processed.loc[florida_with_amenity.index]

In [81]:
# find all the comments that contain a negative word or misleading keyword
# Flag the comments whose token set contains at least one negation word.
fl_has_negative = fl_working.comments.apply(
    lambda tokens: any(word in tokens for word in negative_words)
)
tx_has_negative = tx_working.comments.apply(
    lambda tokens: any(word in tokens for word in negative_words)
)
negative_florida = fl_working[fl_has_negative]
negative_texas = tx_working[tx_has_negative]

# Flag the comments whose token set contains at least one misleading keyword.
fl_has_keyword = fl_working.comments.apply(
    lambda tokens: any(word in tokens for word in keywords)
)
tx_has_keyword = tx_working.comments.apply(
    lambda tokens: any(word in tokens for word in keywords)
)
misleading_florida = fl_working[fl_has_keyword]
misleading_texas = tx_working[tx_has_keyword]

# Inner join on the index keeps only the reviews present in both subsets.
neg_and_mis_florida = negative_florida.merge(
    misleading_florida, how="inner", left_index=True, right_index=True
)
neg_and_mis_texas = negative_texas.merge(
    misleading_texas, how="inner", left_index=True, right_index=True
)

In [80]:
# Share of ALL merged reviews (not only the amenity-mentioning ones) that
# passed every filter, as a percentage.
florida_pct = len(neg_and_mis_florida) / len(florida) * 100
texas_pct = len(neg_and_mis_texas) / len(texas) * 100

print("% of reviews with negative and misleading words, that also mention an amenity (florida): ", florida_pct)
print("% of reviews with negative and misleading words, that also mention an amenity (texas): ", texas_pct)

% of reviews with negative and/or misleading words, that also mention an amenity (florida):  11.230642764874373
% of reviews with negative and/or misleading words, that also mention an amenity (texas):  6.594137874964619
