# Generate processed data
Here we will combine the subset of reviews with their labels and associated property description to create a processed dataset.

In [237]:
import pandas as pd
import matplotlib.pyplot as plt

In [238]:
# Which GEO to process?
# The value is interpolated into every input and output file name used in this
# notebook (e.g. "{GEO}_listings.csv"), so it must match the prefix of the data files.
GEO = "texas"

In [239]:
# load all the relevant data
# Every input lives under DATA_FP and is selected by the GEO prefix.
DATA_FP = "../../data"
# Review labels; later cells rely on its "id" and "label" columns.
LABELS = pd.read_csv(DATA_FP + f"/labels/{GEO}_reviews_labels.csv")
# Filtered subset of reviews; joined to LABELS on "id" further down.
SUBSET = pd.read_csv(DATA_FP + f"/filtered/{GEO}_reviews_filtered.csv")
# Raw listings table; its "id" column is matched against the reviews' "listing_id".
# NOTE(review): sample output in this notebook shows mojibake (e.g. "ArtistÛªs"),
# so "unicode_escape" may not be the file's real encoding — confirm.
LISTINGS = pd.read_csv(DATA_FP + f"/raw/{GEO}_listings.csv", encoding="unicode_escape", low_memory=False)

For now, we are dropping reviews labeled "maybe" or "other". The "maybe" reviews are ones whose label we weren't sure about. If we need more training data, we can revisit these reviews for labeling.

In [240]:
# Flag, once, every row whose label is one of the two categories excluded for now.
excluded_labels = ["maybe", "other"]
is_excluded = LABELS.label.isin(excluded_labels)

# Report how many rows are about to be dropped, out of the full label set.
maybes_others = LABELS[is_excluded]
print(f"Number of maybe/other reviews: {len(maybes_others)}/{len(LABELS)}")

# Keep only the rows that were not flagged.
LABELS = LABELS[~is_excluded]

Number of maybe/other reviews: 22/1000


Join datasets together.

In [241]:
# Attach each label to its review row by matching on the shared "id" column.
# Any other column present in both frames gets a "_labels" / "_subset" suffix.
subset_with_labels = LABELS.merge(
    SUBSET,
    on="id",
    suffixes=("_labels", "_subset"),
)

# Attach the listing each review refers to. Clashing listing columns get a
# "_listings" suffix, so the review's own "id" keeps its plain name.
subset_labels_and_listing = subset_with_labels.merge(
    LISTINGS,
    left_on="listing_id",
    right_on="id",
    suffixes=("", "_listings"),
)

Keep only the columns we need for the processed dataset. Also, rename `id` to `review_id` to be more descriptive, and parse the amenities column into a list.

In [242]:
# Columns carried into the processed dataset; everything else produced by the
# joins is discarded.
cols_to_keep = [
    "id", "listing_id", "description", "comments",
    "sentiment", "label", "name", "amenities",
]

# Select those columns and rename id to review_id, for clarity, in one chain.
subset_labels_and_listing = (
    subset_labels_and_listing
    .loc[:, cols_to_keep]
    .rename(columns={"id": "review_id"})
)

def parse_amenities(amenities):
    """Split a raw amenities string into a list of amenity names.

    Accepts brace-delimited (``{TV,"Cable TV"}``) and bracket-delimited
    (``["TV", "Cable TV"]``) input. Both the opening and closing delimiter
    are removed, along with double quotes and padding whitespace.

    Args:
        amenities: The raw amenities string, or None / NaN when missing.

    Returns:
        A list of amenity names; empty when the input is missing or
        contains no amenities.
    """
    # Missing cells arrive from pandas as None or float NaN (NaN != NaN).
    if amenities is None or (isinstance(amenities, float) and amenities != amenities):
        return []

    # Strip every delimiter character, not just one of each pair, so neither
    # the first nor the last item keeps a stray brace or bracket.
    for delimiter in '{}[]"':
        amenities = amenities.replace(delimiter, "")

    # NOTE(review): a quoted amenity that itself contains a comma would still
    # be split here — confirm whether such values occur in the data.
    pieces = (piece.strip() for piece in amenities.split(","))
    return [piece for piece in pieces if piece]

# Turn each raw amenities string into a list of amenity names.
parsed_amenities = subset_labels_and_listing["amenities"].apply(parse_amenities)
subset_labels_and_listing["amenities"] = parsed_amenities

# Spot-check a handful of random rows.
subset_labels_and_listing.sample(n=5)

Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
264,62538105,324552,This is a private detached studio apartmnet in...,This place gets 5 out of 5 stars for location....,3.75,no,South Congress private SOCO Studio,"[Wifi, Air conditioning, Kitchen, Free parking..."
123,69599634,83643,Join me in my comfy house. I'll entertain you ...,Michele made me feel very welcome on my arriva...,4.333333,no,Fun Central Austin Convenience,"[Internet, Wifi, Air conditioning, Kitchen, Fr..."
658,430472854,915082,Modern 2 Bedroom 2 bath just 3 blocks from Sou...,The unit its self is very nice and up to date....,4.0,no,SoCo: Ultra-Modern 2 bed 2 bath privacy & loca...,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
728,74237263,958172,Large studio in the back house. Tucked away i...,"Though I didn't meet Donna myself, she was ver...",4.0,no,An ArtistÛªs Downtown Paradise....,"[Wifi, Air conditioning, Kitchen, Free street ..."
351,8712775,362662,Downtown Central Austin Best Location Huge New...,Robert's house is just awesome. Plenty of room...,4.5,no,Downtown Central Austin HUGE HOUSE!,"[TV, Cable TV, Internet, Wifi, Air conditionin..."


## Cleaning text columns

We need to clean the text columns to make them more useful for our model. We will do the following:

- Convert to lowercase
- Remove some specific phrases that we know are not useful
- Remove punctuation
- Remove stop words
- Lemmatize words

We will use nltk to do this.

In [243]:
# import necessary libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# Tokens are runs of word characters, so punctuation never becomes a token.
tokenizer = RegexpTokenizer(r'\w+')

# Fetch the stop-word corpus and start from NLTK's English list.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Words that must survive stop-word removal: negations (written without
# apostrophes) plus the intensifier "very".
negative_words = [
    "no", "not", "nor", "neither", "never", "none",
    "doesnt", "couldnt", "shouldnt", "wouldnt",
    "cant", "cannot", "wont",
    "isnt", "arent", "wasnt", "werent",
    "hasnt", "havent", "hadnt",
    "dont", "didnt", "neednt",
    "very",
]
# Entries that are not in the stop-word set are simply ignored.
stop_words.difference_update(negative_words)

# Extra words treated as stop words for this dataset.
additional_stopwords = ["airbnb", "austin", "texas", "home", "house"]
stop_words.update(additional_stopwords)

# remove some specific phrases, using regular expressions
# Redaction markers such as "(phone number hidden by airbnb)". Patterns are
# applied to lowercased text.
#   - [^()]* keeps a match inside ONE pair of parentheses; a greedy ".*" would
#     run from the first "(" in the text to the last marker and delete
#     everything in between.
#   - the leading words are optional so a bare "(hidden by airbnb)" also matches.
specific_phrases = [
    r"\((?:[^()]* )?hidden by airbnb\)",
]


# download lemmatizer
# Fetch the WordNet corpus before constructing the lemmatizer that relies on it.
nltk.download('wordnet')
# Single shared instance; remove_stopwords_and_lemmatize calls lemmatizer.lemmatize on each kept token.
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(tokens) -> list:
    """Drop stop words from ``tokens`` and lemmatize the survivors.

    Each token is checked against the module-level ``stop_words`` set exactly
    as given (before lemmatization). Tokens that pass are lemmatized with the
    module-level ``lemmatizer`` and returned in their original order.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

def preprocess_text(text: str) -> list:
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, strip every pattern in ``specific_phrases``,
    collapse "n't" contractions, tokenize on word characters, then drop stop
    words and lemmatize.

    Args:
        text: The raw text, or None / NaN when the cell is missing.

    Returns:
        The list of processed tokens; empty when ``text`` is missing.
    """
    # Missing cells arrive from pandas as None or float NaN (NaN != NaN).
    if text is None or (isinstance(text, float) and text != text):
        return []

    # lowercase
    cleaned = text.lower()

    for phrase in specific_phrases:
        cleaned = re.sub(phrase, "", cleaned)

    # The \w+ tokenizer would split "doesn't" into "doesn" and "t", neither of
    # which matches the apostrophe-free forms protected in negative_words
    # ("doesnt", "cant", ...). Collapse the contraction first so the negation
    # survives as one token. Handles both straight and curly apostrophes.
    cleaned = re.sub(r"n['\u2019]t\b", "nt", cleaned)

    # tokenize
    tokens = tokenizer.tokenize(cleaned)

    # remove stopwords and lemmatize
    return remove_stopwords_and_lemmatize(tokens)

# Tokenize each free-text column in turn. Columns are addressed with [] rather
# than attribute access: attribute assignment only reaches a column when no
# DataFrame attribute of that name exists, which makes names like "name" fragile.
for text_col in ("description", "comments", "name"):
    print(f"Preprocessing {text_col}...")
    subset_labels_and_listing[text_col] = subset_labels_and_listing[text_col].apply(preprocess_text)

# Spot-check a handful of random rows.
subset_labels_and_listing.sample(n=5)

Preprocessing description...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing comments...
Preprocessing name...


Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
616,45714753,850518,"[newly, redone, eastside, open, air, cottage, ...","[brian, mollie, place, warm, welcoming, comfor...",4.0,no,"[prime, east, side, location]","[TV, Wifi, Air conditioning, Kitchen, Free par..."
226,237887938,294708,"[spacious, townhouse, 2, king, bed, 1, queen, ...","[good, location, maybe, not, suitable, neighbo...",3.0,no,"[3bd, loft, soco, area, convenience]","[TV, Cable TV, Internet, Wifi, Air conditionin..."
676,30616482,925342,"[perfect, location, ut, campus, central, explo...","[carlton, very, nice, available, quick, messag...",4.0,no,"[ut, football, sxsw, acl, f, 1, bass, hall]","[TV, Cable TV, Internet, Wifi, Air conditionin..."
752,234671770,971055,"[grad, student, furnished, studio, apartment, ...","[alex, great, person, responsible, solving, po...",2.0,mbad,"[cozy, quiet, studioapt, super, close, ut]","[Internet, Wifi, Air conditioning, Kitchen, Fr..."
647,11744491,910696,"[best, place, acl, sxsw, formula, 1, holiday, ...","[coming, brooklyn, frankly, coming, anywhere, ...",4.333333,no,"[mid, century, modern, w, spacious, yard]","[TV, Cable TV, Internet, Wifi, Air conditionin..."


In [244]:
# save it
# index=False leaves the DataFrame index out of the file.
# NOTE(review): description, comments, name and amenities hold Python lists at
# this point; to_csv writes each as its string form, so whoever reads this file
# must parse them back (e.g. ast.literal_eval) — confirm downstream loaders do.
subset_labels_and_listing.to_csv(DATA_FP + f"/processed/{GEO}_processed.csv", index=False)