# Generate processed data
Here we will combine the subset of reviews with their labels and associated property description to create a processed dataset.

In [166]:
import pandas as pd
import matplotlib.pyplot as plt

In [167]:
# Which GEO to process? This value is interpolated into every input and
# output file name used in this notebook (e.g. "{GEO}_reviews_labels.csv").
GEO = "texas"

In [168]:
# Read the three inputs for the selected GEO: review labels, the filtered
# review subset, and the raw listings table.
DATA_FP = "../../data"
LABELS = pd.read_csv(f"{DATA_FP}/labels/{GEO}_reviews_labels.csv")
SUBSET = pd.read_csv(f"{DATA_FP}/filtered/{GEO}_reviews_filtered.csv")
# NOTE(review): listing names sampled later in this notebook contain garbled
# characters -- confirm "unicode_escape" is the right encoding for this file.
LISTINGS = pd.read_csv(
    f"{DATA_FP}/raw/{GEO}_listings.csv",
    encoding="unicode_escape",
    low_memory=False,
)

For now, we drop reviews labeled "maybe" — these are reviews whose label we weren't sure about. If we need more training data, we can revisit them for labeling.

In [169]:
# Report how many reviews carry the "maybe" label, then exclude them.
is_maybe = LABELS["label"] == "maybe"
maybes = LABELS[is_maybe]
print(f"Number of maybe reviews: {len(maybes)}/{len(LABELS)}")

LABELS = LABELS[~is_maybe]

Number of maybe reviews: 16/870


Join datasets together.

In [170]:
# join labels with the review subset on the shared "id" column
subset_with_labels = LABELS.merge(
    SUBSET,
    on="id",
    suffixes=("_labels", "_subset"),
)

# join with listings: each review's listing_id is matched to the listing id;
# listing columns whose names clash with the left side get "_listings"
subset_labels_and_listing = subset_with_labels.merge(
    LISTINGS,
    left_on="listing_id",
    right_on="id",
    suffixes=("", "_listings"),
)

Keep only the columns we need for the processed dataset. Also, rename the columns to be more descriptive, and clean the amenities column.

In [171]:
# columns carried into the processed dataset
cols_to_keep = [
    "id", "listing_id", "description", "comments",
    "sentiment", "label", "name", "amenities",
]

# keep only those columns, then rename id to review_id, for clarity
subset_labels_and_listing = (
    subset_labels_and_listing
    .loc[:, cols_to_keep]
    .rename(columns={"id": "review_id"})
)

def parse_amenities(amenities):
    """Split a raw amenities string into a list of amenity names.

    Both the opening and closing delimiter of either bracket style
    (``{...}`` or ``[...]``) are removed, along with double quotes, so no
    entry keeps a stray bracket attached to it.

    Args:
        amenities: Raw comma-separated amenities value. Anything that is not
            a string (for example the float NaN pandas uses for a missing
            cell) is treated as "no amenities".

    Returns:
        list: Amenity names with surrounding whitespace stripped and empty
        entries dropped; ``[]`` when there is nothing to parse.
    """
    if not isinstance(amenities, str):
        return []

    cleaned = amenities
    for delimiter in '{}[]"':
        cleaned = cleaned.replace(delimiter, "")

    # Skip blanks so an empty set such as "{}" yields [] rather than [""].
    return [item.strip() for item in cleaned.split(",") if item.strip()]

# turn each raw amenities string into a list of amenity names
subset_labels_and_listing["amenities"] = (
    subset_labels_and_listing["amenities"].apply(parse_amenities)
)

# display five random rows as a spot check
subset_labels_and_listing.sample(5)

Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
230,17510255,294708,"Spacious townhouse. 2 king beds, 1 queen bed, ...",We found the condo very spacious as described....,3.5,no,3bd+Loft...SOCO Area Convenience,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
336,191735468,349447,"Welcome to the very hip, unique and beautiful ...",Nice small studio home tucked in a quiet worki...,4.5,no,"Eastside Cabana - Near UT,Downtown","[TV, Cable TV, Internet, Wifi, Air conditionin..."
752,255194396,958172,Large studio in the back house. Tucked away i...,Great place to stay! Space is beautiful with g...,4.0,no,An ArtistÛªs Downtown Paradise....,"[Wifi, Air conditioning, Kitchen, Free street ..."
721,88445269,949054,The condo is in a superior location within wal...,Overall this rental was satisfactory. The pic...,2.5,yes,ª´of Downtown w/free parking! Sleeps 8 in 7 b...,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
185,157118137,217637,Take advantage of our Central East location to...,Felt like your own home. House has all the ess...,4.0,no,East Austin Charmer -- Super Cozy,"[TV, Internet, Wifi, Air conditioning, Kitchen..."


## Cleaning text columns

We need to clean the text columns to make them more useful for our model. We will do the following:

- Convert to lowercase
- Remove punctuation
- Remove stop words
- Lemmatize words

We will use nltk to do this.

In [172]:
# import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# \w+ keeps runs of word characters only, so punctuation is dropped and
# contractions are split at the apostrophe ("doesn't" -> "doesn", "t").
tokenizer = RegexpTokenizer(r'\w+')

# download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# remove negative words from stopwords
negative_words = [
    "no",
    "not",
    "nor",
    "neither",
    "never",
    "none",
    "doesnt",
    "couldnt",
    "shouldnt",
    "wouldnt",
    "cant",
    "cannot",
    "wont",
    "isnt",
    "arent",
    "wasnt",
    "werent",
    "hasnt",
    "havent",
    "hadnt",
    "dont",
    "didnt",
    "neednt",
    "very"
]
for w in negative_words:
    # discard() is a no-op for words that are not in the stop-word set
    stop_words.discard(w)

    # The list above spells contractions without an apostrophe, but NLTK's
    # English list stores them as "doesn't" plus the stem "doesn" -- and the
    # stem is exactly what the tokenizer above emits. Un-stop both spellings,
    # otherwise contraction negations are still filtered out of the text.
    # "cant" is skipped because its stem "can" is not a negation by itself.
    if w.endswith("nt") and w != "cant":
        stop_words.discard(w[:-1])          # "doesnt" -> "doesn"
        stop_words.discard(w[:-2] + "n't")  # "doesnt" -> "doesn't"

# extra words to treat as stop words on top of the NLTK list
additional_stopwords = ["airbnb", "austin", "texas", "home", "house"]
stop_words.update(additional_stopwords)


# download lemmatizer
# The "wordnet" corpus is fetched first, then the shared lemmatizer instance
# used by remove_stopwords_and_lemmatize() is created.
# NOTE(review): some NLTK versions may also need the "omw-1.4" resource for
# WordNetLemmatizer -- confirm against the installed version.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(tokens) -> list:
    """Drop stop words from ``tokens`` and lemmatize what is left.

    Args:
        tokens: Iterable of word tokens. Membership is tested against the
            module-level ``stop_words`` set before lemmatization.

    Returns:
        list: Lemmatized tokens, in their original order, with every token
        found in ``stop_words`` omitted.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

def preprocess_text(text: str) -> list:
    """Lowercase, tokenize, remove stop words from, and lemmatize ``text``.

    Args:
        text: Raw text to clean. A non-string value (for example the float
            NaN pandas uses for a missing cell, or ``None``) is treated as
            empty text instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        list: Cleaned tokens; ``[]`` when ``text`` is not a string.
    """
    # Missing values cannot be lowercased; they contribute no tokens.
    if not isinstance(text, str):
        return []

    # lowercase, then tokenize
    tokens = tokenizer.tokenize(text.lower())

    # remove stopwords and lemmatize
    return remove_stopwords_and_lemmatize(tokens)

# Run the same text cleaning over every free-text column, one at a time.
for text_column in ("description", "comments", "name"):
    print(f"Preprocessing {text_column}...")
    subset_labels_and_listing[text_column] = (
        subset_labels_and_listing[text_column].apply(preprocess_text)
    )

# display five random rows as a spot check
subset_labels_and_listing.sample(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing description...
Preprocessing comments...
Preprocessing name...


Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
367,102598692,363040,"[opened, october, 2012, nicely, furnished, eff...","[long, time, austinite, moved, away, one, favo...",4.666667,no,"[zilker, festival, suite]","[TV, Cable TV, Internet, Wifi, Air conditionin..."
518,144798596,698082,"[1940s, bungalow, quiet, street, thriving, cen...","[ellen, place, everything, hoped, great, locat...",4.5,no,"[soco, bungalow, bombshell, ûócharming, cozy, ...","[TV, Cable TV, Internet, Wifi, Air conditionin..."
159,10037826,155359,"[available, min, 90, day, overlooking, lake, t...","[stayed, weekend, great, experience, keith, go...",4.0,no,"[thetramhouse, 90, day, min]","[TV, Cable TV, Internet, Wifi, Air conditionin..."
444,33766112,585041,"[lovely, historic, highly, desirable, location...","[absolutely, perfect, sister, bachelorette, pa...",3.0,no,"[soco, 6, bedroom, travis, height]","[TV, Cable TV, Internet, Wifi, Air conditionin..."
461,69449529,632659,"[mid, century, modern, conveniently, located, ...","[selfishly, speaking, want, write, review, kee...",4.5,no,"[central, east, modern, guest]","[TV, Cable TV, Wifi, Air conditioning, Kitchen..."


In [173]:
# save it
# NOTE(review): description, comments, name and amenities hold Python lists at
# this point, so they are written to the CSV as text -- confirm that whatever
# reads this file parses them back into lists.
processed_fp = f"{DATA_FP}/processed/{GEO}_processed.csv"
subset_labels_and_listing.to_csv(processed_fp, index=False)