# Generate processed data
Here we will combine the subset of reviews with their labels and associated property description to create a processed dataset.

In [1]:
import pandas as pd

In [2]:
# Which GEO to process?
# This value is interpolated into the file names of every input read and of
# the output written by this notebook (e.g. "{GEO}_reviews_labels.csv"), so it
# has to match the prefix of the files present in the data directory.
GEO: str = "florida"

In [3]:
# Read the three input tables for the selected GEO.
# DATA_FP is a relative path, so it is resolved against the kernel's working directory.
DATA_FP = "../../data"

# Labels assigned to reviews.
LABELS = pd.read_csv(
    filepath_or_buffer=DATA_FP + f"/labels/{GEO}_reviews_labels.csv",
)

# The filtered subset of reviews.
SUBSET = pd.read_csv(
    filepath_or_buffer=DATA_FP + f"/filtered/{GEO}_reviews_filtered.csv",
)

# Raw listings; read with the "unicode_escape" codec and with chunked dtype
# inference disabled (low_memory=False).
LISTINGS = pd.read_csv(
    filepath_or_buffer=DATA_FP + f"/raw/{GEO}_listings.csv",
    encoding="unicode_escape",
    low_memory=False,
)

For now, we are dropping reviews labeled "maybe" or "other". The "maybe" reviews are ones whose label we weren't sure about. If we need more training data, we can revisit these reviews for labeling.

In [4]:
# Build the "maybe"/"other" mask once and reuse it for both the count and the filter.
is_maybe_or_other = LABELS["label"].isin(["maybe", "other"])

maybes_others = LABELS[is_maybe_or_other]
print(f"Number of maybe/other reviews: {len(maybes_others)}/{len(LABELS)}")

# Keep only the reviews with any other label.
LABELS = LABELS[~is_maybe_or_other]

Number of maybe/other reviews: 50/701


Join datasets together.

In [5]:
# Attach each review's label to its row from the filtered subset, matching on
# the review "id". Any other column present in both frames gets a
# "_labels" / "_subset" suffix.
subset_with_labels = LABELS.merge(
    SUBSET,
    on="id",
    suffixes=("_labels", "_subset"),
)

# Attach the listing each review belongs to. Columns that clash keep their
# name on the review side and receive a "_listings" suffix on the listing side.
subset_labels_and_listing = subset_with_labels.merge(
    LISTINGS,
    left_on="listing_id",
    right_on="id",
    suffixes=("", "_listings"),
)

Keep only the columns we need for the processed dataset. Also, rename the columns to be more descriptive, and clean the amenities column.

In [6]:
# Columns carried over into the processed dataset.
cols_to_keep = [
    "id", "listing_id",
    "description", "comments",
    "sentiment", "label",
    "name", "amenities",
]

# Select those columns and, in the same step, rename "id" to "review_id" so
# it cannot be confused with the listing identifier.
subset_labels_and_listing = subset_labels_and_listing[cols_to_keep].rename(
    columns={"id": "review_id"}
)

def parse_amenities(amenities):
    """Turn a raw amenities string into a list of amenity names.

    The raw value is a comma-separated list wrapped in braces, with some
    entries double-quoted, e.g. ``{TV,"Cable TV",Wifi}``. A square-bracketed
    form such as ``["TV", "Wifi"]`` is handled the same way.

    Args:
        amenities: The raw cell value. Anything that is not a string
            (for example NaN for a missing value) is treated as "no amenities".

    Returns:
        A list of amenity names with wrapping braces/brackets, double quotes
        and surrounding whitespace removed. Empty entries are dropped, so
        ``"{}"`` yields ``[]``.
    """
    # Missing values arrive as float NaN (or None), which have no string methods.
    if not isinstance(amenities, str):
        return []

    # Remove BOTH members of each delimiter pair; stripping only "{" and "]"
    # would leave a stray "}" glued to the last amenity.
    cleaned = amenities.translate(str.maketrans("", "", '{}[]"'))

    stripped_items = (item.strip() for item in cleaned.split(","))
    return [item for item in stripped_items if item]

# Replace each raw amenities string with its parsed list of amenity names.
subset_labels_and_listing["amenities"] = (
    subset_labels_and_listing["amenities"].apply(parse_amenities)
)

# Show a few random rows as a spot check.
subset_labels_and_listing.sample(n=5)

Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
608,142516857,6278267,Thank you looking at our Airbnb home - Villa G...,What an amazing place to stay! If you want to ...,4.5,good,Hibiscus Hideaway,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
580,201648190,6276494,"Just steps away from the beach, The Delray at ...",This was our first Airbnb experience. The apa...,3.5,good,The Delray at Cabana Carioca,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
235,73485649,2509161,NICE STUDIO IN HALLANDALE A FEW BLOCKS TO THE ...,The description is not entirely accurate - the...,2.0,mgood,STUDIO APT 3 Blocks to BEACH w/POOL,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
124,123961802,1127112,Hollywood Beach Resort is a beachfront histori...,Overall the place was as described . The bed w...,4.0,good,Studio with kitchen at Beach Hotel,"[TV, Internet, Wifi, Air conditioning, Pool, K..."
643,445619904,1559542,"Awesome space 1 mile south of downtown, 3 mile...",everything was quite good. I missed a dishwash...,4.0,mgood,Private Home Near Downtown & Beach remodeled!,"[TV, Cable TV, Internet, Wifi, Air conditionin..."


## Cleaning text columns

We need to clean the text columns to make them more useful for our model. We will do the following:

- Convert to lowercase
- Remove some specific phrases that we know are not useful
- Remove punctuation
- Remove stop words
- Lemmatize words

We will use nltk to do this.

In [7]:
# import necessary libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# Tokens are maximal runs of word characters, so punctuation is discarded
# during tokenization.
tokenizer = RegexpTokenizer(r'\w+')

# download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Words that must survive stopword removal: negations (they flip the meaning
# of a review) plus the intensifier "very".
# NOTE(review): the contractions are spelled without an apostrophe; presumably
# they are meant to match tokens after apostrophes have been dropped - verify
# against how the text is tokenized.
negative_words = [
    "no",
    "not",
    "nor",
    "neither",
    "never",
    "none",
    "doesnt",
    "couldnt",
    "shouldnt",
    "wouldnt",
    "cant",
    "cannot",
    "wont",
    "isnt",
    "arent",
    "wasnt",
    "werent",
    "hasnt",
    "havent",
    "hadnt",
    "dont",
    "didnt",
    "neednt",
    "very"
]
# Set difference silently ignores words that are not in the stopword set.
stop_words.difference_update(negative_words)

# Extra words to discard on top of the standard English stopwords.
# NOTE(review): "austin" and "texas" are location words for a different GEO
# than the one configured at the top of this notebook - confirm whether
# GEO-specific words should be listed here instead.
additional_stopwords = [
    "airbnb",
    "austin",
    "texas",
    "home",
    "house"
]
stop_words.update(additional_stopwords)

# remove some specific phrases, using regular expressions
# (patterns are applied to lowercased text)
specific_phrases = [
    # Redaction markers such as "(phone number hidden by airbnb)".
    # [^()]* keeps the match inside ONE pair of parentheses; a greedy ".*"
    # would swallow everything from the first "(" on the line up to the last
    # marker, deleting real text in between.
    r"\([^()]* hidden by airbnb\)",
]


# download lemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def remove_stopwords_and_lemmatize(tokens) -> list:
    """Filter out stopwords and lemmatize what is left.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        The lemmatized form of every token that is not in ``stop_words``,
        in the original order.
    """
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

def preprocess_text(text: str) -> list:
    """Normalize one free-text field into a list of cleaned tokens.

    Steps, in order: lowercase, delete every pattern in ``specific_phrases``,
    collapse "n't" contractions, tokenize with ``tokenizer``, then drop
    stopwords and lemmatize via ``remove_stopwords_and_lemmatize``.

    Args:
        text: The raw text. Anything that is not a string (for example NaN
            for a missing value) is treated as empty text.

    Returns:
        The list of processed tokens; ``[]`` for missing or empty text.
    """
    # Missing cells arrive as float NaN (or None), which cannot be lowercased.
    if not isinstance(text, str):
        return []

    # lowercase
    lowered = text.lower()

    for phrase in specific_phrases:
        lowered = re.sub(phrase, "", lowered)

    # Collapse "n't" contractions ("didn't" -> "didnt") before tokenizing.
    # The tokenizer splits at the apostrophe, turning "didn't" into "didn" and
    # "t"; both halves are stopwords, so the negation would be lost. The joined
    # spelling is the one listed in ``negative_words``.
    lowered = re.sub(r"n['’]t\b", "nt", lowered)

    # tokenize
    tokens = tokenizer.tokenize(lowered)

    # remove stopwords and lemmatize
    return remove_stopwords_and_lemmatize(tokens)

# Clean the three free-text columns one after another; each cell becomes a
# list of processed tokens.
print("Preprocessing description...")
subset_labels_and_listing["description"] = (
    subset_labels_and_listing["description"].apply(preprocess_text)
)

print("Preprocessing comments...")
subset_labels_and_listing["comments"] = (
    subset_labels_and_listing["comments"].apply(preprocess_text)
)

print("Preprocessing name...")
subset_labels_and_listing["name"] = (
    subset_labels_and_listing["name"].apply(preprocess_text)
)

# Show a few random rows as a spot check.
subset_labels_and_listing.sample(n=5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing description...
Preprocessing comments...
Preprocessing name...


Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
523,51575200,5628197,"[spacious, 1br, apartment, well, equipped, kit...","[matthew, not, left, instruction, get, key, he...",4.0,good,"[quiet, 1br, real, kitchen, central, ac]","[TV, Internet, Wifi, Air conditioning, Kitchen..."
526,126283984,5628197,"[spacious, 1br, apartment, well, equipped, kit...","[good, location, allergy, apartment, apartment...",4.0,good,"[quiet, 1br, real, kitchen, central, ac]","[TV, Internet, Wifi, Air conditioning, Kitchen..."
379,188516052,4492175,"[beautifully, decorated, room, wilton, manor, ...","[nice, place, stay, every, room, separate, ent...",4.0,good,"[red, room, heart, wilton, manor, heated, pool]","[TV, Cable TV, Wifi, Air conditioning, Pool, K..."
543,416434132,5773722,"[apartment, made, half, historical, 2, bedroom...","[nice, little, place, u, spend, night, heading...",4.0,good,"[sky, private, apartment, 2, pool, access]","[TV, Internet, Wifi, Air conditioning, Pool, K..."
9,142486695,216046,"[rest, assured, city, hollywood, state, florid...","[great, group, six, plenty, pool, table, large...",4.666667,good,"[ultimate, family, vacation]","[TV, Cable TV, Internet, Wifi, Air conditionin..."


In [8]:
# Write the processed frame for this GEO to the "processed" data folder,
# leaving the row index out of the file.
subset_labels_and_listing.to_csv(
    path_or_buf=DATA_FP + f"/processed/{GEO}_processed.csv",
    index=False,
)