## Feature Engineering on the processed dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Read in the data
GEO = ["texas", "florida"]

if isinstance(GEO, str):
    df = pd.read_csv(f"../../data/processed/{GEO}_processed.csv")
elif isinstance(GEO, list):
    dfs = []
    for geo in GEO:
        df = pd.read_csv(f"../../data/processed/{geo}_processed.csv")
        df["source"] = geo
        dfs.append(df)
    df = pd.concat(dfs)

# eval list columns
df.description = df.description.apply(lambda x: eval(x))
df.comments = df.comments.apply(lambda x: eval(x))
df.amenities = df.amenities.apply(lambda x: eval(x))

df.reset_index(drop=True, inplace=True)

print(df.shape)
df.head()

(1629, 9)


Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities,source
0,83097,5456,"[fabulous, location, walking, convention, cent...","[sylvia, very, nice, informal, relaxed, arrive...",4.333333,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
1,133337,5456,"[fabulous, location, walking, convention, cent...","[sylvia, picked, airport, gave, beautiful, eve...",4.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
2,150928,5456,"[fabulous, location, walking, convention, cent...","[lovely, time, enjoyed, apartment, clean, spac...",3.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
3,2706775,5456,"[fabulous, location, walking, convention, cent...","[sylvia, excellent, host, stayed, touch, made,...",4.5,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
4,8602878,5456,"[fabulous, location, walking, convention, cent...","[place, cute, little, self, contained, cottage...",4.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas


### Distribution of labels

First, let's look at the distribution of labels in the processed dataset.

In [3]:
mbad_pcnt = df[df.label == "mbad"].shape[0] / df.shape[0]
mgood_pcnt = df[df.label == "mgood"].shape[0] / df.shape[0]
mbad_or_mgood_pcnt = df[df.label.isin(["mbad", "mgood"])].shape[0] / df.shape[0]
print(
    f"mbad: {mbad_pcnt:.2%}, mgood: {mgood_pcnt:.2%}, mbad or mgood: {mbad_or_mgood_pcnt:.2%}"
)

bar_vals = df.label.value_counts()

fig = px.bar(
    bar_vals,
    x=bar_vals.index,
    y=bar_vals.values,
    title=f"Distribution of labels (GEO={GEO})",
    labels={"index": "Label", "y": "Count"}
)

fig.show()

mbad: 7.24%, mgood: 6.45%, mbad or mgood: 13.69%


## Correlation filter helper function
This function allows us to filter down to only include highly correlated features with the label.

In [4]:
def corr_filter(features, corr_thresh=0.05) -> pd.DataFrame:
    """Filter features based on correlation with label."""

    if corr_thresh is None:
        return features

    to_filter = features.copy()
    
    to_filter["review_id"] = df.review_id
    to_filter["label"] = df.label

    # drop na labels. Thanks Charlie :/
    to_filter = to_filter.dropna(subset=["label"])

    # Turn labels into numbers
    labels_list: list = to_filter.label.unique().tolist()
    to_filter.label = to_filter.label.apply(lambda label: labels_list.index(label))

    # calculate correlation of features with label
    features_to_filter = to_filter.drop(["review_id", "label"], axis=1)
    corrs = features_to_filter.corrwith(to_filter.label).sort_values(ascending=False)

    # filter features
    highly_correlated_features = corrs[corrs.abs() > corr_thresh].index
    # drop features with nan correlation
    highly_correlated_features = highly_correlated_features.dropna()

    # filter features
    print(f"Features above correlation threshold ({corr_thresh}):")
    print(highly_correlated_features.tolist())
    filtered = features[highly_correlated_features]
    
    return filtered

### Ngrams

What are the most common ngrams for the different types of reviews? What are the most common overlaps?

In [5]:
def get_consecutive_ngrams(review, n) -> list:
    """Helper function to get ngrams from a review.

    Args:
        review (str): The review to get ngrams from.
        n (int): The number of ngrams to get.

    Returns:
        list: the list of ngrams, joined by underscores.
    """
    if isinstance(review, str):
        review = review.split("")

    return ["_".join(review[i:i+n]) for i in range(len(review)-n-1)]

def series_to_ngrams(series: pd.Series, N):
    n_grams = series.apply(lambda x: get_consecutive_ngrams(x, N))
    return n_grams.explode()

def get_exclusive_sizes(good: set, bad: set, mgood: set, mbad: set) -> dict:
    """Get the sizes of the exclusive sets"""

    only_good = good - bad - mgood - mbad
    only_bad = bad - good - mgood - mbad
    only_mgood = mgood - good - bad - mbad
    only_mbad = mbad - good - bad - mgood
    
    return {
        "good": len(only_good),
        "bad": len(only_bad),
        "mgood": len(only_mgood),
        "mbad": len(only_mbad)
    }

We can look at the number of unique values to each label for different values of N on a line chart.

In [6]:
import plotly.express as px

intersection_sizes = []
for n in [1, 2, 3, 4, 5, 6]:

    good_ngrams = series_to_ngrams(df[df.label == "good"].comments, n).value_counts()
    bad_ngrams = series_to_ngrams(df[df.label == "bad"].comments, n).value_counts()
    mgood_ngrams = series_to_ngrams(df[df.label == "mgood"].comments, n).value_counts()
    mbad_ngrams = series_to_ngrams(df[df.label == "mbad"].comments, n).value_counts()
    intersection_sizes_n = get_exclusive_sizes(
        set(good_ngrams.index),
        set(bad_ngrams.index),
        set(mgood_ngrams.index),
        set(mbad_ngrams.index)
    )

    intersection_sizes_n["n"] = n

    intersection_sizes.append(intersection_sizes_n)


# prepare data
is_df = pd.DataFrame(intersection_sizes)
is_df = is_df.melt(id_vars="n", var_name="label", value_name="count")
is_df["count"] = is_df["count"].astype(int)
is_df["n"] = is_df["n"].astype(int)

# exclude good 
is_df = is_df[is_df.label != "good"]

# plot a line chart
px.line(is_df, 
    x="n", y="count",
    color="label",
    title=f"Unique # of {GEO} n-grams for each label", 
    labels={"n": "N-gram size", "count": "Unique # of n-grams"},
    markers=True
    )

We can see that the number of unique ngrams increases until about n=4, and then levels off. What are the top 10 4-grams for misleading reviews?

In [7]:
import numpy as np
from collections import Counter

# get subset of misleading reviews
subset = df[df.label.isin(["mbad", "mgood"])]

# get groups of N consecutive words
N = 3
n_grams = series_to_ngrams(subset.comments, N).to_list()

print(f"Top 10 most common {N}-grams in misleading reviews:")
Counter(n_grams).most_common(10)

Top 10 most common 3-grams in misleading reviews:


[('within_walking_distance', 10),
 ('would_not_recommend', 7),
 ('living_room_furniture', 4),
 ('not_recommend_staying', 4),
 ('not_big_deal', 4),
 ('great_place_stay', 4),
 ('let_u_know', 4),
 ('not_work_not', 3),
 ('give_4_star', 3),
 ('made_u_feel', 3)]

Let's manually remove some of these that don't make sense

Now let's look at a word cloud of the N-grams in misleading reviews.

In [8]:
from wordcloud import WordCloud
from PIL import Image

ngrams_str = " ".join(n_grams)

# make house-shaped mask
mask = np.array(Image.open("../../house_shape.png"))
# fill transparent areas with white
mask[mask[:, :, 3] == 0] = [255, 255, 255, 255]

# generate wordcloud
wc = WordCloud(
    background_color="white", 
    max_words=1000, 
    width=mask.shape[0]*5,
    height=mask.shape[1]*5,
    mask = mask,
    contour_width=3,
    contour_color='steelblue',
    colormap="twilight_shifted"
).generate(ngrams_str)

wc

# show wordcloud
fig = px.imshow(wc, title=f"Most common {N}-grams in misleading reviews", width=700, height=700)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()

Here is a reusable function to add ngrams features to a dataframe.

In [9]:
def ngrams_features(features, n, prefix="ngrams_", corr_thresh=0.05):
    """Add n-gram features to the features dataframe."""

    # one-hot encode ngrams
    df["ngrams"] = df.comments.apply(lambda x: set(get_consecutive_ngrams(x, 3)))

    # get set of ngrams
    ng_set = set(series_to_ngrams(subset.comments, n).to_list())

    # one-hot encode ngrams
    ngram_features = {}
    for ngram in ng_set:
        ngram_features[prefix + ngram] = df.ngrams.apply(lambda ngrams: 1 if ngram in ngrams else 0)
    ngram_df = pd.DataFrame(ngram_features)
    
    # filter features on correlation with label
    ngrams_df = corr_filter(ngram_df, corr_thresh=corr_thresh)

    # add ngram features to features dataframe
    features = pd.concat([features, ngrams_df], axis=1)


    return features

In [10]:
features = pd.DataFrame()
ngrams_features(features, 3, prefix="ngrams_", corr_thresh=0.09)

Features above correlation threshold (0.09):
['ngrams_would_not_recommend', 'ngrams_no_toilet_paper', 'ngrams_not_recommend_staying', 'ngrams_could_not_access', 'ngrams_per_night_not', 'ngrams_very_old_not', 'ngrams_call_text_message', 'ngrams_phone_call_text', 'ngrams_left_first_night', 'ngrams_old_not_well', 'ngrams_worse_experience_ever', 'ngrams_refund_second_night', 'ngrams_paid_two_night', 'ngrams_expected_based_description', 'ngrams_need_deep_cleaning', 'ngrams_not_really_clean', 'ngrams_got_no_response', 'ngrams_toilet_paper_buy', 'ngrams_never_met_host', 'ngrams_property_management_company', 'ngrams_nothing_like_picture', 'ngrams_freezer_not_work', 'ngrams_obviously_not_cleaned', 'ngrams_shower_curtain_covered', 'ngrams_never_responded_message', 'ngrams_not_clean_arrived', 'ngrams_not_feel_comfortable', 'ngrams_very_bad_shape', 'ngrams_start_good_thing', 'ngrams_got_no_answer', 'ngrams_time_opened_door', 'ngrams_spend_lot_time', 'ngrams_living_room_couch', 'ngrams_write_negati

Unnamed: 0,ngrams_would_not_recommend,ngrams_no_toilet_paper,ngrams_not_recommend_staying,ngrams_could_not_access,ngrams_per_night_not,ngrams_very_old_not,ngrams_call_text_message,ngrams_phone_call_text,ngrams_left_first_night,ngrams_old_not_well,...,ngrams_need_major_overhaul,ngrams_sleep_bedroom_door,ngrams_guest_not_able,ngrams_take_cold_shower,ngrams_place_sleep_10,ngrams_staying_another_unit,ngrams_told_cleaning_lady,ngrams_big_issue_u,ngrams_let_start_saying,ngrams_felt_compelled_write
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1624,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1625,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1627,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## "Mentioned Amenities" Features
Was an amenity that was included in the listing mentioned in the review?

First, we need to do some cleaning amenities to make them easier to match.

In [11]:
import re

def clean_amenities(amenities):
    """Clean the amenities column."""

    cleaned = []

    # basic cleaning
    for amenity in amenities:
        # remove quotes
        amenity = amenity.replace('"', "")
        # remove anything in parentheses or brackets
        amenity = re.sub(r"\(.*\)", "", amenity)
        amenity = re.sub(r"\[.*\]", "", amenity)
        # strip whitespace
        amenity = amenity.strip()
        # lowercase
        amenity = amenity.lower()

        cleaned.append(amenity)

    # split entries with a slash, "and", or "or"
    for to_split_on in ["/", " and ", " or "]:
        cleaned = [amenity.split(to_split_on) for amenity in cleaned]
        cleaned = [item.strip() for sublist in cleaned for item in sublist]

    # remove empty strings
    cleaned = [amenity for amenity in cleaned if amenity != ""]

    return cleaned

df.amenities = df.amenities.apply(clean_amenities)

Look at some examples of the cleaned amenities.

In [12]:
df.amenities.sample(1).values[0]

['tv',
 'internet',
 'wifi',
 'air conditioning',
 'pool',
 'kitchen',
 'gym',
 'elevator',
 'hot tub',
 'heating',
 'family',
 'kid friendly',
 'washer',
 'dryer',
 'smoke detector',
 'essentials',
 'shampoo',
 'hangers',
 'hair dryer',
 'iron',
 'self check-in',
 'building staff',
 'hot water',
 'microwave',
 'coffee maker',
 'refrigerator',
 'dishes',
 'silverware',
 'long term stays allowed',
 'beachfront',
 'paid parking on premises}']

Calculate our amenities features, and look at the correlation between them and the label.

In [13]:
def amenities_features(features: pd.DataFrame, corr_thresh=None, prefix="amenities_") -> pd.DataFrame:
    print("Adding amenities features...")
    amenities_features_df = pd.DataFrame()
    # 1. Find the amenities in the listings
    for index, row in df.iterrows():
        # 1. Find the amenities in the listings
        amenities = row.amenities

        # Add one row to the features dataframe using pd.concat
        amenities_features_df = pd.concat([amenities_features_df, pd.DataFrame(columns=amenities_features_df.columns)])

        # 2. For each amenity, see if it is present in the review
        for amenity in amenities:
            if amenity in row.comments:
                # 3. If it is present, add 1 for that feature
                if amenity in amenities_features_df.columns:
                    amenities_features_df.loc[index, amenity] = 1
                #   If the amenity does not already exist from another review, add it to the features dataframe
                else:
                    amenities_features_df.loc[index, amenity] = 0
    
    # 4. fill missing values with 0
    amenities_features_df = amenities_features_df.fillna(0)

    # 6. Only keep amenities features that have a correlation with the label above a certain threshold
    amenities_features_df = corr_filter(amenities_features_df, corr_thresh=corr_thresh)

    # prefix features
    amenities_features_df = amenities_features_df.add_prefix(prefix)

    # 5. Add the features to the greater features dataframe
    features = pd.concat([features, amenities_features_df], axis=1)

    return features

features = pd.DataFrame()
features_with_amenities = amenities_features(features, corr_thresh=0.05, prefix="")
features_with_amenities["review_id"] = df.review_id
features_with_amenities["label"] = df.label
features_with_amenities.head()

Adding amenities features...
Features above correlation threshold (0.05):
['lockbox', 'dishwasher', 'pool', 'stove', 'keypad', 'bathtub', 'kitchen', 'balcony', 'shampoo', 'garden', 'cat', 'family', 'breakfast']


Unnamed: 0,lockbox,dishwasher,pool,stove,keypad,bathtub,kitchen,balcony,shampoo,garden,cat,family,breakfast,review_id,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83097,good
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,150928,good
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,8602878,good
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17827296,good
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22768604,good


In [14]:
# OHE labels
labels_list: list = features_with_amenities.label.unique().tolist()
dummies = features_with_amenities.label.apply(lambda label: labels_list.index(label))

# calculate correlations with the label
corrs = features_with_amenities.drop("review_id", axis=1).corrwith(dummies).sort_values(ascending=False)
corrs





lockbox       0.119924
dishwasher    0.091862
pool          0.084724
stove         0.075858
keypad        0.071675
bathtub       0.062482
kitchen       0.054540
balcony      -0.050448
shampoo      -0.054802
garden       -0.072476
cat          -0.072627
family       -0.078019
breakfast    -0.081856
dtype: float64

## Word Embeddings
Another way to look at the data is to use word embeddings. We can use the word2vec model from gensim to create word embeddings for each word in the dataset. We can then take the average of all word vectors in a review to get a vector representation of the review. For now, let's embed into a 10-dimensional space.

In [35]:
from gensim.models.word2vec import Word2Vec
from typing import Tuple

def embeddings_features(features, emb_vector_size=10, col="comments") -> Tuple[pd.DataFrame, Word2Vec]:

    print("Creating model...")
    model = Word2Vec(df[col].to_list(), 
            min_count=1,
            vector_size=emb_vector_size,
            workers=3, 
            window=3, 
            sg=1
        )
    
    # generate embeddings
    print("Generating embeddings...")
    get_review_embeddings = lambda review: np.mean([model.wv[word] for word in review if word in model.wv], axis=0)
    embeddings = np.array([get_review_embeddings(review) for review in df[col].to_list()])
    
    for i in range(embeddings.shape[1]):
        features[f"embedding_{col}_{i}"] = embeddings[:, i]
    
    return features, model

# Generate Features

With what we know, let's generate some features. I've provided a function for each type of feature, so we can mix and match.

In [30]:
features = pd.DataFrame()

# get the embeddings
embeddings_comments, w2vmodel_comments = embeddings_features(features, col="comments", emb_vector_size=10)
embeddings_description, w2vmodel_description = embeddings_features(features, col="description", emb_vector_size=10)
features = pd.concat([features, embeddings_comments, embeddings_description], axis=1)

# for some reason there are duplicate columns after this. Remove them.
features = features.loc[:,~features.columns.duplicated()].copy()

# get 3-grams with 0.09  and 0.1 correlation thresholds
features = ngrams_features(features, n=3, corr_thresh=0.09, prefix="3gram09_")
features = ngrams_features(features, n=3, corr_thresh=0.1, prefix="3gram10_")

# get the amenities
features = amenities_features(features, corr_thresh=0.05, prefix="amenity005_")
features = amenities_features(features, corr_thresh=None, prefix="amenityall_")

# fill missing values with 0
features = features.fillna(0)

features["review_id"] = df.review_id
features["label"] = df.label

print(features.shape)

features.head()

Creating model...
Generating embeddings...
Creating model...
Generating embeddings...
Features above correlation threshold (0.09):
['3gram09_would_not_recommend', '3gram09_no_toilet_paper', '3gram09_not_recommend_staying', '3gram09_could_not_access', '3gram09_per_night_not', '3gram09_very_old_not', '3gram09_call_text_message', '3gram09_phone_call_text', '3gram09_left_first_night', '3gram09_old_not_well', '3gram09_worse_experience_ever', '3gram09_refund_second_night', '3gram09_paid_two_night', '3gram09_expected_based_description', '3gram09_need_deep_cleaning', '3gram09_not_really_clean', '3gram09_got_no_response', '3gram09_toilet_paper_buy', '3gram09_never_met_host', '3gram09_property_management_company', '3gram09_nothing_like_picture', '3gram09_freezer_not_work', '3gram09_obviously_not_cleaned', '3gram09_shower_curtain_covered', '3gram09_never_responded_message', '3gram09_not_clean_arrived', '3gram09_not_feel_comfortable', '3gram09_very_bad_shape', '3gram09_start_good_thing', '3gram09_

Unnamed: 0,embedding_comments_0,embedding_comments_1,embedding_comments_2,embedding_comments_3,embedding_comments_4,embedding_comments_5,embedding_comments_6,embedding_comments_7,embedding_comments_8,embedding_comments_9,...,amenityall_silverware,amenityall_bathtub,amenityall_lockbox,amenityall_crib,amenityall_elevator,amenityall_gym,amenityall_oven,amenityall_toilet,review_id,label
0,0.172036,-0.637597,0.612706,-0.143775,0.107823,-0.262747,0.943272,1.330471,-0.957359,-0.665829,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83097,good
1,0.167699,-0.5653,0.554699,-0.221078,0.18865,-0.42898,0.80081,1.322331,-0.830094,-0.90698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133337,good
2,0.087849,-0.681386,0.636865,-0.132271,0.068009,-0.221255,1.035058,1.249915,-1.004797,-0.703693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150928,good
3,0.0943,-0.676064,0.689126,-0.222929,0.110279,-0.348316,0.964549,1.553874,-0.992685,-0.720572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2706775,good
4,0.176663,-0.511448,0.665672,-0.197158,0.05405,-0.158278,0.949448,1.282783,-0.942654,-0.67005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8602878,good


## Save the features

Here we will save the features to a csv. We will also save the trained word2vec model, so we can use it later.

In [22]:
# drop the emeddings features
# embd_cols = [col for col in features.columns if "embedding_" in col]
# features = features.drop(embd_cols, axis=1)

In [33]:
if isinstance(GEO, list):
    geo_display_name = "_".join(GEO)
else:
    geo_display_name = GEO

features.to_csv(f"../../data/processed/features_{geo_display_name}_no_tsne_no_doubles.csv", index=False)

In [34]:
# save the w2v models
w2vmodel_comments.save(f"../../models/w2vmodel_comments_{geo_display_name}_no_tsne.model")
w2vmodel_description.save(f"../../models/w2vmodel_description_{geo_display_name}_no_tsne.model")