## Feature Engineering on the processed dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [42]:
# Read in the data
GEO = ["texas", "florida"]

if isinstance(GEO, str):
    df = pd.read_csv(f"../../data/processed/{GEO}_processed.csv")
elif isinstance(GEO, list):
    dfs = []
    for geo in GEO:
        df = pd.read_csv(f"../../data/processed/{geo}_processed.csv")
        df["source"] = geo
        dfs.append(df)
    df = pd.concat(dfs)

# eval list columns
df.description = df.description.apply(lambda x: eval(x))
df.comments = df.comments.apply(lambda x: eval(x))
df.amenities = df.amenities.apply(lambda x: eval(x))

df.reset_index(drop=True, inplace=True)

print(df.shape)
df.head()

(1629, 9)


Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities,source
0,83097,5456,"[fabulous, location, walking, convention, cent...","[sylvia, very, nice, informal, relaxed, arrive...",4.333333,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
1,133337,5456,"[fabulous, location, walking, convention, cent...","[sylvia, picked, airport, gave, beautiful, eve...",4.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
2,150928,5456,"[fabulous, location, walking, convention, cent...","[lovely, time, enjoyed, apartment, clean, spac...",3.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
3,2706775,5456,"[fabulous, location, walking, convention, cent...","[sylvia, excellent, host, stayed, touch, made,...",4.5,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas
4,8602878,5456,"[fabulous, location, walking, convention, cent...","[place, cute, little, self, contained, cottage...",4.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","[TV, Wifi, Air conditioning, Kitchen, Pets liv...",texas


### Distribution of labels

First, let's look at the distribution of labels in the processed dataset.

In [4]:
mbad_pcnt = df[df.label == "mbad"].shape[0] / df.shape[0]
mgood_pcnt = df[df.label == "mgood"].shape[0] / df.shape[0]
mbad_or_mgood_pcnt = df[df.label.isin(["mbad", "mgood"])].shape[0] / df.shape[0]
print(
    f"mbad: {mbad_pcnt:.2%}, mgood: {mgood_pcnt:.2%}, mbad or mgood: {mbad_or_mgood_pcnt:.2%}"
)

bar_vals = df.label.value_counts()

fig = px.bar(
    bar_vals,
    x=bar_vals.index,
    y=bar_vals.values,
    title=f"Distribution of labels (GEO={GEO})",
    labels={"index": "Label", "y": "Count"}
)

fig.show()

mbad: 7.24%, mgood: 6.45%, mbad or mgood: 13.69%


### Ngrams

What are the most common ngrams for the different types of reviews? What are the most common overlaps?

In [5]:
def get_consecutive_ngrams(review, n) -> list:
    """Helper function to get ngrams from a review.

    Args:
        review (str): The review to get ngrams from.
        n (int): The number of ngrams to get.

    Returns:
        list: the list of ngrams, joined by underscores.
    """
    if isinstance(review, str):
        review = review.split("")

    return ["_".join(review[i:i+n]) for i in range(len(review)-n-1)]

def series_to_ngrams(series: pd.Series, N):
    n_grams = series.apply(lambda x: get_consecutive_ngrams(x, N))
    return n_grams.explode()

def get_exclusive_sizes(good: set, bad: set, mgood: set, mbad: set) -> dict:
    """Get the sizes of the exclusive sets"""

    only_good = good - bad - mgood - mbad
    only_bad = bad - good - mgood - mbad
    only_mgood = mgood - good - bad - mbad
    only_mbad = mbad - good - bad - mgood
    
    return {
        "good": len(only_good),
        "bad": len(only_bad),
        "mgood": len(only_mgood),
        "mbad": len(only_mbad)
    }

We can look at the number of unique values to each label for different values of N on a line chart.

In [6]:
import plotly.express as px

intersection_sizes = []
for n in [1, 2, 3, 4, 5, 6]:

    good_ngrams = series_to_ngrams(df[df.label == "good"].comments, n).value_counts()
    bad_ngrams = series_to_ngrams(df[df.label == "bad"].comments, n).value_counts()
    mgood_ngrams = series_to_ngrams(df[df.label == "mgood"].comments, n).value_counts()
    mbad_ngrams = series_to_ngrams(df[df.label == "mbad"].comments, n).value_counts()
    intersection_sizes_n = get_exclusive_sizes(
        set(good_ngrams.index),
        set(bad_ngrams.index),
        set(mgood_ngrams.index),
        set(mbad_ngrams.index)
    )

    intersection_sizes_n["n"] = n

    intersection_sizes.append(intersection_sizes_n)


# prepare data
is_df = pd.DataFrame(intersection_sizes)
is_df = is_df.melt(id_vars="n", var_name="label", value_name="count")
is_df["count"] = is_df["count"].astype(int)
is_df["n"] = is_df["n"].astype(int)

# exclude good 
is_df = is_df[is_df.label != "good"]

# plot a line chart
px.line(is_df, 
    x="n", y="count",
    color="label",
    title=f"Unique # of {GEO} n-grams for each label", 
    labels={"n": "N-gram size", "count": "Unique # of n-grams"},
    markers=True
    )

We can see that the number of unique ngrams increases until about n=4, and then levels off. What are the top 10 4-grams for misleading reviews?

In [7]:
import numpy as np
from collections import Counter

# generate wordcloud for misleading reviews
subset = df[df.label.isin(["mbad", "mgood"])]

# get groups of N consecutive words
N = 4
n_grams = series_to_ngrams(subset.comments, N).to_list()

print(f"Top 10 most common {N}-grams in misleading reviews:")
Counter(n_grams).most_common(10)

Top 10 most common 4-grams in misleading reviews:


[('would_not_recommend_staying', 3),
 ('give_u_partial_refund', 2),
 ('thing_within_walking_distance', 2),
 ('would_not_recommend_anyone', 2),
 ('main_living_area_not', 2),
 ('host_allowed_u_check', 2),
 ('allowed_u_check_early', 2),
 ('come_next_day_fix', 2),
 ('place_may_sleep_9', 2),
 ('one_roll_toilet_paper', 2)]

Let's manually remove some of these that don't make sense

Now let's look at a word cloud of the N-grams in misleading reviews.

In [8]:
from wordcloud import WordCloud
from PIL import Image

ngrams_str = " ".join(n_grams)

# make house-shaped mask
mask = np.array(Image.open("../../house_shape.png"))
# fill transparent areas with white
mask[mask[:, :, 3] == 0] = [255, 255, 255, 255]

# generate wordcloud
wc = WordCloud(
    background_color="white", 
    max_words=1000, 
    width=mask.shape[0]*5,
    height=mask.shape[1]*5,
    mask = mask,
    contour_width=3,
    contour_color='steelblue',
    colormap="twilight_shifted"
).generate(ngrams_str)

wc

# show wordcloud
fig = px.imshow(wc, title=f"Most common {N}-grams in misleading reviews", width=700, height=700)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()

Here is a reusable function to add ngrams features to a dataframe.

In [9]:
def ngrams_features(features, n):
    """Add n-gram features to the features dataframe."""

    # one-hot encode ngrams
    df["ngrams"] = df.comments.apply(lambda x: set(get_consecutive_ngrams(x, 3)))

    # get ngrams that are common in misleading reviews
    misleading = df[df.label.isin(["mbad", "mgood"])]
    ng_set = set(series_to_ngrams(subset.comments, n).to_list())

    # one-hot encode ngrams
    ngram_features = {}
    for ngram in ng_set:
        ngram_features[ngram] = df.ngrams.apply(lambda ngrams: 1 if ngram in ngrams else 0)
    ngram_df = pd.DataFrame(ngram_features)
    features = pd.concat([features, ngram_df], axis=1)

    # Only keep ngram features that appear in at least 3 reviews
    min_num_reviews = 3
    num_reviews_ngram_is_in = features.apply(lambda x: x > 0).sum()
    ngram_features_to_drop= num_reviews_ngram_is_in[num_reviews_ngram_is_in < min_num_reviews].index

    features = features.drop(ngram_features_to_drop, axis=1)

    return features

## "Mentioned Amenities" Features
Was an amenity that was included in the listing mentioned in the review?

First, we need to do some cleaning amenities to make them easier to match.

In [43]:
import re

def clean_amenities(amenities):
    """Clean the amenities column."""

    cleaned = []

    # basic cleaning
    for amenity in amenities:
        # remove quotes
        amenity = amenity.replace('"', "")
        # remove anything in parentheses or brackets
        amenity = re.sub(r"\(.*\)", "", amenity)
        amenity = re.sub(r"\[.*\]", "", amenity)
        # strip whitespace
        amenity = amenity.strip()
        # lowercase
        amenity = amenity.lower()

        cleaned.append(amenity)

    # split entries with a slash, "and", or "or"
    for to_split_on in ["/", " and ", " or "]:
        cleaned = [amenity.split(to_split_on) for amenity in cleaned]
        cleaned = [item.strip() for sublist in cleaned for item in sublist]

    # remove empty strings
    cleaned = [amenity for amenity in cleaned if amenity != ""]

    return cleaned

df.amenities = df.amenities.apply(clean_amenities)

Look at some examples of the cleaned amenities.

In [46]:
df.amenities.sample(1).values[0]

['tv',
 'cable tv',
 'wifi',
 'air conditioning',
 'kitchen',
 'free parking on premises',
 'smoking allowed',
 'pets live on this property',
 'dog',
 'cat',
 'heating',
 'suitable for events',
 'washer',
 'dryer}']

In [57]:
def amenities_features(features: pd.DataFrame) -> pd.DataFrame:
    print("Adding amenities features...")
    amenities_features_df = pd.DataFrame()
    # 1. Find the amenities in the listings
    for index, row in df.iterrows():
        # 1. Find the amenities in the listings
        amenities = row.amenities

        # Add one row to the features dataframe using pd.concat
        amenities_features_df = pd.concat([amenities_features_df, pd.DataFrame(columns=amenities_features_df.columns)])

        # 2. For each amenity, see if it is present in the review
        for amenity in amenities:
            if amenity in row.comments:
                # 3. If it is present, add 1 for that feature
                if amenity in amenities_features_df.columns:
                    amenities_features_df.loc[index, amenity] = 1
                #   If the amenity does not already exist from another review, add it to the features dataframe
                else:
                    amenities_features_df.loc[index, amenity] = 0
    
    # 4. fill missing values with 0
    amenities_features_df = amenities_features_df.fillna(0)

    # 5. Add the features to the features dataframe
    features = pd.concat([features, amenities_features_df], axis=1)

    return features

features = pd.DataFrame()
features_with_amenities = amenities_features(features)
features_with_amenities.sum().sort_values(ascending=False).head(10)

Adding amenities features...


kitchen      304.0
pool         235.0
family       179.0
tv            97.0
wifi          82.0
backyard      52.0
dog           46.0
dryer         45.0
breakfast     36.0
washer        34.0
dtype: float64

## Word Embeddings and T-SNE
Another way to look at the data is to use word embeddings. We can use the word2vec model from gensim to create word embeddings for each word in the dataset. We can then take the average of all word vectors in a review to get a vector representation of the review. We can then use T-SNE to reduce the dimensionality of the word embeddings to 3 dimensions so we can plot them.

In [50]:
from gensim.models.word2vec import Word2Vec

def embeddings_features(features, emb_vector_size=50) -> pd.DataFrame:

    print("Creating model...")
    model = Word2Vec(df.comments.to_list(), 
            min_count=1,
            vector_size=emb_vector_size,
            workers=3, 
            window=3, 
            sg=1
        )
    
    # generate embeddings
    print("Generating embeddings...")
    get_review_embeddings = lambda review: np.mean([model.wv[word] for word in review if word in model.wv], axis=0)
    embeddings = np.array([get_review_embeddings(review) for review in df.comments.to_list()])
    
    for i in range(embeddings.shape[1]):
        features[f"embedding_{i}"] = embeddings[:, i]
    
    return features


def tsne_features(features, dims=3):

    from sklearn.manifold import TSNE
    print(f"Applying t-SNE... (dim={dims})")
    tsne = TSNE(n_components=dims, random_state=0)
    embeddings_2d = tsne.fit_transform(features)
    embd_df = pd.DataFrame(embeddings_2d, columns=[f"tsne_{i}" for i in range(dims)])

    return embd_df

Let's apply this to the misleading reviews, and then reduce down to 2 dimensions using T-SNE.

In [52]:
# ignore PerformanceWarning
import warnings
from pandas.errors import PerformanceWarning
warnings.filterwarnings("ignore", category=PerformanceWarning)

embeddings_df = pd.DataFrame()
embeddings_df["review_id"] = df.review_id
embeddings_df["label"] = df.label
embeddings_df = embeddings_features(embeddings_df)

# apply tsne to reduce to `dims` dimensions
dims = 3
embd_df = tsne_features(embeddings_df.drop(columns=["review_id", "label"], axis=1), dims=dims)
embeddings_df = embeddings_df.reset_index()
embd_df["review_id"] = embeddings_df.review_id
embd_df["label"] = embeddings_df.label

# plot 3d scatter with plotly
fig = px.scatter_3d(embd_df, x="tsne_0", y="tsne_1", z="tsne_2", color="label", title="t-SNE embeddings of reviews", )
# make size of point smaller and opaque
fig.update_traces(marker=dict(size=4, opacity=0.5))
# make inital zoom bigger
fig.update_layout(scene=dict(camera_eye=dict(x=0.9, y=0.9, z=0.9)))
fig.show()

Creating model...
Generating embeddings...
Applying t-SNE... (dim=3)


It looks like the "mbad" labels are mostly together. The "mgood" are kind of in the  middle, but there is still lots of noise of "good" labels in that region.

# Generate Features

With what we know, let's generate some features. I've provided a function for each type of feature, so we can mix and match.

In [59]:
features = pd.DataFrame()

# get the embeddings
features = embeddings_features(features)

# apply tsne to reduce to `dims` dimensions
dims = 3
features = tsne_features(features, dims=dims)

# get the amenities
features = amenities_features(features)

# fill missing values with 0
features = features.fillna(0)

features["review_id"] = df.review_id
features["label"] = df.label

print(features.shape)
features.head()

Creating model...
Generating embeddings...
Applying t-SNE... (dim=3)
Adding amenities features...
(1629, 36)


Unnamed: 0,tsne_0,tsne_1,tsne_2,kitchen,garden,microwave,backyard,iron,shampoo,wifi,...,silverware,bathtub,lockbox,crib,elevator,gym,oven,toilet,review_id,label
0,-5.613824,-3.080757,2.447573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83097,good
1,-2.376282,7.860886,2.428963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133337,good
2,-5.809454,-2.778553,-1.081195,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150928,good
3,-10.103469,-1.812967,-2.710917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2706775,good
4,1.280295,-2.094504,4.721372,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8602878,good


In [61]:
corr = features.drop(columns=["review_id", "label"], axis=1).corr()

fig = px.imshow(corr, title="Correlation matrix of features")
fig.show()

In [63]:
if isinstance(GEO, list):
    geo_display_name = "_".join(GEO)
else:
    geo_display_name = GEO

features.to_csv(f"../../data/processed/features_{geo_display_name}.csv", index=False)