## EDA and Feature Engineering on the processed dataset

In [348]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [349]:
# Read in the data
GEO = "texas"

if isinstance(GEO, str):
    df = pd.read_csv(f"../../data/processed/{GEO}_processed.csv")
elif isinstance(GEO, list):
    dfs = []
    for geo in GEO:
        df = pd.read_csv(f"../../data/processed/{geo}_processed.csv")
        dfs.append(df)
    df = pd.concat(dfs)

df.description = df.description.apply(lambda x: eval(x))
df.comments = df.comments.apply(lambda x: eval(x))

print(df.shape)
df.head()

(978, 8)


Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
0,83097,5456,"[fabulous, location, walking, convention, cent...","[sylvia, very, nice, informal, relaxed, arrive...",4.333333,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","['TV', 'Wifi', 'Air conditioning', 'Kitchen', ..."
1,133337,5456,"[fabulous, location, walking, convention, cent...","[sylvia, picked, airport, gave, beautiful, eve...",4.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","['TV', 'Wifi', 'Air conditioning', 'Kitchen', ..."
2,150928,5456,"[fabulous, location, walking, convention, cent...","[lovely, time, enjoyed, apartment, clean, spac...",3.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","['TV', 'Wifi', 'Air conditioning', 'Kitchen', ..."
3,2706775,5456,"[fabulous, location, walking, convention, cent...","[sylvia, excellent, host, stayed, touch, made,...",4.5,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","['TV', 'Wifi', 'Air conditioning', 'Kitchen', ..."
4,8602878,5456,"[fabulous, location, walking, convention, cent...","[place, cute, little, self, contained, cottage...",4.0,good,"['walk', '6th', 'rainey', 'st', 'convention', ...","['TV', 'Wifi', 'Air conditioning', 'Kitchen', ..."


### Distribution of labels

First, let's look at the distribution of labels in the processed dataset.

In [350]:
mbad_pcnt = df[df.label == "mbad"].shape[0] / df.shape[0]
mgood_pcnt = df[df.label == "mgood"].shape[0] / df.shape[0]
mbad_or_mgood_pcnt = df[df.label.isin(["mbad", "mgood"])].shape[0] / df.shape[0]
print(
    f"mbad: {mbad_pcnt:.2%}, mgood: {mgood_pcnt:.2%}, mbad or mgood: {mbad_or_mgood_pcnt:.2%}"
)

bar_vals = df.label.value_counts()

fig = px.bar(
    bar_vals,
    x=bar_vals.index,
    y=bar_vals.values,
    title=f"Distribution of labels (GEO={GEO})",
    labels={"index": "Label", "y": "Count"}
)

fig.show()

mbad: 7.36%, mgood: 6.54%, mbad or mgood: 13.91%


### Ngrams

What are the most common ngrams for the different types of reviews? What are the most common overlaps?

In [351]:
def get_consecutive_ngrams(review, n) -> list:
    """Helper function to get ngrams from a review.

    Args:
        review (str): The review to get ngrams from.
        n (int): The number of ngrams to get.

    Returns:
        list: the list of ngrams, joined by underscores.
    """
    if isinstance(review, str):
        review = review.split("")

    return ["_".join(review[i:i+n]) for i in range(len(review)-n-1)]

def series_to_ngrams(series: pd.Series, N):
    n_grams = series.apply(lambda x: get_consecutive_ngrams(x, N))
    return n_grams.explode()

def get_exclusive_sizes(good: set, bad: set, mgood: set, mbad: set) -> dict:
    """Get the sizes of the exclusive sets"""

    only_good = good - bad - mgood - mbad
    only_bad = bad - good - mgood - mbad
    only_mgood = mgood - good - bad - mbad
    only_mbad = mbad - good - bad - mgood
    
    return {
        "good": len(only_good),
        "bad": len(only_bad),
        "mgood": len(only_mgood),
        "mbad": len(only_mbad)
    }

We can look at the number of unique values to each label for different values of N on a line chart.

In [352]:
import plotly.express as px

intersection_sizes = []
for n in [1, 2, 3, 4, 5, 6]:

    good_ngrams = series_to_ngrams(df[df.label == "good"].comments, n).value_counts()
    bad_ngrams = series_to_ngrams(df[df.label == "bad"].comments, n).value_counts()
    mgood_ngrams = series_to_ngrams(df[df.label == "mgood"].comments, n).value_counts()
    mbad_ngrams = series_to_ngrams(df[df.label == "mbad"].comments, n).value_counts()
    intersection_sizes_n = get_exclusive_sizes(
        set(good_ngrams.index),
        set(bad_ngrams.index),
        set(mgood_ngrams.index),
        set(mbad_ngrams.index)
    )

    intersection_sizes_n["n"] = n

    intersection_sizes.append(intersection_sizes_n)


# prepare data
is_df = pd.DataFrame(intersection_sizes)
is_df = is_df.melt(id_vars="n", var_name="label", value_name="count")
is_df["count"] = is_df["count"].astype(int)
is_df["n"] = is_df["n"].astype(int)

# exclude good 
is_df = is_df[is_df.label != "good"]

# plot a line chart
px.line(is_df, 
    x="n", y="count",
    color="label",
    title=f"Unique # of {GEO} n-grams for each label", 
    labels={"n": "N-gram size", "count": "Unique # of n-grams"},
    markers=True
    )

We can see that the number of unique ngrams increases until about n=3, and then levels off. What are the top 10 3-grams for misleading reviews?

In [353]:
import numpy as np
from collections import Counter

# generate wordcloud for misleading reviews
subset = df[df.label.isin(["mbad", "mgood"])]

# get groups of N consecutive words
N = 3
n_grams_3 = series_to_ngrams(subset.comments, N).to_list()

print("Top 10 most common 3-grams in misleading reviews:")
Counter(n_grams_3).most_common(10)

Top 10 most common 3-grams in misleading reviews:


[('within_walking_distance', 8),
 ('would_not_recommend', 6),
 ('not_recommend_staying', 4),
 ('made_u_feel', 3),
 ('great_host_very', 3),
 ('great_place_stay', 3),
 ('solid_internet_connectivity', 3),
 ('walking_distance_lot', 2),
 ('stay_good_location', 2),
 ('good_location_clean', 2)]

Let's manually remove some of these that don't make sense

Now let's look at a word cloud of the 3-grams in misleading reviews.

**NOTE:** Also broken due to pandas library error. Not sure why it happened all of a sudden.

In [354]:
from wordcloud import WordCloud
from PIL import Image

ngrams_str = " ".join(n_grams_3)

# make house-shaped mask
mask = np.array(Image.open("../../house_shape.png"))
# fill transparent areas with white
mask[mask[:, :, 3] == 0] = [255, 255, 255, 255]

# generate wordcloud
wc = WordCloud(
    background_color="white", 
    max_words=1000, 
    width=mask.shape[0]*5,
    height=mask.shape[1]*5,
    mask = mask,
    contour_width=3,
    contour_color='steelblue',
    colormap="twilight_shifted"
).generate(ngrams_str)

wc

# show wordcloud
fig = px.imshow(wc, title=f"Most common {N}-grams in misleading reviews", width=700, height=700)
fig.update_xaxes(visible=False)
fig.update_yaxes(visible=False)
fig.show()

## Word Embeddings
Another way to look at the data is to use word embeddings. We can use the word2vec model from gensim to create word embeddings for each word in the dataset. We can then use t-SNE to reduce the dimensionality of the word embeddings to 2 dimensions, and plot them on a scatter plot. We can then color the points by the label of the review.

# Generate Features

With what we know, let's generate some features. I've provided a function for each type of feature, so we can mix and match.

In [358]:
from collections import Counter

def ngrams_features(features, n):
    """Add n-gram features to the features dataframe."""

    # one-hot encode ngrams
    df["ngrams"] = df.comments.apply(lambda x: set(get_consecutive_ngrams(x, 3)))

    # get ngrams that are common in misleading reviews
    misleading = df[df.label.isin(["mbad", "mgood"])]
    ng_set = set(series_to_ngrams(subset.comments, n).to_list())

    # one-hot encode ngrams
    ngram_features = {}
    for ngram in ng_set:
        ngram_features[ngram] = df.ngrams.apply(lambda ngrams: 1 if ngram in ngrams else 0)
    ngram_df = pd.DataFrame(ngram_features)
    features = pd.concat([features, ngram_df], axis=1)

    # Only keep ngram features that appear in at least 3 reviews
    min_num_reviews = 3
    num_reviews_ngram_is_in = features.apply(lambda x: x > 0).sum()
    ngram_features_to_drop= num_reviews_ngram_is_in[num_reviews_ngram_is_in < min_num_reviews].index

    features = features.drop(ngram_features_to_drop, axis=1)

    return features

# build features
features = pd.DataFrame()
features = ngrams_features(features, 3)

# add id and label
features["review_id"] = df.review_id
features["label"] = df.label

print(features.shape)
features.head()

(978, 108)


Unnamed: 0,drive_south_congress,great_place_great,even_gave_u,safe_walking_around,check_check_procedure,would_not_recommend,also_gave_u,make_feel_like,arrived_late_evening,u_feel_comfortable,...,walking_distance_great,host_very_responsive,enjoyed_stay_very,made_u_feel,very_comfortable_stay,available_answer_question,not_big_deal,not_bother_u,review_id,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,83097,good
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,133337,good
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,150928,good
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2706775,good
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8602878,good


In [359]:
if isinstance(GEO, list):
    geo_display_name = "_".join(GEO)
else:
    geo_display_name = GEO

features.to_csv(f"../../data/processed/features_{geo_display_name}.csv", index=False)