# Generate processed data
Here we will combine the subset of reviews with their labels and associated property description to create a processed dataset.

In [1]:
import pandas as pd

In [38]:
# Geography to process. The value is interpolated into every input and output
# file name used by this notebook (labels, filtered reviews, raw listings and
# the processed CSV), so changing it switches the whole run to another dataset.
GEO = "texas"

In [39]:
# Read the three inputs for the selected GEO.

# Labels assigned to the reviews.
LABELS = pd.read_csv(
    filepath_or_buffer=f"../data/labels/{GEO}_reviews_labels.csv",
)

# The filtered subset of reviews.
SUBSET = pd.read_csv(
    filepath_or_buffer=f"../data/filtered/{GEO}_reviews_filtered.csv",
)

# Raw listings. low_memory=False makes pandas read the file in one pass
# instead of chunk by chunk, which avoids mixed-dtype column inference.
LISTINGS = pd.read_csv(
    filepath_or_buffer=f"../data/raw/{GEO}_listings.csv",
    encoding="unicode_escape",
    low_memory=False,
)

Join datasets together.

In [40]:
# Step 1: pair every label row with its review row via the shared "id" column.
# This is an inner join (the pandas default), so rows without a match on
# either side are dropped. Any other column present in both frames gets a
# "_labels" / "_subset" suffix.
subset_with_labels = LABELS.merge(
    SUBSET,
    on="id",
    suffixes=("_labels", "_subset"),
)

# Step 2: attach the listing each review refers to (review "listing_id" ->
# listing "id"). Left-hand columns keep their names on a clash; clashing
# listing columns are suffixed with "_listings".
subset_labels_and_listing = subset_with_labels.merge(
    LISTINGS,
    left_on="listing_id",
    right_on="id",
    suffixes=("", "_listings"),
)

Keep only the columns we need for the processed dataset. Also, rename the `id` column to the more descriptive `review_id`, and clean the amenities column by parsing it into a list.

In [41]:
# Columns that make up the processed dataset; everything else produced by the
# joins is discarded.
cols_to_keep = [
    "id", "listing_id", "description", "comments",
    "sentiment", "label", "name", "amenities",
]

# Project onto those columns and, in the same step, rename "id" to
# "review_id" so it cannot be confused with "listing_id".
subset_labels_and_listing = (
    subset_labels_and_listing[cols_to_keep]
    .rename(columns={"id": "review_id"})
)

def parse_amenities(amenities):
  """Turn a raw amenities string into a clean list of amenity names.

  Accepts strings wrapped in either curly braces or square brackets, e.g.
  '{TV,"Cable TV",Wifi}' or '["TV", "Wifi"]'. Both the opening and the
  closing wrapper are removed, double quotes are dropped, and the remainder
  is split on commas.

  Args:
    amenities: The raw cell value. Usually a string; may also be an already
      parsed list/tuple (e.g. when the cell is re-run) or a missing value
      such as NaN/None.

  Returns:
    A list of amenity names with surrounding whitespace removed and empty
    entries dropped. Missing or empty input yields an empty list.
  """
  # Already parsed (e.g. the cell was executed twice): return a list copy
  # instead of failing on the missing str methods.
  if isinstance(amenities, (list, tuple)):
    return list(amenities)

  # Missing values (NaN is a float, None is NoneType) carry no amenities.
  if not isinstance(amenities, str):
    return []

  # Strip the matching wrapper characters from BOTH ends; removing only one
  # of each pair would leave a stray bracket glued to an element.
  inner = amenities.strip().strip("{}[]").replace('"', "")

  # Trim each item (separators may be ", ") and skip blanks so that an empty
  # wrapper such as "{}" produces [] rather than [""].
  return [item.strip() for item in inner.split(",") if item.strip()]

# Replace every raw amenities value with its parsed list form.
subset_labels_and_listing["amenities"] = (
    subset_labels_and_listing["amenities"].apply(parse_amenities)
)

# Show the resulting frame as the cell output.
subset_labels_and_listing

Unnamed: 0,review_id,listing_id,description,comments,sentiment,label,name,amenities
0,83097,5456,Fabulous location for walking to Convention Ce...,"Sylvia was very nice, informal, and she was re...",4.333333,no,"Walk to 6th, Rainey St and Convention Ctr","[TV, Wifi, Air conditioning, Kitchen, Pets liv..."
1,133337,5456,Fabulous location for walking to Convention Ce...,Sylvia picked me up from the airport and gave ...,4.000000,no,"Walk to 6th, Rainey St and Convention Ctr","[TV, Wifi, Air conditioning, Kitchen, Pets liv..."
2,150928,5456,Fabulous location for walking to Convention Ce...,We had a lovely time in Austin and enjoyed the...,3.000000,no,"Walk to 6th, Rainey St and Convention Ctr","[TV, Wifi, Air conditioning, Kitchen, Pets liv..."
3,2706775,5456,Fabulous location for walking to Convention Ce...,Sylvia was an excellent host. Stayed in touch ...,4.500000,no,"Walk to 6th, Rainey St and Convention Ctr","[TV, Wifi, Air conditioning, Kitchen, Pets liv..."
4,8602878,5456,Fabulous location for walking to Convention Ce...,The place was a cute little self contained cot...,4.000000,no,"Walk to 6th, Rainey St and Convention Ctr","[TV, Wifi, Air conditioning, Kitchen, Pets liv..."
...,...,...,...,...,...,...,...,...
628,243926779,785270,"This cozy, sun-filled apartment in a quiet Cen...","Location is really nice, right in the Muller a...",3.250000,no,"The Darlington Arms, Central Austin","[TV, Cable TV, Internet, Wifi, Air conditionin..."
629,138631555,785963,Come stay in this cozy cottage featuring a ful...,Katy and Rick were absolutely lovely! I arrive...,4.500000,yes,Bouldin Cottage,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
630,201595726,785963,Come stay in this cozy cottage featuring a ful...,Not in the good part of Bouldin. Filthy. After...,1.000000,yes,Bouldin Cottage,"[TV, Cable TV, Internet, Wifi, Air conditionin..."
631,27786173,813709,This home is a full time vacation rental so yo...,Pete was a great host. While we didn't meet hi...,4.000000,no,Cozy Bright Hyde Park Guest House,"[TV, Cable TV, Internet, Wifi, Air conditionin..."


In [42]:
# Persist the processed dataset for the selected GEO. The row index is left
# out of the file. Note: the amenities column holds Python lists at this
# point, so each is written to the CSV as its string representation.
subset_labels_and_listing.to_csv(
    path_or_buf=f"../data/processed/{GEO}_processed.csv",
    index=False,
)