# Query Six B Preprocessing

####  Are there any other listings by the same host that can be suggested? Display listing’s name, url, description, host’s name, reviewer name, whether previously booked, availability days, minimum and maximum nights booking allowed. 

#### Goal: Clean the data files in preparation for loading into Cassandra on Azure

In [32]:
#Imports
import numpy as np
import pandas as pd

In [33]:
# Read the combined reviews export from disk and preview the first rows.
reviews_path = "q6_comb_reviews.csv"
reviews = pd.read_csv(reviews_path)
reviews.head()

Unnamed: 0.1,Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month
0,0,12899,24767,69327,Stuart,1
1,1,12899,29230,72846,John,3
2,2,12899,29806,84196,Lois,3
3,3,12899,32572,89114,Troy,3
4,4,12899,32862,100318,Cathy,4


In [34]:
# Outline of the preprocessing plan; the bare string is echoed as this cell's output.
# NOTE: the implementation in this notebook keeps pairs with review_counts >= 3
# (not "> 3") and names the renamed column "thrice_rev_listing_id".
'''Steps:
    1. Load in the combined reviews table and filter it to only have listing_id, reviewer_id, and count(reviews) > 3
    2. Change the name of listing_id to "thrice_listing_id"
    3. Add host_id column to reviews table by joining on listings file
    4. Left join the listings table to reviews again, joining by host_id and pulling in the listing info
    5. Join the availability days file to the reviews file above
    6. Create logic to test if the reviewer has previously stayed at the booking and add column'''

'Steps:\n    1. Load in the combined reviews table and filter it to only have listing_id, reviewer_id, and count(reviews) > 3\n    2. Change the name of listing_id to "thrice_listing_id"\n    3. Add host_id column to reviews table by joining on listings file\n    4. Left join the listings table to reviews again, joining by host_id and pulling in the listing info\n    5. Join the availability days file to the reviews file above\n    6. Create logic to test if the reviewer has previously stayed at the booking and add column'

In [35]:
# Count reviews per (listing, reviewer) pair and keep the pairs with at least 3 reviews.
# NOTE: reviewer_name is part of the grouping key, so rows with a missing name are
# excluded by groupby's default dropna behaviour.
reviewer_keys = ['listing_id', 'reviewer_id', 'reviewer_name']
grouped_listings = reviews.groupby(reviewer_keys)['id'].count().reset_index(name="review_counts")
has_three_or_more = grouped_listings['review_counts'] >= 3
thrice_reviewed = grouped_listings[has_three_or_more]
thrice_reviewed.head()

Unnamed: 0,listing_id,reviewer_id,reviewer_name,review_counts
231,3021,82971860,Jason,3
1635,9140,54090633,Shijuana,3
1969,12899,791723,Barbara,3
1987,12899,1227853,Beth,5
2065,12899,3132153,Dinah,3


In [36]:
# Rename listing_id to thrice_rev_listing_id so it stays distinct from the
# listing ids that are joined in from the listings table.
thrice_reviewed = thrice_reviewed.rename(columns={'listing_id': 'thrice_rev_listing_id'})
thrice_reviewed

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts
231,3021,82971860,Jason,3
1635,9140,54090633,Shijuana,3
1969,12899,791723,Barbara,3
1987,12899,1227853,Beth,5
2065,12899,3132153,Dinah,3
...,...,...,...,...
2511389,789101243357990362,485134984,Hernan,6
2511777,792942008294249400,485134984,Hernan,4
2512605,798843846505276637,454135038,Danielle,3
2512897,802203851341181214,20189866,Moe,3


In [37]:
# Read the four per-city listings files and stack them into one DataFrame.
la_listing, pdx_listing, sd_listing, salem_listing = (
    pd.read_csv(csv_name)
    for csv_name in ('la_listings.csv', 'PDX_listings.csv', 'sd_listings.csv', 'salem_listings.csv')
)

df_list = [la_listing, pdx_listing, sd_listing, salem_listing]
all_listings = pd.concat(df_list)

In [38]:
# Look up the host of each thrice-reviewed listing; the duplicate 'id' join key
# brought in from all_listings is dropped afterwards.
host_lookup = all_listings[['id', 'host_id']]
thrice_reviewed_host = thrice_reviewed.merge(
    host_lookup,
    how='left',
    left_on='thrice_rev_listing_id',
    right_on='id',
).drop(columns='id')
thrice_reviewed_host.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id
0,3021,82971860,Jason,3,3415
1,9140,54090633,Shijuana,3,28350
2,12899,791723,Barbara,3,49682
3,12899,1227853,Beth,5,49682
4,12899,3132153,Dinah,3,49682


In [39]:
# Attach every listing owned by the same host (one output row per reviewer x host listing).
listing_fields = [
    'id',
    'name',
    'listing_url',
    'description',
    'host_name',
    'host_id',
    'minimum_nights',
    'maximum_nights',
]
thrice_listing = thrice_reviewed_host.merge(all_listings[listing_fields], how='left', on='host_id')
thrice_listing.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id,id,name,listing_url,description,host_name,minimum_nights,maximum_nights
0,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730
1,3021,82971860,Jason,3,3415,38365,Hollywood Hills Private Br Suite,https://www.airbnb.com/rooms/38365,Laurel Canyon is a very musically historic seg...,Nataraj,30,730
2,3021,82971860,Jason,3,3415,8224450,Zen Hollywood Hills Unit,https://www.airbnb.com/rooms/8224450,Beautiful private apt wing guest house of larg...,Nataraj,30,730
3,9140,54090633,Shijuana,3,28350,9140,City Place Lofts,https://www.airbnb.com/rooms/9140,Shared space - private room with its own entra...,Wendell,31,1125
4,12899,791723,Barbara,3,49682,12899,"Alberta Arts 2 bedroom suite, charming 1906 house",https://www.airbnb.com/rooms/12899,"Please know that we are fully vaccinated, will...",Ali And David,3,730


### Loading Availability Data

In [40]:
# Load the availability data (the columns used below are listing_id, date and
# true_availability) and preview it.
avail_path = 'final_avail_data.csv'
avail_table = pd.read_csv(avail_path)
avail_table.head()

Unnamed: 0.1,Unnamed: 0,listing_id,date,available,minimum_nights,days_to_beg_isl,days_to_end_isl,true_availability
0,0,6,2023-03-25,f,10.0,0.0,0.0,False
1,1,6,2023-03-26,f,10.0,0.0,0.0,False
2,2,6,2023-03-27,f,10.0,0.0,0.0,False
3,3,6,2023-03-28,f,10.0,0.0,0.0,False
4,4,6,2023-03-29,f,10.0,0.0,0.0,False


In [41]:
# Keep only rows where true_availability is True, then confirm no other value remains.
is_truly_available = avail_table['true_availability'] == True
avail_table = avail_table[is_truly_available]
avail_table['true_availability'].unique()

array([ True])

In [42]:
# Collapse each listing's available dates into one comma-separated avail_days string.
avail_table = avail_table.rename(columns={"date": "avail_days"})
days_by_listing = avail_table.groupby(['listing_id', 'true_availability'])['avail_days']
avail_table = days_by_listing.apply(lambda days: ','.join(days)).reset_index()
avail_table.head()

Unnamed: 0,listing_id,true_availability,avail_days
0,109,True,"2023-04-24,2023-04-25,2023-04-26,2023-04-27,20..."
1,2708,True,"2023-06-30,2023-07-01,2023-07-02,2023-07-03,20..."
2,2732,True,"2023-03-11,2023-03-12,2023-03-13,2023-03-14,20..."
3,3021,True,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20..."
4,5728,True,"2023-05-08,2023-05-09,2023-05-10,2023-05-11,20..."


In [43]:
# Attach avail_days to each candidate listing, drop the redundant join key from the
# availability side, and give the listing columns more descriptive names.
avail_lookup = avail_table[['avail_days', 'listing_id']]
thrice_listing_avail = (
    thrice_listing
    .merge(avail_lookup, how="left", left_on="id", right_on='listing_id')
    .drop(columns='listing_id')
    .rename(columns={'id': 'listing_id', 'name': 'listing_name', 'description': 'listing_description'})
)
thrice_listing_avail.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id,listing_id,listing_name,listing_url,listing_description,host_name,minimum_nights,maximum_nights,avail_days
0,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20..."
1,3021,82971860,Jason,3,3415,38365,Hollywood Hills Private Br Suite,https://www.airbnb.com/rooms/38365,Laurel Canyon is a very musically historic seg...,Nataraj,30,730,"2023-04-24,2023-04-25,2023-04-26,2023-04-27,20..."
2,3021,82971860,Jason,3,3415,8224450,Zen Hollywood Hills Unit,https://www.airbnb.com/rooms/8224450,Beautiful private apt wing guest house of larg...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20..."
3,9140,54090633,Shijuana,3,28350,9140,City Place Lofts,https://www.airbnb.com/rooms/9140,Shared space - private room with its own entra...,Wendell,31,1125,
4,12899,791723,Barbara,3,49682,12899,"Alberta Arts 2 bedroom suite, charming 1906 house",https://www.airbnb.com/rooms/12899,"Please know that we are fully vaccinated, will...",Ali And David,3,730,


In [44]:
# Report null counts per column, replace missing avail_days with 0, and report again.
# NOTE: after the fill, avail_days holds date strings for some rows and the integer 0 for others.
print("Checking for nulls in avail days:\n", thrice_listing_avail.isnull().sum())
filled_avail_days = thrice_listing_avail['avail_days'].fillna(0)
thrice_listing_avail['avail_days'] = filled_avail_days
print("Checking for nulls in avail days after fix:\n", thrice_listing_avail.isnull().sum())

Checking for nulls in avail days:
 thrice_rev_listing_id       0
reviewer_id                 0
reviewer_name               0
review_counts               0
host_id                     0
listing_id                  0
listing_name                0
listing_url                 0
listing_description       206
host_name                   0
minimum_nights              0
maximum_nights              0
avail_days               2039
dtype: int64
Checking for nulls in avail days after fix:
 thrice_rev_listing_id      0
reviewer_id                0
reviewer_name              0
review_counts              0
host_id                    0
listing_id                 0
listing_name               0
listing_url                0
listing_description      206
host_name                  0
minimum_nights             0
maximum_nights             0
avail_days                 0
dtype: int64


### Create logic to test if the reviewer has previously stayed at the booking and add column

In [45]:
# Build a string key for each candidate row: listing_id immediately followed by reviewer_id.
# NOTE: there is no separator between the two ids, so different (listing, reviewer)
# pairs can produce the same key.
candidate_listing_str = thrice_listing_avail['listing_id'].astype(str)
candidate_reviewer_str = thrice_listing_avail['reviewer_id'].astype(str)
thrice_listing_avail['listing_reviewer'] = candidate_listing_str + candidate_reviewer_str
thrice_listing_avail.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id,listing_id,listing_name,listing_url,listing_description,host_name,minimum_nights,maximum_nights,avail_days,listing_reviewer
0,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860
1,3021,82971860,Jason,3,3415,38365,Hollywood Hills Private Br Suite,https://www.airbnb.com/rooms/38365,Laurel Canyon is a very musically historic seg...,Nataraj,30,730,"2023-04-24,2023-04-25,2023-04-26,2023-04-27,20...",3836582971860
2,3021,82971860,Jason,3,3415,8224450,Zen Hollywood Hills Unit,https://www.airbnb.com/rooms/8224450,Beautiful private apt wing guest house of larg...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",822445082971860
3,9140,54090633,Shijuana,3,28350,9140,City Place Lofts,https://www.airbnb.com/rooms/9140,Shared space - private room with its own entra...,Wendell,31,1125,0,914054090633
4,12899,791723,Barbara,3,49682,12899,"Alberta Arts 2 bedroom suite, charming 1906 house",https://www.airbnb.com/rooms/12899,"Please know that we are fully vaccinated, will...",Ali And David,3,730,0,12899791723


In [46]:
# Build the same listing_id + reviewer_id string key on the reviews table.
# NOTE: there is no separator between the two ids, so different (listing, reviewer)
# pairs can produce the same key.
review_listing_str = reviews['listing_id'].astype(str)
review_reviewer_str = reviews['reviewer_id'].astype(str)
reviews['listing_reviewer'] = review_listing_str + review_reviewer_str
reviews.head()

Unnamed: 0.1,Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month,listing_reviewer
0,0,12899,24767,69327,Stuart,1,1289969327
1,1,12899,29230,72846,John,3,1289972846
2,2,12899,29806,84196,Lois,3,1289984196
3,3,12899,32572,89114,Troy,3,1289989114
4,4,12899,32862,100318,Cathy,4,12899100318


In [47]:
# Test whether each (candidate listing, reviewer) pair also appears in the reviews table.
#
# The join uses the real (listing_id, reviewer_id) columns rather than the
# concatenated 'listing_reviewer' string: that string has no separator, so distinct
# pairs can collide (listing 1289 + reviewer 969327 and listing 12899 + reviewer 69327
# both give "1289969327") and would wrongly flag a listing as previously booked.
#
# The output keeps the same column layout as before so later cells are unaffected:
#   listing_id_x - the candidate listing's id (from thrice_listing_avail)
#   listing_id_y - the matching listing id from reviews, NaN when the reviewer has
#                  no review for that listing
# Rows are still repeated once per matching review; duplicates are removed in a later cell.
prior_reviews = reviews[['listing_id', 'reviewer_id']].copy()
prior_reviews['listing_id_y'] = prior_reviews['listing_id']
booking_reminder = pd.merge(
    thrice_listing_avail,
    prior_reviews,
    how="left",
    on=['listing_id', 'reviewer_id'],
).rename(columns={'listing_id': 'listing_id_x'})
booking_reminder.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id,listing_id_x,listing_name,listing_url,listing_description,host_name,minimum_nights,maximum_nights,avail_days,listing_reviewer,listing_id_y
0,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860,3021.0
1,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860,3021.0
2,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860,3021.0
3,3021,82971860,Jason,3,3415,38365,Hollywood Hills Private Br Suite,https://www.airbnb.com/rooms/38365,Laurel Canyon is a very musically historic seg...,Nataraj,30,730,"2023-04-24,2023-04-25,2023-04-26,2023-04-27,20...",3836582971860,
4,3021,82971860,Jason,3,3415,8224450,Zen Hollywood Hills Unit,https://www.airbnb.com/rooms/8224450,Beautiful private apt wing guest house of larg...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",822445082971860,


In [48]:
# Rename the two listing id columns produced by the merge, then turn the reviews-side
# column into a flag: 1 when a matching review exists, 0 when it is null.
booking_reminder = booking_reminder.rename(
    columns={'listing_id_y': 'prev_booked_by_reviewer', 'listing_id_x': 'reccommended_listing_id'}
)
has_prior_review = booking_reminder['prev_booked_by_reviewer'].notnull()
booking_reminder['prev_booked_by_reviewer'] = np.where(has_prior_review, 1, 0)
booking_reminder.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id,reccommended_listing_id,listing_name,listing_url,listing_description,host_name,minimum_nights,maximum_nights,avail_days,listing_reviewer,prev_booked_by_reviewer
0,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860,1
1,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860,1
2,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",302182971860,1
3,3021,82971860,Jason,3,3415,38365,Hollywood Hills Private Br Suite,https://www.airbnb.com/rooms/38365,Laurel Canyon is a very musically historic seg...,Nataraj,30,730,"2023-04-24,2023-04-25,2023-04-26,2023-04-27,20...",3836582971860,0
4,3021,82971860,Jason,3,3415,8224450,Zen Hollywood Hills Unit,https://www.airbnb.com/rooms/8224450,Beautiful private apt wing guest house of larg...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",822445082971860,0


In [49]:
# Drop fully duplicated rows (the merge repeats a row once per matching review),
# then remove the helper key column that is no longer needed.
deduplicated_reminders = booking_reminder.drop_duplicates()
booking_reminder_final = deduplicated_reminders.drop(columns='listing_reviewer')
booking_reminder_final.head()

Unnamed: 0,thrice_rev_listing_id,reviewer_id,reviewer_name,review_counts,host_id,reccommended_listing_id,listing_name,listing_url,listing_description,host_name,minimum_nights,maximum_nights,avail_days,prev_booked_by_reviewer
0,3021,82971860,Jason,3,3415,3021,Large Furnished Bedroom in Hollywood Hills House,https://www.airbnb.com/rooms/3021,Great style and comfort in the spacious 4000 s...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",1
3,3021,82971860,Jason,3,3415,38365,Hollywood Hills Private Br Suite,https://www.airbnb.com/rooms/38365,Laurel Canyon is a very musically historic seg...,Nataraj,30,730,"2023-04-24,2023-04-25,2023-04-26,2023-04-27,20...",0
4,3021,82971860,Jason,3,3415,8224450,Zen Hollywood Hills Unit,https://www.airbnb.com/rooms/8224450,Beautiful private apt wing guest house of larg...,Nataraj,30,730,"2023-03-08,2023-03-09,2023-03-10,2023-03-11,20...",0
5,9140,54090633,Shijuana,3,28350,9140,City Place Lofts,https://www.airbnb.com/rooms/9140,Shared space - private room with its own entra...,Wendell,31,1125,0,1
8,12899,791723,Barbara,3,49682,12899,"Alberta Arts 2 bedroom suite, charming 1906 house",https://www.airbnb.com/rooms/12899,"Please know that we are fully vaccinated, will...",Ali And David,3,730,0,1


In [50]:
# Write the final recommendations table to CSV.
# NOTE: to_csv is called with its defaults, so the DataFrame index is written as the
# first, unnamed column.
output_path = "q6_b_reccommendations.csv"
booking_reminder_final.to_csv(output_path)