# Query Six A Preprocessing

#### Are there any listings that a reviewer has reviewed more than thrice that is also available in the same month as was reviewed by them previously? (check against all the months that the previous reviews were posted on, if any match then it qualifies)

#### Goal: Clean the data files in prepartion for loading into Cassandra on Azure

In [3]:
#Imports
import numpy as np
import pandas as pd

In [4]:
#Uploading the data
pdx_reviews = pd.read_csv('PDX_reviews.csv')
la_reviews = pd.read_csv('la_reviews.csv')
sd_reviews = pd.read_csv('sd_reviews.csv')
salem_reviews = pd.read_csv('salem_reviews.csv')

In [5]:
salem_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,199568,486392,2011-08-29,66482,Victoria & Rob,"The Garden Apartment is lovely. It is open, ai..."
1,199568,491741,2011-08-31,66482,Victoria & Rob,We were so pleased with the apartment and our ...
2,199568,608314,2011-10-08,201283,Jenny,I stayed at Sara's apt.for two weeks while wo...
3,199568,615016,2011-10-10,66482,Victoria & Rob,My aunt and I enjoyed our stay in the lovely g...
4,199568,644195,2011-10-18,933601,Linda And Jerry,We truly enjoyed a restful time and environmen...


In [6]:
#Keeping only the columns we need (listing_id, id, date, reviewer_id, reviewer_name (for 6b))
pdx_select = pdx_reviews[["listing_id", "id","date","reviewer_id", "reviewer_name"]]
la_select = la_reviews[["listing_id", "id","date","reviewer_id", "reviewer_name"]]
sd_select = sd_reviews[["listing_id", "id","date","reviewer_id", "reviewer_name"]]
salem_select = salem_reviews[["listing_id", "id","date","reviewer_id", "reviewer_name"]]

In [7]:
#Combining all reviews for all cities
df_list = [pdx_select, la_select, sd_select, salem_select]
all_reviews = pd.concat(df_list)

#Testing concat
indiv_lengths = len(pdx_select) + len(la_select) + len(sd_select) + len(salem_select)
if len(all_reviews) == indiv_lengths:
    print("Concat worked correctly. New len: "+str(len(all_reviews)))
else:
    print("Concat did not merge right")

Concat worked correctly. New len: 2573162


In [8]:
#Converting date to datetime and adding month column
all_reviews['date'] = pd.to_datetime(all_reviews['date'])
all_reviews['review_month'] = all_reviews['date'].dt.month

In [9]:
#Removing date column
final_reviews = all_reviews.loc[:, all_reviews.columns!='date']
final_reviews

Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month
0,12899,24767,69327,Stuart,1
1,12899,29230,72846,John,3
2,12899,29806,84196,Lois,3
3,12899,32572,89114,Troy,3
4,12899,32862,100318,Cathy,4
...,...,...,...,...,...
14881,806288048006012567,840008042809282936,219002106,Cheryl,3
14882,806288048006012567,840728285056856719,52054229,Sarah,3
14883,808194120211324072,822551660654988407,82164517,Jody,2
14884,810859442087199945,818218402306004794,415177874,Alanmichael,2


In [10]:
#Saving the final reviews table for the creation of the dummy availability table
final_reviews.to_csv("q6_comb_reviews.csv")

##  LOAD KEVIN TABLE OF AVAILABILITY INTO THE BELOW BLOCK INSTEAD OF DUMMY TABLE
#### Cleaning may be necessary to get Kevin's table into my preferred format

In [11]:
#Loading in table with 2 columns - listing_id and avail_month
#listing_id is the primary key, avail month is a column containing a string with all the months of availability for the property 
#e.g., '1,2,3' for Jan, Feb, Mar. Kevin wrote the code to execute the availability calculation. 
avail_table = pd.read_csv('dummy_availability_table.csv')
avail_table.head()

Unnamed: 0.1,Unnamed: 0,listing_id,avail_months
0,0,12899,123
1,1,789798,123
2,2,801318,123
3,3,808393,123
4,4,838961,123


In [12]:
#Match the availability table to the review information table
review_avail = pd.merge(final_reviews, avail_table, how = "left", on = "listing_id")
review_avail = review_avail.drop('Unnamed: 0', axis = 1)
review_avail.head()

Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month,avail_months
0,12899,24767,69327,Stuart,1,123
1,12899,29230,72846,John,3,123
2,12899,29806,84196,Lois,3,123
3,12899,32572,89114,Troy,3,123
4,12899,32862,100318,Cathy,4,123


In [23]:
#Checking that rows are updated correctly
'''NEED TO WRITE SOMETHING THAT WORKS WELL WITH THE ACTUAL DATA
print(review_avail.loc[review_avail['listing_id'] == 3021].head())
print(review_avail.loc[review_avail['listing_id'] == 9140].head())'''

        listing_id        id  reviewer_id  review_month avail_months
424896        3021   7010448      7008708             9        4,5,6
424911        3021  11271580     13263115             3        4,5,6
424912        3021  12432298      8917495             5        4,5,6
424913        3021  50696133         7338            10        4,5,6
424914        3021  85297917     82124588             7        4,5,6
        listing_id       id  reviewer_id  review_month avail_months
429214        9140  3642309      3151773             2        7,8,9
429276        9140  3676763      2852774             3        7,8,9
429277        9140  3871746      4933235             3        7,8,9
429278        9140  3883911      5539941             3        7,8,9
429279        9140  3915002       300517             3        7,8,9


In [13]:
#Add the availability in review month indicator column
#assumes that the avail_months column is a string and casts review_month appropriately
review_avail['avail_ind'] = review_avail.apply(lambda review_avail: str(review_avail.review_month) in review_avail.avail_months, axis = 1)
review_avail

Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month,avail_months,avail_ind
0,12899,24767,69327,Stuart,1,123,True
1,12899,29230,72846,John,3,123,True
2,12899,29806,84196,Lois,3,123,True
3,12899,32572,89114,Troy,3,123,True
4,12899,32862,100318,Cathy,4,123,False
...,...,...,...,...,...,...,...
2573157,806288048006012567,840008042809282936,219002106,Cheryl,3,123,True
2573158,806288048006012567,840728285056856719,52054229,Sarah,3,123,True
2573159,808194120211324072,822551660654988407,82164517,Jody,2,123,True
2573160,810859442087199945,818218402306004794,415177874,Alanmichael,2,123,True


In [14]:
#Updates avail_ind from boolean to 1/0
review_avail.loc[review_avail['avail_ind'] == True, 'avail_ind'] = 1
review_avail.loc[review_avail['avail_ind'] == False, 'avail_ind'] = 0
review_avail.head(15)

Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month,avail_months,avail_ind
0,12899,24767,69327,Stuart,1,123,1
1,12899,29230,72846,John,3,123,1
2,12899,29806,84196,Lois,3,123,1
3,12899,32572,89114,Troy,3,123,1
4,12899,32862,100318,Cathy,4,123,0
5,12899,40792,106706,Johanna,5,123,0
6,12899,43566,122590,Madelyn,5,123,0
7,12899,43920,124878,Adam,5,123,0
8,12899,46120,115296,Craig,5,123,0
9,12899,48434,128686,Leah,5,123,0
