# Query Six A Preprocessing

#### Are there any listings that a reviewer has reviewed more than three times and that are also available in the same month in which that reviewer previously reviewed them? (Check against all the months in which the previous reviews were posted; if any month matches, the listing qualifies.)

#### Goal: Clean the data files in preparation for loading into Cassandra on Azure

In [206]:
#Imports
import numpy as np
import pandas as pd

In [207]:
# Read each city's review export into its own DataFrame.
# The files are read in the order the target names are listed.
pdx_reviews, la_reviews, sd_reviews, salem_reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        'PDX_reviews.csv',
        'la_reviews.csv',
        'sd_reviews.csv',
        'salem_reviews.csv',
    )
)

In [208]:
# Narrow every city's reviews to the fields this query uses:
# listing_id, id, date, reviewer_id, plus reviewer_name (needed for 6b).
review_cols = ["listing_id", "id", "date", "reviewer_id", "reviewer_name"]
pdx_select = pdx_reviews[review_cols]
la_select = la_reviews[review_cols]
sd_select = sd_reviews[review_cols]
salem_select = salem_reviews[review_cols]

In [209]:
# Stack the per-city selections into one reviews table.
# pd.concat is called with defaults, so each frame keeps its own row labels.
df_list = [pdx_select, la_select, sd_select, salem_select]
all_reviews = pd.concat(df_list)

# Sanity check: the combined row count must equal the sum of the parts
indiv_lengths = sum(len(city_df) for city_df in df_list)
if len(all_reviews) != indiv_lengths:
    print("Concat did not merge right")
else:
    print(f"Concat worked correctly. New len: {len(all_reviews)}")

Concat worked correctly. New len: 2573162


In [210]:
# Parse the date strings once, store the parsed values back into 'date',
# and derive the calendar month number of each review from them.
parsed_dates = pd.to_datetime(all_reviews['date'])
all_reviews['date'] = parsed_dates
all_reviews['review_month'] = parsed_dates.dt.month

In [211]:
# Build the final reviews table: every column of all_reviews except 'date'
final_reviews = all_reviews.drop(columns='date')
final_reviews

Unnamed: 0,listing_id,id,reviewer_id,reviewer_name,review_month
0,12899,24767,69327,Stuart,1
1,12899,29230,72846,John,3
2,12899,29806,84196,Lois,3
3,12899,32572,89114,Troy,3
4,12899,32862,100318,Cathy,4
...,...,...,...,...,...
14881,806288048006012567,840008042809282936,219002106,Cheryl,3
14882,806288048006012567,840728285056856719,52054229,Sarah,3
14883,808194120211324072,822551660654988407,82164517,Jody,2
14884,810859442087199945,818218402306004794,415177874,Alanmichael,2


In [212]:
# NOTE: this cell is disabled. The to_csv call sits inside a string literal,
# so no file is written; running the cell only evaluates the string, which
# the notebook then displays as the cell's value.
'''#Saving the final reviews table for the creation of the dummy availability table
final_reviews.to_csv("q6_comb_reviews.csv")'''

'#Saving the final reviews table for the creation of the dummy availability table\nfinal_reviews.to_csv("q6_comb_reviews.csv")'

### Cleaning the Availability Table
Steps:
    1. Load table
    2. Change date to a datetime col
    3. Create an avail_month col
    4. Remove all columns which aren't listing_id, avail_month, and true_availability
    5. Combine the avail_month rows into a single col per listing id with comma separated values
    6. Remove the rows where true_availability = False
    7. Save dataset

In [213]:
#Loading the availability table
# Reading the whole file ran out of memory (MemoryError while allocating an
# int64 column of ~22M rows). Only three columns are used by the cleaning
# steps that follow -- 'date' is parsed into avail_month, and 'listing_id' /
# 'true_availability' are kept -- so read just those to cut the footprint.
avail_table = pd.read_csv(
    'final_avail_data.csv',
    usecols=['listing_id', 'date', 'true_availability'],
)
avail_table.head()

MemoryError: Unable to allocate 168. MiB for an array with shape (21966856,) and data type int64

In [None]:
# Parse 'date' into datetimes, then store each row's month number as a
# string in 'avail_month' (strings so the months can later be comma-joined).
avail_table['date'] = pd.to_datetime(avail_table['date'])
avail_table['avail_month'] = avail_table['date'].dt.month.astype(str)

In [None]:
# Keep just the three fields needed downstream and preview the result
avail_cols = ['listing_id', 'avail_month', 'true_availability']
select_avail = avail_table[avail_cols]
select_avail.head()

In [None]:
# Collapse repeated (listing_id, avail_month, true_availability) rows and
# report the row count before and after.
sel_avail_tf = select_avail.drop_duplicates()
rows_before, rows_after = len(select_avail), len(sel_avail_tf)
print(f"The original length of the table was {rows_before} The new length of the table is {rows_after}")

In [None]:
# Collapse each (listing_id, true_availability) pair into one row whose
# avail_month is a comma-separated string of that pair's months.

# Spot-check one listing before the collapse
sample_listing = 29967
print(sel_avail_tf[sel_avail_tf['listing_id'] == sample_listing])

# Join the month strings within each group, then flatten the group keys
# back into ordinary columns
months_by_group = sel_avail_tf.groupby(['listing_id', 'true_availability'])['avail_month']
comb_avail = months_by_group.apply(','.join).reset_index()

# Same listing after the collapse
print(comb_avail[comb_avail['listing_id'] == sample_listing])

In [None]:
# Keep only the rows whose true_availability equals True
available_mask = comb_avail['true_availability'].eq(True)
is_avail = comb_avail[available_mask]

In [None]:
# Write the available-listings table to disk with to_csv defaults
# (so the row index is written as the first column)
available_csv = 'available_listings.csv'
is_avail.to_csv(available_csv)

### Joining the availability and review tables

In [None]:
# Work from an independent copy of the available-listings table.
# Besides listing_id (the key used for the join) it carries
# true_availability and avail_month; avail_month is a single string listing
# every available month for the property, e.g. '1,2,3' for Jan, Feb, Mar.
avail_table = is_avail.copy(deep=True)

In [None]:
# Left-join availability onto the reviews by listing_id: every review row is
# kept, and reviews with no matching availability row get NaN in the joined
# columns.
review_avail = final_reviews.merge(avail_table, how="left", on="listing_id")
review_avail.head()

In [None]:
# Collect the listing_id of every review row that got no avail_month from
# the join (one entry per review row, so ids can repeat)
missing_avail = review_avail['avail_month'].isna()
nan_listing_list = review_avail.loc[missing_avail, 'listing_id'].values #210227
print(f"There are {len(nan_listing_list)} listing ids which have null avail_month")

# Look those listings up in the pre-filter table to show which
# true_availability values they carry
nan_avail_df = comb_avail[comb_avail['listing_id'].isin(nan_listing_list)]
print(
    "They only have the value",
    nan_avail_df['true_availability'].unique(),
    "for availability and were removed prior to the join with the reviews table",
)

In [None]:
# Replace the NaNs left by the join: unmatched reviews are marked as not
# available (False) and get the placeholder 0 in avail_month.
for column, placeholder in (('true_availability', False), ('avail_month', 0)):
    review_avail[column] = review_avail[column].fillna(placeholder)

In [None]:
#Add the availability in review month indicator column
def _available_in_review_month(row):
    """Return True when the row's review_month is one of its avail_month entries.

    avail_month is either a comma-separated string of month numbers
    (e.g. '1,10,12') or the placeholder 0 for listings with no availability.
    The string is split on commas and compared entry by entry: a plain
    substring test would wrongly report month 1 as available for '10,11,12'
    and month 2 as available for '12'.
    """
    available_months = str(row.avail_month).split(',')
    return str(row.review_month) in available_months


review_avail['avail_ind'] = review_avail.apply(_available_in_review_month, axis = 1)
review_avail

In [None]:
#Updates avail_ind from boolean to 1/0
# Convert in a single cast (True -> 1, False -> 0). Writing the integers into
# the boolean column through .loc is an incompatible-dtype assignment: pandas
# warns about it from 2.1 onward, and it leaves an object column instead of
# an integer one.
review_avail['avail_ind'] = review_avail['avail_ind'].astype(int)
review_avail.loc[review_avail['listing_id']==29967]

In [None]:
# Check 1: rows with true_availability == False should only ever show
# avail_ind == 0 in the printed unique values
false_avail = review_avail[review_avail['true_availability'] == False]
print(false_avail['avail_ind'].unique())

# Check 2: the same expectation for rows holding the avail_month placeholder 0
false_avail = review_avail[review_avail['avail_month'] == 0]
print(false_avail['avail_ind'].unique())

In [None]:
# Write the joined review/availability table to disk with to_csv defaults
# (so the row index is written as the first column)
output_csv = 'q6a_data.csv'
review_avail.to_csv(output_csv)