In [2]:
import pandas as pd
import numpy as np
import torch
import sklearn
import json

#### Read the files

In [3]:
listing_data = pd.read_csv("/content/drive/MyDrive/master/ML4G-Project/data/preprocessed/listings.csv", encoding = "windows-1252", decimal=',',sep = ";")
calendar_data = pd.read_csv("/content/drive/MyDrive/master/ML4G-Project/data/preprocessed/calendar.csv", encoding = "windows-1252",decimal=',', sep = ";")

## Prepare dynamic data

In [4]:
calendar_data.head()

Unnamed: 0,Column1,listing_id,date,available,price
0,0,5506,01-12-2016,t,145
1,1,5506,02-12-2016,t,145
2,2,5506,03-12-2016,t,145
3,3,5506,04-12-2016,t,145
4,4,5506,05-12-2016,t,145


### Group the values by the listing_id and sort the price in ascending order of data

In [None]:
# df = calendar_data.copy()
# # Convert 'date' column to datetime format
# df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')

# # Group by 'listing_id' and aggregate 'price' into a list sorted by 'date'
# result = df.groupby('listing_id').apply(lambda x: sorted(x['price'].tolist(), key=lambda y: x.loc[x['price'] == y, 'date'].iloc[0]))

# #convert it to a dictionary
# grouped_calendar = dict(result)

#### Check for listings with different time frame length and remove them

In [None]:
# time_frame = len(grouped_calendar[5506])
# for keys in list(grouped_calendar.keys()):
#   if len(grouped_calendar[keys]) != time_frame:
#     print(f"Uneven time frame for key: {keys} ")


In [None]:
# # Remove the anomalous key
# _ = grouped_calendar.pop(12898806)

In [None]:
# Match the listing ids of calendar and listings file
# list1 = list(listing_data['id'])
# list2 = list(grouped_calendar.keys())
# uncommon_in_list1 = set(list1) - set(list2)
# uncommon_in_list2 = set(list2) - set(list1)

# uncommon_values = list(uncommon_in_list1.union(uncommon_in_list2))
# print(uncommon_values)

#### Save the processed file

In [None]:
# # Save this data
# with open("/content/drive/MyDrive/master/ML4G-Project/data/preprocessed/price.json", "w") as fp:
#   json.dump(grouped_calendar, fp)
# fp.close()

## Prepare static data

#### initialize the columns

In [None]:
# Columns/features to be used as static during training
static_columns = ["id","host_response_rate", "host_acceptance_rate", "host_is_superhost", "host_listings_count", "host_total_listings_count",
                  "host_has_profile_pic", "host_identity_verified", "neighbourhood_cleansed", "latitude", "longitude", "property_type",
                  "room_type", "accommodates", "bathrooms", "bedrooms", "beds", "minimum_nights", "maximum_nights", "number_of_reviews",
                  "availability_30", "availability_60", "availability_90", "availability_365", "number_of_reviews", "review_scores_rating",
                  "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication",
                  "review_scores_location", "review_scores_value", "requires_license", "instant_bookable", "cancellation_policy",
                  "reviews_per_month"]

static_categorical = ["neighbourhood_cleansed", "host_is_superhost", "host_has_profile_pic", "host_identity_verified", "property_type" , "room_type",
                      "requires_license", "instant_bookable", "cancellation_policy"]

# beds, bedrooms, bathrooms, property type has some issues with fillna, will include later
# requires_license has only one unique values
# "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin",
# "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month" have large number of missing values
cols_to_remove = ["review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin",
                  "review_scores_communication", "review_scores_location", "review_scores_value", "reviews_per_month",
                  "property_type", "beds", "bathrooms", "bedrooms", "requires_license"]

In [None]:
cols = list(set(static_columns) - set(cols_to_remove)) # remove the columns that need to be removed
numeric_cols = list(set(cols) - set(static_categorical)) # remove categorical to get columns
numeric_cols = list(set(numeric_cols) - set(["id"])) # Id added to match with price json file. Now we can remove it
static_categorical = list(set(static_categorical) - set(cols_to_remove)) # get the categorical columns

#### View the data

In [None]:
listing_data = listing_data[listing_data['id'] != 12898806] # Drop this column as it has a different length of price values

In [None]:
data = listing_data[cols]
data.head()

Unnamed: 0,latitude,neighbourhood_cleansed,availability_60,host_response_rate,instant_bookable,availability_30,number_of_reviews,host_is_superhost,host_has_profile_pic,host_listings_count,...,longitude,minimum_nights,cancellation_policy,maximum_nights,host_acceptance_rate,availability_365,id,host_identity_verified,host_total_listings_count,accommodates
0,42.286241,Roslindale,54,1.0,t,26,36,f,t,1,...,-71.134374,2,moderate,15,1.0,359,3075044,t,1,2
1,42.292438,Roslindale,46,1.0,f,19,41,t,t,1,...,-71.135765,3,moderate,45,0.88,319,6976,t,1,2
2,42.281106,Roslindale,16,1.0,f,6,1,f,t,1,...,-71.121021,1,moderate,1125,0.5,98,1436513,f,1,4
3,42.284512,Roslindale,34,1.0,f,13,29,t,t,1,...,-71.136258,2,flexible,31,1.0,334,7651065,t,1,2
4,42.29169,Roslindale,28,1.0,f,5,8,t,t,2,...,-71.131893,2,flexible,1125,0.95,58,12386020,t,2,2


In [None]:
# data.isnull().sum()

#### Fill up the null values

In [None]:
# Fill up host acceptance rate with its mean value
data.loc[:, 'host_acceptance_rate'] = data['host_acceptance_rate'].fillna(data['host_acceptance_rate'].mean())

# Fill up property type with mode value
#data.loc[:, 'property_type'] = data['property_type'].fillna(data['property_type'].mode())

# Fill up beds with its mode value
#data.loc[:,'beds'] = data['beds'].fillna(data['beds'].mode())

# Fill up bedrooms with its mode value
#data.loc[:, 'bedrooms'] = data['bedrooms'].fillna(data['bedrooms'].mode())

# Fill up bathrooms with its mode value
#data.loc[:,'bathrooms'] = data['bathrooms'].fillna(data['bathrooms'].mode())

# Fill up host response rate with its mean value
data.loc[:,'host_response_rate'] = data['host_response_rate'].fillna(data['host_response_rate'].mean())

In [None]:
# data.isnull().sum()

#### count the number of unique values for each columns

In [None]:
# unique_count = {}
# for cat_col in static_categorical:
#   unique_count[cat_col] = len(listing_data[static_categorical][cat_col].unique())
# print(unique_count)

#### Read the price file and append them to the listing_data dataframe with matching ids

In [None]:
with open("/content/drive/MyDrive/master/ML4G-Project/data/preprocessed/price.json", "r") as fp:
  price_data = json.load(fp)
fp.close()
# price_data

In [None]:
# Iterate over the DataFrame rows
price_lists = []
for index, row in data.iterrows():


    # Get the ID from the current row
    current_id = str(row['id'])
    if current_id == '12898806':
      continue
    # Get the price list from the dictionary for the current ID
    price_list = price_data[current_id]

    # Append the price list to the list of price lists
    price_lists.append(price_list)

# Assign the list of price lists to the 'price_list' column
data['price_list'] = price_lists
data = data.drop(columns=['id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price_list'] = price_lists


#### Extract categorical data and oneHotEncode them.

In [None]:
categorical_data = listing_data[static_categorical]
# categorical_data.head()
categorical_columns = categorical_data.select_dtypes(include=['object', 'category']).columns.tolist()
# Convert categorical variables into dummy/indicator variables
categorical_encoded = pd.get_dummies(categorical_data, columns=categorical_columns).values.astype(int)

In [None]:
# Extract numerical columns
numeric_columns = data[numeric_cols].values

In [None]:
X_static = np.concatenate([numeric_columns, categorical_encoded], axis = 1)
X_static.shape

(2628, 54)

## Prepare dynamic data

In [None]:
data["price_list"]

0       [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 6...
1       [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 6...
2       [75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 7...
3       [79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 7...
4       [75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 75, 7...
                              ...                        
2624    [69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 6...
2625    [150, 150, 150, 150, 150, 150, 150, 150, 150, ...
2626    [115, 115, 115, 115, 115, 115, 115, 115, 115, ...
2627    [59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 5...
2628    [65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 6...
Name: price_list, Length: 2628, dtype: object

In [None]:
price_list = []
for i in data["price_list"]:
  price_list.append(i)

In [None]:
X_dynamic = np.array(price_list)
X_dynamic

array([[ 65,  65,  65, ...,  75,  75,  75],
       [ 65,  65,  65, ...,  65,  65,  65],
       [ 75,  75,  75, ...,  75,  75,  75],
       ...,
       [115, 115, 115, ..., 115, 115, 115],
       [ 59,  59,  59, ...,  59,  59,  59],
       [ 65,  65,  65, ...,  65,  65,  65]])

#### Save the data

In [None]:
np.save("/content/drive/MyDrive/master/ML4G-Project/data/preprocessed/dynamic_data.npy", X_dynamic)
np.save("/content/drive/MyDrive/master/ML4G-Project/data/preprocessed/static_data.npy", X_static)