# Item Data Parser 

In [1]:
from collections import OrderedDict
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
# --- File locations (relative to the working directory) ---
# Raw item metadata read by this notebook.
METADATA_FILEPATH = os.path.join("data", "item_metadata.csv")
# Same file as METADATA_FILEPATH; the name is kept because later cells read DATA_PATH.
DATA_PATH = METADATA_FILEPATH

# Files written by later cells.
ITEM_FEATURES_FILEPATH = os.path.join("data", "item_features.csv")
ITEM_FEATURES_AS_DICT_FILEPATH = os.path.join("data", "item_features_as_dict.joblib")
DENSE_FEATURES_WRITE_PATH = os.path.join("data", "item_dense.csv")

# --- Dataset dimensions used to size the feature matrix ---
NUM_OF_ITEMS = 927142
NUM_OF_PROPS = 157

# Load the raw metadata table.
metadata = pd.read_csv(DATA_PATH)

In [26]:
# Preview the first rows of the raw metadata table.
metadata.head()

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


In [27]:
# Turn each "|"-separated property string into a list of property names.
# Uses the vectorized string accessor instead of a row-wise apply + lambda.
# NOTE(review): unlike the lambda, `.str.split` is expected to pass missing values
# through instead of raising on them — confirm `properties` has no missing entries.
metadata["splitted_props"] = metadata["properties"].str.split("|")

In [28]:
# Collect the vocabulary of property names from the column pandas already parsed.
# A second, hand-written pass over the raw CSV (platform-default text encoding, no CSV
# quoting rules) could produce names that differ from the values stored in
# `metadata["splitted_props"]`, which are the keys looked up when the matrix is filled.
all_props = set()  # a set drops duplicate names
for props in metadata["splitted_props"]:
    all_props.update(props)

# Sorted so every run assigns the same column to the same property.
all_props_as_list = sorted(all_props)

# Maps each property name to its column in the feature matrix.
# Columns 0 and 1 hold the item index and the item id, hence the +2 offset.
prop_name_to_index = {
    prop_name: position + 2
    for position, prop_name in enumerate(all_props_as_list)
}

In [31]:
# Size the matrix from the loaded data rather than from hard-coded counts, so the cell
# keeps working (and cannot silently leave all-zero rows) if the metadata file changes.
NUM_OF_ITEMS = len(metadata)
NUM_OF_PROPS = len(all_props_as_list)

# One row per item: column 0 = item index, column 1 = item id, then one 0/1 column per
# property. `prop_name_to_index` already includes the +2 offset for the first two columns.
item_features = np.zeros((NUM_OF_ITEMS, NUM_OF_PROPS + 2))

for current_item_index, (item_id, props) in enumerate(
    tqdm(metadata[["item_id", "splitted_props"]].to_numpy())
):
    item_features[current_item_index, 0] = current_item_index  # index feature
    item_features[current_item_index, 1] = item_id  # id feature

    # multi-hot encoding: flag the column of every property the item has
    for prop in props:
        item_features[current_item_index, prop_name_to_index[prop]] = 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 927142/927142 [00:07<00:00, 122261.90it/s]


In [32]:
# Column labels: the two bookkeeping columns followed by one column per property name.
dataframe_columns = ["item_index", "item_id", *all_props_as_list]

In [42]:
# Wrap the feature matrix in a labelled DataFrame, casting every column to integer.
item_features_dataframe = pd.DataFrame(
    data=item_features,
    columns=dataframe_columns,
    dtype=int,
)

In [43]:
# Preview the first rows of the multi-hot feature table.
item_features_dataframe.head()

Unnamed: 0,item_index,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,0,5101,0,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,1,1
1,1,5416,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
2,2,5834,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,1
3,3,5910,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,4,6066,0,0,0,1,0,1,1,0,...,1,0,1,0,1,0,0,1,1,1


In [22]:
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import joblib

# Item data parser: reads the item metadata, multi-hot encodes the "|"-separated
# `properties` column, and writes the result as a CSV plus a joblib dump of a
# {item_id: [property indices]} dictionary.

METADATA_FILEPATH = os.path.join("data", "item_metadata.csv")
ITEM_FEATURES_FILEPATH = os.path.join("data", "item_features.csv")
ITEM_FEATURES_AS_DICT_FILEPATH = os.path.join("data", "item_features_as_dict.joblib")
DENSE_FEATURES_WRITE_PATH = os.path.join("data", "item_dense.csv")

metadata = pd.read_csv(METADATA_FILEPATH)

# split properties and save in another column (vectorized instead of apply + lambda)
metadata["splitted_props"] = metadata["properties"].str.split("|")

# Get the set of all properties from the parsed column. Re-reading the raw CSV by hand
# (platform-default encoding, no CSV quoting rules) could yield names that differ from
# the values in `splitted_props`, which are the keys looked up in the loop below.
all_props = set()  # to drop duplicate values
for props in metadata["splitted_props"]:
    all_props.update(props)

# Sorted so every run assigns the same index to the same property.
all_props_as_list = sorted(all_props)

# maps prop name to its 0-based position in the sorted property list
prop_name_to_index = {
    prop_name: position for position, prop_name in enumerate(all_props_as_list)
}

# Derived from the data instead of hard-coded, so the matrix always matches the file.
NUM_OF_ITEMS = len(metadata)
NUM_OF_PROPS = len(all_props_as_list)

# One row per item: column 0 = item index, column 1 = item id, then one 0/1 column
# per property.
item_features = np.zeros((NUM_OF_ITEMS, NUM_OF_PROPS + 2))
item_features_as_dict = defaultdict(list)

for current_item_index, (item_id, props) in enumerate(
    tqdm(metadata[["item_id", "splitted_props"]].to_numpy())
):
    item_features[current_item_index, 0] = current_item_index  # index feature
    item_features[current_item_index, 1] = item_id  # id feature

    for prop in props:
        prop_index = prop_name_to_index[prop]
        item_features[current_item_index, prop_index + 2] = 1  # +2 for item_index and item_id
        item_features_as_dict[item_id].append(prop_index)

dataframe_columns = ["item_index", "item_id"] + all_props_as_list
item_features_dataframe = pd.DataFrame(item_features, columns=dataframe_columns, dtype=int)

item_features_dataframe.to_csv(ITEM_FEATURES_FILEPATH, index=False)
joblib.dump(item_features_as_dict, ITEM_FEATURES_AS_DICT_FILEPATH)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 927142/927142 [00:10<00:00, 92462.58it/s]


['data\\item_features_as_dict.joblib']

# Extract Hotel Dense Features (Experimental)

In [33]:
# Reload the multi-hot feature table from ITEM_FEATURES_FILEPATH.
item_features = pd.read_csv(ITEM_FEATURES_FILEPATH)

# Preview the first rows.
item_features.head()

Unnamed: 0,item_index,item_id,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
0,0,5101,0,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,1,1
1,1,5416,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,1,1,1
2,2,5834,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,1
3,3,5910,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,4,6066,0,0,0,1,0,1,1,0,...,1,0,1,0,1,0,0,1,1,1


In [34]:

# Condenses the multi-hot property table into a handful of dense columns:
# a few selected property flags plus `rating`, `stars` and `hotel_category`.

# Accommodation-type property column -> label stored in `hotel_category`.
hotel_categories = {
    "Hotel": "hotel",
    "Resort": "resort",
    "Hostal (ES)": "hostal",
    "Motel": "motel",
    "House / Apartment": "house",
}

# Rating property column -> ordinal level stored in `rating`.
rating_levels = {
    "Satisfactory Rating": 1,
    "Good Rating": 2,
    "Very Good Rating": 3,
    "Excellent Rating": 4,
}

# Star property column -> number of stars stored in `stars`.
star_levels = {
    "1 Star": 1,
    "2 Star": 2,
    "3 Star": 3,
    "4 Star": 4,
    "5 Star": 5,
}

## important features are taken from the paper of Logic AI team
## link: https://github.com/logicai-io/recsys2019/
important_features = [
    "Free WiFi (Combined)",
    "Swimming Pool (Combined Filter)",
    "Car Park",
    "Serviced Apartment",
    "Air Conditioning",
    "Spa (Wellness Facility)",
    "Pet Friendly",
    "All Inclusive (Upon Inquiry)",
]

# Number of property flags set per item (every column except the two bookkeeping ones).
# NOTE(review): n_properties is not listed in final_features below, so it is dropped by
# the final column selection — confirm whether it was meant to be part of the output.
item_features["n_properties"] = item_features.drop(columns=["item_index", "item_id"]).sum(axis=1)

# Each derived column starts as None (left as-is for items with no matching flag) and is
# then filled from its mapping. Mappings are applied in insertion order, so when an item
# has several flags of one group set, the last matching entry wins.
derived_columns = {
    "rating": rating_levels,
    "stars": star_levels,
    "hotel_category": hotel_categories,
}
for derived_column, flag_to_value in derived_columns.items():
    item_features[derived_column] = None
    for flag_column, value in flag_to_value.items():
        item_features.loc[item_features[flag_column] == 1, derived_column] = value

final_features = important_features + ["item_index", "item_id", "rating", "stars", "hotel_category"]
item_features = item_features[final_features]

In [35]:
def normalize_feature_name(name):
    """Return `name` in lower case with every space replaced by an underscore."""
    words = name.split(" ")
    return "_".join(words).lower()

# Normalize every column label. `rename` accepts a function for `columns`, so no explicit
# old -> new dictionary is needed; the result is rebound rather than renaming in place.
item_features = item_features.rename(columns=normalize_feature_name)

In [36]:
# Count missing values per column.
item_features.isna().sum()

free_wifi_(combined)                    0
swimming_pool_(combined_filter)         0
car_park                                0
serviced_apartment                      0
air_conditioning                        0
spa_(wellness_facility)                 0
pet_friendly                            0
all_inclusive_(upon_inquiry)            0
item_index                              0
item_id                                 0
rating                             393856
stars                              663938
hotel_category                     164147
dtype: int64

In [37]:
# Preview the first rows of the dense feature table.
item_features.head()

Unnamed: 0,free_wifi_(combined),swimming_pool_(combined_filter),car_park,serviced_apartment,air_conditioning,spa_(wellness_facility),pet_friendly,all_inclusive_(upon_inquiry),item_index,item_id,rating,stars,hotel_category
0,0,0,1,0,1,0,0,0,0,5101,2,4,hotel
1,1,0,1,0,0,1,1,0,1,5416,4,4,hotel
2,1,0,1,0,0,0,0,0,2,5834,3,3,hotel
3,1,0,1,0,0,0,1,0,3,5910,2,4,hotel
4,1,1,1,0,0,1,1,0,4,6066,2,4,hotel


In [38]:
# Destination of the dense feature table.
DENSE_FEATURES_WRITE_PATH = os.path.join("data", "item_dense.csv")

# NOTE(review): the row index is written too (index=True is the to_csv default), so the
# file starts with an unnamed index column — confirm readers of this file expect it.
item_features.to_csv(DENSE_FEATURES_WRITE_PATH, index=True)

# Final Compiled Versions

## Item Data Parser

In [41]:
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import joblib

# Item data parser: reads the item metadata, multi-hot encodes the "|"-separated
# `properties` column, and writes the resulting table to ITEM_FEATURES_FILEPATH.

METADATA_FILEPATH = os.path.join("data", "item_metadata.csv")
ITEM_FEATURES_FILEPATH = os.path.join("data", "full_item_features.csv")
# Not used in this cell; the "Dense Feature Extractor" cell assigns its own value.
DENSE_FEATURES_WRITE_PATH = os.path.join("data", "item_dense.csv")

metadata = pd.read_csv(METADATA_FILEPATH)

# split properties and save in another column (vectorized instead of apply + lambda)
metadata["splitted_props"] = metadata["properties"].str.split("|")

# Get the set of all properties from the parsed column. Re-reading the raw CSV by hand
# (platform-default encoding, no CSV quoting rules) could yield names that differ from
# the values in `splitted_props`, which are the keys looked up in the loop below.
all_props = set()  # to drop duplicate values
for props in metadata["splitted_props"]:
    all_props.update(props)

# Sorted so every run assigns the same index to the same property.
all_props_as_list = sorted(all_props)

# maps prop name to its 0-based position in the sorted property list
prop_name_to_index = {
    prop_name: position for position, prop_name in enumerate(all_props_as_list)
}

# Derived from the data instead of hard-coded, so the matrix always matches the file.
NUM_OF_ITEMS = len(metadata)
NUM_OF_PROPS = len(all_props_as_list)

# One row per item: column 0 = item index, column 1 = item id, then one 0/1 column
# per property.
item_features = np.zeros((NUM_OF_ITEMS, NUM_OF_PROPS + 2))
# {item_id: [property indices]}; built here but not written to disk by this cell.
item_features_as_dict = defaultdict(list)

for current_item_index, (item_id, props) in enumerate(
    tqdm(metadata[["item_id", "splitted_props"]].to_numpy())
):
    item_features[current_item_index, 0] = current_item_index  # index feature
    item_features[current_item_index, 1] = item_id  # id feature

    for prop in props:
        prop_index = prop_name_to_index[prop]
        item_features[current_item_index, prop_index + 2] = 1  # +2 for item_index and item_id
        item_features_as_dict[item_id].append(prop_index)

dataframe_columns = ["item_index", "item_id"] + all_props_as_list
item_features_dataframe = pd.DataFrame(item_features, columns=dataframe_columns, dtype=int)

item_features_dataframe.to_csv(ITEM_FEATURES_FILEPATH, index=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 927142/927142 [00:09<00:00, 92902.95it/s]


## Dense Feature Extractor

In [42]:
import pandas as pd
import os

# Input: the full multi-hot feature table. Output: the dense feature table.
ITEM_FEATURES_FILEPATH = os.path.join("data", "full_item_features.csv")
DENSE_FEATURES_WRITE_PATH = os.path.join("data", "item_dense_features.csv")

item_features = pd.read_csv(filepath_or_buffer=ITEM_FEATURES_FILEPATH)


def normalize_feature_name(name):
    """Return `name` in lower case with every space replaced by an underscore."""
    words = name.split(" ")
    return "_".join(words).lower()

# Condenses the multi-hot property table into a handful of dense columns
# (selected property flags plus `rating`, `stars` and `hotel_category`),
# normalizes the column names and writes the result to DENSE_FEATURES_WRITE_PATH.

# Accommodation-type property column -> label stored in `hotel_category`.
hotel_categories = {
    "Hotel": "hotel",
    "Resort": "resort",
    "Hostal (ES)": "hostal",
    "Motel": "motel",
    "House / Apartment": "house",
}

# Rating property column -> ordinal level stored in `rating`.
rating_levels = {
    "Satisfactory Rating": 1,
    "Good Rating": 2,
    "Very Good Rating": 3,
    "Excellent Rating": 4,
}

# Star property column -> number of stars stored in `stars`.
star_levels = {
    "1 Star": 1,
    "2 Star": 2,
    "3 Star": 3,
    "4 Star": 4,
    "5 Star": 5,
}

## important features are taken from the paper of Logic AI team
## link: https://github.com/logicai-io/recsys2019/
important_features = [
    "Free WiFi (Combined)",
    "Swimming Pool (Combined Filter)",
    "Car Park",
    "Serviced Apartment",
    "Air Conditioning",
    "Spa (Wellness Facility)",
    "Pet Friendly",
    "All Inclusive (Upon Inquiry)",
]

# Number of property flags set per item (every column except the two bookkeeping ones).
# NOTE(review): n_properties is not listed in final_features below, so it is dropped by
# the final column selection — confirm whether it was meant to be part of the output.
item_features["n_properties"] = item_features.drop(columns=["item_index", "item_id"]).sum(axis=1)

# Each derived column starts as None (left as-is for items with no matching flag) and is
# then filled from its mapping. Mappings are applied in insertion order, so when an item
# has several flags of one group set, the last matching entry wins.
derived_columns = {
    "rating": rating_levels,
    "stars": star_levels,
    "hotel_category": hotel_categories,
}
for derived_column, flag_to_value in derived_columns.items():
    item_features[derived_column] = None
    for flag_column, value in flag_to_value.items():
        item_features.loc[item_features[flag_column] == 1, derived_column] = value

final_features = ["item_index", "item_id"] + important_features +  ["rating", "stars", "hotel_category"]

# Select the output columns and normalize their labels in one step. `rename` accepts a
# function for `columns`; rebinding the result avoids an in-place rename on the
# column subset.
item_features = item_features[final_features].rename(columns=normalize_feature_name)

print(item_features.isna().sum())

# NOTE(review): the row index is written too (index=True is the to_csv default), so the
# file starts with an unnamed index column — confirm readers of this file expect it.
item_features.to_csv(DENSE_FEATURES_WRITE_PATH, index=True)

item_index                              0
item_id                                 0
free_wifi_(combined)                    0
swimming_pool_(combined_filter)         0
car_park                                0
serviced_apartment                      0
air_conditioning                        0
spa_(wellness_facility)                 0
pet_friendly                            0
all_inclusive_(upon_inquiry)            0
rating                             393856
stars                              663938
hotel_category                     164147
dtype: int64
