In [1]:
from tqdm import tqdm
import pandas as pd
import json

# Data pre-processing

## Filter out unwanted rows

In [2]:
original_data_filename = 'hospitality_dataset_2020.csv'
trimmed_data_filename = 'trimmed_hospitality_dataset_2020.csv'
ids_data_filename = 'streams_posts_comments_ids.json'
user_profiles_filename = 'user_profiles_hospitality_dataset_2020.csv'

In [30]:
# parameters & functions
cols_to_read = ['user_id', 'path', 'normalized_path', 'method']
cols_to_write = ['user_id', 'path', 'normalized_path', 'method']
endpoints_list = [
    '/streams/{streamid}/posts',
    '/streams/{streamid}/subscribe',
    '/streams/{streamid}/members',
    '/streams/{streamid}/subscribe',
    
    '/posts/{postid}/comments',
    '/posts',
    '/posts/{postid}/like',
    '/posts/{postid}/likes',
    '/posts/{postids}/read',
    '/posts/{post_id}/vote/{option_id}',
    
    '/comments/{id}/likes',
    '/comments/{id}/keep',
    '/comments/{id}/like',
]
methods_list = ['POST']

def filter_rows(path, method):
    return (not pd.isnull(path) # path must not be NaN
            and not pd.isnull(method) # method must not be NaN
            and any(endpoint in path for endpoint in endpoints_list) # the path must contain one of the wanted endpoints
            and method in methods_list) # the method should be one of the wanted methods

# read data
data = pd.read_csv(f'./data/{original_data_filename}', chunksize=60, usecols=cols_to_read) # data is an iterable

header = True
for chunk in tqdm(data):
    # filter out unwanted rows
    chunk = chunk[chunk[['normalized_path', 'method']].apply(lambda x: filter_rows(*x), axis=1)]
    # save remaining rows
    chunk.to_csv(f'./data/{trimmed_data_filename}', header=header, columns=cols_to_write, index=False, mode='a')
    header = False # only first chunk should have a header

830563it [1:41:43, 136.08it/s]


## Create user profiles

User profiles:
- Sparse matrix NxM with N users and M features
- Features (for each stream):
    - Number of posts the user read
    - Number of posts the user posted
    - Number of posts and comments the user liked
    - Number of comments the user posted

In [20]:
# load streams, posts and comments ids
with open(f'data/{ids_data_filename}', 'r', encoding="utf8") as ids_file:
    posts_ids, comments_ids = json.load(ids_file)
streams_ids_set = set(posts_ids.values())

In [21]:
features = {
    ('/posts/{postids}/read', 'POST'): 'read',
    ('/streams/{streamid}/posts', 'POST'): 'posts',
    ('/posts/{postid}/like', 'POST'): 'likes',
    ('/posts/{postid}/likes', 'POST'): 'likes',
    ('/comments/{id}/likes', 'POST'): 'likes',
    ('/comments/{id}/keep', 'POST'): 'likes',
    ('/comments/{id}/like', 'POST'): 'likes',
    ('/posts/{postid}/comments', 'POST'): 'comments',
}

In [22]:
# identify columns for future user profiles dataframe
up_cols = [f'{stream_id}_{feature}' for stream_id in streams_ids_set for feature in set(features.values())]
user_profiles = {} 

In [23]:
count_entries = 0
count_unknown_id = 0

# read data
data = pd.read_csv(f'./data/{trimmed_data_filename}', chunksize=60)

for chunk in tqdm(data):
    for _, row in chunk.iterrows(): # for each row
        
        # extract row info
        user_id = row['user_id']
        normalized_path = row['normalized_path']
        path = row['path']
        method = row['method']
        
        # if normalized_path is one of those we want to keep
        try:
            feature = features[(normalized_path.replace('/api/2', ''), method)] # get feature name
            count_entries += 1
        except KeyError:
            break
        
        # extract ids from the path
        ids = path.replace(normalized_path[:normalized_path.find('{')], '')\
                    .replace(normalized_path[normalized_path.find('}')+1:], '').split(',')
        
        normalized_path_action = normalized_path.split('/')[3]
        for element_id in ids: # for each id
            # use posts_ids and comments_ids dictionnaries to find the stream_id
            try:
                if normalized_path_action == 'streams':
                    stream_id = element_id
                elif normalized_path_action == 'posts':
                    stream_id = posts_ids[element_id]
                elif normalized_path_action == 'comments':
                    stream_id = posts_ids[comments_ids[element_id]]
            except KeyError: 
                count_unknown_id += 1
                break # id unknown

            column = f'{stream_id}_{feature}'
            
            try: # if the user already has a row
                user_profiles[user_id][column] += 1 # increment feature counter
            except KeyError: # create a new row
                user_profiles[user_id] = {col: 0 for col in up_cols}
                user_profiles[user_id][column] = 1

print(f'Number of rows used from {trimmed_data_filename}: {count_entries}')
print(f'Number of entries using unknown id(s): {count_unknown_id}')

28309it [03:58, 118.51it/s]

Number of rows used from trimmed_hospitality_dataset_2020.csv: 1308380
Number of entries using unknown id(s): 35315





In [26]:
user_profiles_df = pd.DataFrame.from_dict(user_profiles, orient='index') # convert from dict to DataFrame
user_profiles_df = user_profiles_df.dropna(axis=1) # drop all columns with any missing value
user_profiles_df.to_csv(f'./data/{user_profiles_filename}', header=True, index=False) # save user profiles to a file
print(f'Number of unique users: {len(user_profiles_df)}')
user_profiles_df.head()

Number of unique users: 7767


Unnamed: 0,9731_read,9731_likes,9731_comments,9731_posts,7174_read,7174_likes,7174_comments,7174_posts,12807_read,12807_likes,...,132077_comments,132077_posts,132078_read,132078_likes,132078_comments,132078_posts,50683_read,50683_likes,50683_comments,50683_posts
80c79718-b5ae-4e79-9b1a-b42461b934d0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fee5578c-cbcd-402d-a698-db9a58af6fb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
284881a1-833d-49d9-9b7f-42094fdbbca1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
be9ce283-a20f-4110-9523-e1c70d657add,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0fe64dcb-547e-4f4f-a158-a66b5edd422f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
