In [1]:
# This is a test notebook.
# No particular format is followed here; it is just for experimentation.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

from sklearn.ensemble import IsolationForest
from urllib.parse import urlparse

# Substring searched for in the URL columns to recognise TripAdvisor links.
URL_TRIPADVISOR = 'https://www.tripadvisor.com'
# Placeholder written into `referrerurl` in place of an empty string.
URL_EMPTY_STRING = 'https://www.this_is_an_empty_string.com'
# Multiplier turning a duration in seconds into days (1 / (60 * 60 * 24)).
SECONDS_TO_DAYS = 1/60/60/24

# Let pandas show up to 100 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 100)

In [3]:
def first_data_process(data):
    """Split one user's events into sessions that each end on a TripAdvisor link.

    Rows are ordered by ``eventtimestamp``. Every row whose ``targeturl``
    contains ``URL_TRIPADVISOR`` closes a session made of that row plus all
    rows since the previous such row. Rows going from a TripAdvisor URL to a
    TripAdvisor URL are dropped beforehand, and rows located after the last
    TripAdvisor link never make it into a session.

    Parameters
    ----------
    data : pandas.DataFrame
        Must hold the columns ``eventtimestamp``, ``referrerurl``,
        ``targeturl`` and ``userid``. ``userid`` is concatenated with a string
        suffix, so it is presumably a string column (TODO confirm against the
        raw data).

    Returns
    -------
    pandas.DataFrame
        The input columns without ``userid``, plus ``trip_advisor_presence``
        (1.0 / 0.0) and ``sessionid`` (``"<userid>_<n>"``). ``sessionid`` is the
        literal ``'something wrong'`` when no session could be built, or when
        the sessions together hold a single row.
    """
    df = data.copy()  # work on a copy so the caller's frame is left untouched

    # Sort the events by their time stamp.
    df = df.sort_values('eventtimestamp')

    # An empty referrer is swapped for a placeholder URL so it can be matched below.
    df.loc[df.referrerurl == '', 'referrerurl'] = URL_EMPTY_STRING

    # regex=False: the URL is a plain substring, its dots must not behave as
    # wildcards. na=False: a missing URL simply does not match.
    target_is_ta = df.targeturl.str.contains(URL_TRIPADVISOR, regex=False, na=False)
    referrer_is_ta = df.referrerurl.str.contains(URL_TRIPADVISOR, regex=False, na=False)
    df['trip_advisor_presence'] = target_is_ta.astype('float64')

    # Skip the rows that go from a TripAdvisor URL to another TripAdvisor URL.
    df = df[~(target_is_ta & referrer_is_ta).to_numpy()]

    # We care about the distinct chains of events that lead to each TripAdvisor
    # link. These are the "sessions": a session is a series of events that
    # ultimately leads to a new TripAdvisor URL.
    # Positions (not labels) are used so a non-unique index cannot break the slicing.
    closing_positions = np.flatnonzero(df.trip_advisor_presence.to_numpy() == 1)

    sessions = []
    start = 0
    for session_nb, closing_pos in enumerate(closing_positions):
        stop = closing_pos + 1
        # .copy() so that adding a column never writes into a slice of df.
        session = df.iloc[start:stop].copy()
        session['sessionid'] = session['userid'] + f"_{session_nb}"

        if session.shape[0] == 1:
            # A one-row session whose referrer was an empty string brings
            # nothing to the analysis; keep only the ones with a real referrer.
            lone_hit_without_referrer = (
                session.targeturl.str.contains(URL_TRIPADVISOR, regex=False, na=False)
                & session.referrerurl.str.contains(URL_EMPTY_STRING, regex=False, na=False)
            )
            session = session[~lone_hit_without_referrer]

        if session.shape[0] > 0:
            sessions.append(session)
        start = stop

    # Whatever happened after the last TripAdvisor URL is not part of any
    # session and is intentionally left out.
    if sessions:
        df_sessions = pd.concat(sessions).drop('userid', axis=1)
        # The length of the result decides whether it is worth keeping.
        if df_sessions.shape[0] > 1:
            return df_sessions
        df_sessions['sessionid'] = 'something wrong'
        return df_sessions

    # No session at all: hand back the remaining rows, tagged as unusable.
    return df.assign(sessionid='something wrong').drop('userid', axis=1)

def identify_outliers(data):
    """Flag anomalous gaps between consecutive events and derive sub-session ids.

    The gap between each row's ``eventtimestamp`` and the previous row's is
    scored with an ``IsolationForest``; every row not predicted as an inlier
    starts a new sub-session.

    Parameters
    ----------
    data : pandas.DataFrame
        Must hold the columns ``eventtimestamp`` and ``sessionid``. The rows
        are used in the order given (presumably already sorted by time;
        verify against the caller).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the extra columns ``click_seconds``,
        ``click_days``, ``outliers`` (0/1), ``subsessionid``
        (``"<sessionid>__<n>"``) and ``subsessionid_nb`` (the running count of
        outliers).
    """
    df = data.copy()
    # Gap to the previous row; the first row has no predecessor and gets 0.
    df['click_seconds'] = (df.eventtimestamp - df.eventtimestamp.shift(1)).fillna(0)
    df['click_days'] = df['click_seconds'] * SECONDS_TO_DAYS

    # Each frame passed in gets its own model, i.e. its own notion of "outlier".
    if df.shape[0]:
        gaps = df['click_days']
        random_seed = 0
        try:
            # The forest is fitted on a 10% sample and then scores every row.
            model = IsolationForest(
                random_state=random_seed,
                n_estimators=100
            ).fit(
                gaps.dropna().sample(
                    frac=0.1,
                    random_state=random_seed)
                .to_numpy().reshape(-1, 1)
            )
            is_outlier = model.predict(gaps.fillna(0).to_numpy().reshape(-1, 1)) != 1
        except Exception:
            # Best-effort fallback (for instance when the 10% sample is empty):
            # every row is flagged, so each row becomes its own sub-session.
            # Only Exception is caught so that a kernel interrupt still stops the run.
            # NOTE(review): confirm that "all outliers" is the intended fallback.
            is_outlier = np.ones(df.shape[0], dtype=bool)
        df['outliers'] = is_outlier
        df['outliers'] = df['outliers'].astype('int')
    else:
        df['outliers'] = 0

    # The running number of outliers seen so far identifies the sub-session.
    cumsum_val = df.outliers.cumsum()
    df['subsessionid'] = df.sessionid + '__' + cumsum_val.astype('str')
    df['subsessionid_nb'] = cumsum_val.astype('int')
    return df

def get_grouped_subsessionid_list(data, col1='referrerurl', col2='targeturl'):
    """Flatten two columns of a frame into one list, row by row.

    Rows whose ``click_seconds`` equals 0 are left out, except the first such
    row, which is kept. The remaining rows contribute ``col1`` then ``col2``,
    in row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must hold ``click_seconds`` plus the two requested columns. It is not
        modified. An empty frame yields an empty list.
    col1, col2 : str
        Names of the columns to interleave.

    Returns
    -------
    list
        ``[col1[0], col2[0], col1[1], col2[1], ...]`` over the kept rows.
    """
    # Work with positions rather than index labels so duplicated labels (or an
    # empty frame) cannot change which rows are dropped.
    zero_click_positions = np.flatnonzero((data['click_seconds'] == 0).to_numpy())
    keep = np.ones(len(data), dtype=bool)
    # Every zero-gap row but the first one is discarded.
    keep[zero_click_positions[1:]] = False
    return data[[col1, col2]].to_numpy()[keep].ravel().tolist()

In [7]:
def process_file(file_path):
    """Turn one raw parquet file into a table with one row per kept sub-session.

    Steps: read the file, build sessions per user with ``first_data_process``,
    discard the rows tagged ``'something wrong'``, split both URL columns into
    netloc / query / path, tag sub-sessions per user with
    ``identify_outliers`` and finally summarise the last sub-session of every
    session.

    Parameters
    ----------
    file_path : str
        Path of the parquet file to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``subsessionid`` with the columns ``subsession_duration``,
        ``platforms_used``, ``url_link_list``, ``urlloc_link_list`` and
        ``reduced_urlloc_link_list`` (``urlloc_link_list`` without consecutive
        repeats).
    """
    raw = pd.read_parquet(file_path)
    sessions = raw.groupby('userid').apply(first_data_process).reset_index()

    del raw  # the raw frame is no longer needed

    # Rows tagged 'something wrong' did not produce a usable session.
    usable = sessions.sessionid != 'something wrong'
    sessions = sessions[usable].reset_index(drop=True)

    # Break both URL columns into their netloc / query / path components.
    for url_col in ('referrerurl', 'targeturl'):
        parsed = sessions[url_col].apply(urlparse)
        for part in ('netloc', 'query', 'path'):
            sessions[f'{url_col}_{part}'] = parsed.apply(
                lambda pieces, part=part: getattr(pieces, part)
            )

    events = (
        sessions.groupby('userid')
        .apply(identify_outliers)
        .drop(['userid', 'level_1'], axis=1)
        .reset_index()
    )

    del sessions

    # Keep only the rows belonging to the last sub-session of each session.
    last_subsession = events.groupby('sessionid').subsessionid.last()
    final_events = events[events.subsessionid.isin(last_subsession)]
    by_subsession = final_events.groupby('subsessionid')

    url_lists = by_subsession.apply(get_grouped_subsessionid_list)
    netloc_lists = by_subsession.apply(
        get_grouped_subsessionid_list,
        col1='referrerurl_netloc',
        col2='targeturl_netloc'
    )
    summary = by_subsession.agg(
        subsession_duration=('eventtimestamp', lambda stamps: np.max(stamps) - np.min(stamps)),
        platforms_used=('platform', lambda values: values.unique().tolist()),
    )
    summary['url_link_list'] = url_lists
    summary['urlloc_link_list'] = netloc_lists
    # Collapse runs of identical consecutive entries into a single entry.
    summary['reduced_urlloc_link_list'] = summary.urlloc_link_list.apply(
        lambda links: [
            link for pos, link in enumerate(links)
            if pos == 0 or link != links[pos - 1]
        ]
    )
    return summary

In [8]:
# Folder holding the raw parquet files, relative to the working directory.
data_path = '../data/raw/'
# os.listdir gives no ordering guarantee; sorting keeps the processing order
# reproducible between runs and machines.
files = sorted(
    os.path.join(data_path, doc)
    for doc in os.listdir(data_path)
    if doc.endswith('parquet')
)

In [9]:
# Run the whole pipeline on every raw file. Results are stored in the same
# order as `files`, and each path is printed once its file has been processed.
file_info = []
for file in files: 
    info = process_file(file)
    file_info.append(info)
    print(file)

../data/raw/data_0.parquet
../data/raw/data_1.parquet
../data/raw/data_10.parquet
../data/raw/data_11.parquet
../data/raw/data_12.parquet
../data/raw/data_13.parquet
../data/raw/data_14.parquet
../data/raw/data_15.parquet
../data/raw/data_16.parquet
../data/raw/data_17.parquet
../data/raw/data_18.parquet
../data/raw/data_19.parquet
../data/raw/data_2.parquet
../data/raw/data_20.parquet
../data/raw/data_21.parquet
../data/raw/data_22.parquet
../data/raw/data_23.parquet
../data/raw/data_24.parquet
../data/raw/data_25.parquet
../data/raw/data_26.parquet
../data/raw/data_27.parquet
../data/raw/data_28.parquet
../data/raw/data_29.parquet
../data/raw/data_3.parquet
../data/raw/data_30.parquet
../data/raw/data_31.parquet
../data/raw/data_32.parquet
../data/raw/data_33.parquet
../data/raw/data_34.parquet
../data/raw/data_35.parquet
../data/raw/data_36.parquet
../data/raw/data_37.parquet
../data/raw/data_38.parquet
../data/raw/data_39.parquet
../data/raw/data_4.parquet
../data/raw/data_40.parqu

In [11]:
# Stack the per-file results into a single frame.
f = pd.concat(file_info)

In [22]:
# Save every processed frame under its original file name, in an 'interim'
# folder sitting next to the folder that holds the raw file
# (e.g. '../data/raw/x.parquet' -> '../data/interim/x.parquet').
# The loop variables avoid the name `f`, which the notebook also uses for the
# concatenated frame.
for raw_path, processed in zip(files, file_info):
    print(raw_path)
    raw_dir, file_name = os.path.split(raw_path)
    interim_dir = os.path.join(os.path.dirname(raw_dir), 'interim')
    # Create the output folder on first use instead of failing when it is missing.
    os.makedirs(interim_dir, exist_ok=True)
    processed.to_parquet(os.path.join(interim_dir, file_name))
        
        

../data/raw/data_0.parquet
../data/raw/data_1.parquet
../data/raw/data_10.parquet
../data/raw/data_11.parquet
../data/raw/data_12.parquet
../data/raw/data_13.parquet
../data/raw/data_14.parquet
../data/raw/data_15.parquet
../data/raw/data_16.parquet
../data/raw/data_17.parquet
../data/raw/data_18.parquet
../data/raw/data_19.parquet
../data/raw/data_2.parquet
../data/raw/data_20.parquet
../data/raw/data_21.parquet
../data/raw/data_22.parquet
../data/raw/data_23.parquet
../data/raw/data_24.parquet
../data/raw/data_25.parquet
../data/raw/data_26.parquet
../data/raw/data_27.parquet
../data/raw/data_28.parquet
../data/raw/data_29.parquet
../data/raw/data_3.parquet
../data/raw/data_30.parquet
../data/raw/data_31.parquet
../data/raw/data_32.parquet
../data/raw/data_33.parquet
../data/raw/data_34.parquet
../data/raw/data_35.parquet
../data/raw/data_36.parquet
../data/raw/data_37.parquet
../data/raw/data_38.parquet
../data/raw/data_39.parquet
../data/raw/data_4.parquet
../data/raw/data_40.parqu

In [19]:
# Number of processed results held in memory (one per file that was run).
len(file_info)

48