# Description

Analyze the clickstream data to identify the most common user journeys leading to TripAdvisor. 

What patterns or sequences of sites or pages do users typically navigate through before reaching TripAdvisor? 
Are there specific categories, themes, or domains that are common in these preTripAdvisor sessions? 

Interpret your findings (visualization and statistical analysis is optional, but it will increase your valuation as a candidate).

# Imports Libraries

In [1]:
# standard libraries
import os
import pandas as pd

# external libraries
import plotly.express as px
import tldextract

# Data cleaning

In [16]:
def get_clean_data(file: str) -> pd.DataFrame:
    '''Returns the data cleaned after the process of:
           ordering it, 
           add domain,
           and subdomain. 
       It also saves the new file for future iteration'''
    if os.path.exists(f'data/clean_{file}'):
        return pd.read_parquet(f'data/clean_{file}')
    
    data = order_data(pd.read_parquet(f'data/{file}'))
    data['domain'] = data['targeturl'].apply(get_domain)
    data['sub_domain'] = data['targeturl'].apply(get_sub_domain)
    data.to_parquet(f'data/clean_{file}')
    return data

In [3]:
def order_data(data: pd.DataFrame) -> pd.DataFrame:
    '''Returns the data ordered by the id and the timestamp'''
    sorted_ids = dict(zip(data.userid.unique(), range(len(data.userid.unique()))))
    data['sortedids'] = data.userid.map(sorted_ids)
    data = data.sort_values(['sortedids', 'eventtimestamp'])
    data.drop(columns=['sortedids'])
    return data

In [14]:
def get_domain(url: str) -> str:
    '''Returns the domain of an url,
    using the library tldextract'''
    try:
        return tldextract.extract(url)[1]
    except:
        return np.nan

In [15]:
def get_sub_domain(url: str) -> str:
    '''Returns the sub domain of an url,
    using the library tldextract'''
    try:
        return tldextract.extract(url)[0]
    except:
        return np.nan

# Data Transformation

In [32]:
def get_user_journeys(data: pd.DataFrame, 
                      max_timestamp: int=14400) -> pd.DataFrame:
    '''journey refers to a series of clicks, 
    page views, and user actions occurring within a specific 
    time frame that ultimately leads to TripAdvisor.
    This function returns a DataFrame with these user journeys
    to obtain insights into user preferences, 
    behaviors, and decision-making processes.
    The function receives the data as a dataframe and a max 
    timestamp to separate journeys'''

    # Cache
    if os.path.exists('data/user_journeys.parquet'):
        return pd.read_parquet('data/user_journeys.parquet')

    # start variables id and current journey
    n = 0
    current_id = f'{data.userid[0]}_{n}'

    # base
    users = data.userid.unique()
    user_journey = [tldextract.extract(data.referrerurl.iloc[0])[1]]
    user_journeys = pd.DataFrame(index=range(len(data) // len(users)))

    # check every row
    for idx in range(len(user_journeys.index) - 1):
        row = data.iloc[idx]
        if data.domain.iloc[idx] == 'tripadvisor':
            user_journey.append(data.domain.iloc[idx])
            user_journeys[current_id] = pd.Series(user_journey)

            # start another user journey for new user
            user_journey = []
            n += 1
            current_id = f'{data.userid[idx]}_{n}'
            

        elif data.userid[idx] != current_id[:36]:
            # start another user journey for new user
            user_journey = []
            
            # new id
            n = 0
            current_id = f'{data.userid[idx]}_{n}'

        
        elif (data.eventtimestamp.iloc[idx + 1] \
              - data.eventtimestamp.iloc[idx]) > max_timestamp:
            # start another user journey for new user
            user_journey = []
            current_id = f'{data.userid[idx]}_{n}'
            
        
        # append the url in the same journey
        user_journey.append(data.domain[idx])

    
    # drop empty rows and save the file
    user_journeys = user_journeys.dropna(how='all')
    user_journeys.to_parquet('data/user_journeys.parquet')
    
    return user_journeys

# Data Import

In [17]:
# prepare and import data.
data = pd.DataFrame()
for n in range(1):
    file = f'data_{n}.parquet'
    new_data = get_clean_data(file)
    data = pd.concat([data, new_data])

In [33]:
# create all the user_journeys
user_journeys = get_user_journeys(data)
user_journeys

Unnamed: 0,0caeb7a6-1496-4ede-9483-4b685881478f_0,0caeb7a6-1496-4ede-9483-4b685881478f_1,0caeb7a6-1496-4ede-9483-4b685881478f_2,0caeb7a6-1496-4ede-9483-4b685881478f_3,0caeb7a6-1496-4ede-9483-4b685881478f_4,0caeb7a6-1496-4ede-9483-4b685881478f_5,0caeb7a6-1496-4ede-9483-4b685881478f_6,0caeb7a6-1496-4ede-9483-4b685881478f_7,0caeb7a6-1496-4ede-9483-4b685881478f_8,0caeb7a6-1496-4ede-9483-4b685881478f_9
0,wikipedia,google,google,ralphs,overdrive,amazon,amazon,google,doccafe,friendsofcc
1,apple,google,johnnys-shop,amazon,tripadvisor,tripadvisor,tripadvisor,tripadvisor,tripadvisor,tripadvisor
2,myreadingmanga,google,google,google,,,,,,
3,arigatomina,arigatomina,id,pressedjuicery,,,,,,
4,mediafire,google,google,google,,,,,,
...,...,...,...,...,...,...,...,...,...,...
160,,,,redfin,,,,,,
161,,,,shakeys,,,,,,
162,,,,amazon,,,,,,
163,,,,archiveofourown,,,,,,


# Statistical Analysis

## Descriptive Analysis

In [34]:
user_journeys.describe()

Unnamed: 0,0caeb7a6-1496-4ede-9483-4b685881478f_0,0caeb7a6-1496-4ede-9483-4b685881478f_1,0caeb7a6-1496-4ede-9483-4b685881478f_2,0caeb7a6-1496-4ede-9483-4b685881478f_3,0caeb7a6-1496-4ede-9483-4b685881478f_4,0caeb7a6-1496-4ede-9483-4b685881478f_5,0caeb7a6-1496-4ede-9483-4b685881478f_6,0caeb7a6-1496-4ede-9483-4b685881478f_7,0caeb7a6-1496-4ede-9483-4b685881478f_8,0caeb7a6-1496-4ede-9483-4b685881478f_9
count,13,20,9,165,2,2,2,2,2,2
unique,8,9,6,47,2,2,2,2,2,2
top,myreadingmanga,google,google,google,overdrive,amazon,amazon,google,doccafe,friendsofcc
freq,5,7,4,23,1,1,1,1,1,1


In [None]:
# add most common journey to trip advisor here

# Visualization

In [36]:
# add histogram here

In [None]:
# add bubble for browser history here

# Conclussions