In [None]:
# Third-party and standard-library packages used by the notebook.
import streamlit as st
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Snowpark entry point: grab the session that is already active for this notebook.
from snowflake.snowpark.context import get_active_session
session = get_active_session()
import warnings
# NOTE: this silences *every* warning for the rest of the kernel session,
# including pandas warnings that would otherwise surface from the cells below.
warnings.filterwarnings("ignore")

In [None]:
def remove_double_quotes_from_object_columns(df):
    """Strip every double-quote character from the string values of object columns.

    Only values that are actual ``str`` instances are rewritten. Any other
    value stored in an object column (numbers, ``None``, NaN, lists, ...) is
    kept exactly as it is; going through the ``.str`` accessor instead would
    silently turn such values into NaN, or raise when a column holds no
    strings at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining / re-assignment.
    """
    object_columns = df.select_dtypes(include='object').columns
    for column_name in object_columns:
        column = df[column_name]
        # Build the cleaned values explicitly as an object Series so pandas
        # does not re-infer (and change) the dtype of the column.
        df[column_name] = pd.Series(
            [value.replace('"', '') if isinstance(value, str) else value for value in column],
            index=column.index,
            dtype=object,
        )
    return df

In [None]:
def view_duration_validation(data):
    """Print how many rows have a VIEW_DURATION of 3600 or more.

    A report line (count and share of all rows, in percent) is printed only
    when at least one such row exists. Nothing is returned.
    """
    print('##### Starting View Duration Validation ######')
    row_total = len(data)
    long_views = int((data['VIEW_DURATION'] >= 3600).sum())
    if long_views:
        share = round((long_views / row_total) * 100, 2)
        print(f"   Number of records with >= 3600: {long_views} represents the {share}%")
    print('##### Ending View Duration Validation ######')

In [None]:
def content_sections_validation(data):
    """Print counts of suspicious content-section values.

    CONTENT_SECTIONS_TOTAL == 0 is counted over all rows. Every other check
    is counted only over rows with VIEW_DURATION > 0. All percentages are
    relative to the total number of rows in ``data``. A line is printed only
    for checks with at least one hit. Nothing is returned.
    """
    print('##### Starting Content Sections Validation ######')
    row_total = len(data)

    def share(count):
        # Percentage of all input rows, rounded to two decimals.
        return round((count / row_total) * 100, 2)

    zero_total = int((data['CONTENT_SECTIONS_TOTAL'] == 0).sum())

    # The remaining checks only look at rows that were actually viewed.
    viewed_rows = data.loc[data['VIEW_DURATION'] > 0]
    sections_viewed = viewed_rows['CONTENT_SECTIONS_VIEWED']
    viewed_percent = viewed_rows['CONTENT_SECTIONS_VIEWED_PERCENT']

    zero_viewed = int((sections_viewed == 0).sum())
    zero_percent = int((viewed_percent == 0).sum())
    viewed_over_total = int((sections_viewed > viewed_rows['CONTENT_SECTIONS_TOTAL']).sum())
    percent_over_one = int((viewed_percent > 1).sum())
    percent_negative = int((viewed_percent < 0).sum())

    if zero_total:
        print(f'   content_sections                = 0: {zero_total} represents the {share(zero_total)}%')
    if zero_viewed:
        print(f'   content_sections_viewed         = 0: {zero_viewed} represents the {share(zero_viewed)}%')
    if zero_percent:
        print(f'   content_sections_viewed_percent = 0: {zero_percent} represents the {share(zero_percent)}%')
    if viewed_over_total:
        print(f'   content_sections_viewed > content_sections_total = {viewed_over_total} represents the {share(viewed_over_total)}%')
    if percent_over_one:
        print(f'   content_sections_viewed_percent > 1 = {percent_over_one} represents the {share(percent_over_one)}%')
    if percent_negative:
        print(f'   content_sections_viewed_percent < 0 = {percent_negative} represents the {share(percent_negative)}%')

    print('##### Ending Content Sections Validation ######')

In [None]:
def scroll_validation(data):
    """Print counts of out-of-range SCROLL_DEPTH and SCROLL_PERCENTAGE values.

    Checks: depth below 300, depth above 1,000,000, percentage equal to 0,
    percentage below 0 and percentage above 1. Each check prints its count
    and share of all rows only when it has at least one hit. Nothing is
    returned.
    """
    print('##### Starting Scroll Validation ######')
    row_total = len(data)
    depth = data['SCROLL_DEPTH']
    percentage = data['SCROLL_PERCENTAGE']

    def share(count):
        # Percentage of all input rows, rounded to two decimals.
        return round((count / row_total) * 100, 2)

    shallow = int((depth < 300).sum())
    very_deep = int((depth > 1000000).sum())
    pct_zero = int((percentage == 0).sum())
    pct_negative = int((percentage < 0).sum())
    pct_over_one = int((percentage > 1).sum())

    if shallow:
        print(f'   scroll depth < 300     = {shallow} represents the {share(shallow)}%')
    if very_deep:
        print(f'   scroll depth > 1M      = {very_deep} represents the {share(very_deep)}%')
    if pct_zero:
        print(f'   scroll percentage == 0 = {pct_zero} represents the {share(pct_zero)}%')
    if pct_negative:
        print(f'   scroll percentage < 0  = {pct_negative} represents the {share(pct_negative)}%')
    if pct_over_one:
        print(f'   scroll percentage > 1  = {pct_over_one} represents the {share(pct_over_one)}%')
    # TODO: the scroll_percentage_article column is not validated yet.
    print('##### Ending Scroll Validation ######')

In [None]:
def height_validation(data):
    """Print counts of VIEW_HEIGHT values below 300 or above 2,000,000.

    Each check prints its count and share of all rows only when it has at
    least one hit. Nothing is returned.
    """
    print('##### Starting View Heigh Validation ######')
    row_total = len(data)
    heights = data['VIEW_HEIGHT']
    too_small = int((heights < 300).sum())
    too_large = int((heights > 2000000).sum())
    if too_small:
        small_share = round((too_small / row_total) * 100, 2)
        print(f'   view heigth < 300     = {too_small} represents the {small_share}%')
    if too_large:
        large_share = round((too_large / row_total) * 100, 2)
        print(f'   view heigth > 2M     = {too_large} represents the {large_share}%')
    print('##### Ending View Heigh Validation ######')

In [None]:
def view_count_validation(data, view_threshold):
    """Print counts of non-positive and above-threshold VIEW_COUNT values.

    Rows with VIEW_COUNT > ``view_threshold`` are additionally split by
    whether VIEW_URL contains ``utm_source=admedia``. Lines are printed only
    for checks with at least one hit. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the VIEW_COUNT and VIEW_URL columns.
    view_threshold : number
        Rows whose VIEW_COUNT is strictly greater than this are reported.
    """
    print('##### Starting View Count Validation ######')
    row_total = len(data)
    view_counts = data['VIEW_COUNT']
    non_positive = int((view_counts <= 0).sum())

    above_threshold = data.loc[view_counts > view_threshold]
    above_total = len(above_threshold)
    # Evaluated even when nothing exceeds the threshold, so a VIEW_URL column
    # that cannot be searched as text fails here regardless.
    admedia = above_threshold['VIEW_URL'].str.contains('utm_source=admedia').sum()

    if non_positive > 0:
        print(f'   view count <= 0     = {non_positive} represents the {round((non_positive/row_total)*100,2)}%')
    if above_total > 0:
        print(f'   Higher than threshold at 95% {view_threshold} = {above_total} represents the {round((above_total/row_total)*100,2)}%')
        # AdMedia share is relative to the above-threshold rows ...
        print(f'   AdMedia records     = {admedia} represents the {round((admedia/above_total)*100,2)}%')
        # ... while the non-AdMedia share is relative to all rows.
        print(f'   Non AdMedia records = {above_total- admedia} overall represents the {round(((above_total-admedia)/row_total)*100,2)}%')

    print('##### Ending View Count Validation ######')

In [None]:
def words_validation(data):
    """Print null and zero counts for the word metrics of viewed articles.

    Only rows whose VIEW_NAME is 'Article' or 'List Article' and whose
    VIEW_DURATION is greater than 0 are checked: for those rows the word
    counters are expected to be populated and non-zero. Nulls are counted on
    the raw values; zeros are counted after casting the three word columns to
    float64. Zero-count percentages are relative to the number of checked
    rows. Lines are printed only for checks with at least one hit.

    The input frame is never modified and nothing is returned.
    """
    print('##### Starting words Validation ######')
    view_names = ['Article', 'List Article'] #,  'Car Model', 'Car Model : Year', 'Car Specs', 'Staff Profile']
    word_columns = ['CONTENT_WORD_COUNT', 'CONTENT_WORDS_VIEWED', 'CONTENT_WORDS_PER_MINUTE']

    # if view_name is any of the sections where we are counting words and view_duration > 0
    # then the word counters cannot be 0
    in_scope = data['VIEW_NAME'].isin(view_names) & (data['VIEW_DURATION'] > 0)
    raw_words = data.loc[in_scope, word_columns]

    # identify the null values before casting
    null_counts = raw_words.isna().sum()

    # astype returns a new frame, so no column is ever assigned on a slice of
    # `data` (which is what used to trigger SettingWithCopyWarning).
    base_df = raw_words.astype('float64')
    base_count = len(base_df)
    zero_counts = (base_df == 0).sum()

    wc_null, wv_null, wpm_null = (int(null_counts[name]) for name in word_columns)
    wc_0, wv_0, wpm_0 = (int(zero_counts[name]) for name in word_columns)

    if wc_null > 0:
        print(f'   Word Count        == null : {wc_null}')
    if wc_0 > 0:
        print(f'   Word Count       == 0 : {wc_0} represents the {round((wc_0/base_count)*100,2)}%')
    if wv_null > 0:
        print(f'   Word viewed       == null : {wv_null}')
    if wv_0 > 0:
        print(f'   Words Viewed     == 0 : {wv_0} represents the {round((wv_0/base_count)*100,2)}%')
    if wpm_null > 0:
        print(f'   Word per minute   == null : {wpm_null}')
    if wpm_0 > 0:
        print(f'   Words Per Minute == 0 : {wpm_0} represents the {round((wpm_0/base_count)*100,2)}%')
    print('##### Ending words Validation ######')


In [None]:
def check_duplicate_view_counts(df):
    """Report sessions in which the same VIEW_COUNT value occurs more than once.

    Rows are grouped by (SESSION_ID, VIEW_COUNT); a session is affected when
    at least one of its pairs occurs more than once. When any session is
    affected, three lines are printed: the number of distinct sessions, the
    number of affected sessions and the affected share in percent (two
    decimals). Nothing is printed otherwise, and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the SESSION_ID and VIEW_COUNT columns.
    """
    # Size of every (session, view_count) pair, computed in one vectorised pass
    # instead of building a Python list/set per session.
    pair_sizes = df.groupby(['SESSION_ID', 'VIEW_COUNT']).size()
    repeated_pairs = pair_sizes[pair_sizes > 1]

    total_sessions = int(df['SESSION_ID'].nunique())
    # Level 0 of the index is SESSION_ID; a session may repeat several values,
    # so count distinct sessions rather than repeated pairs.
    sessions_with_dupes = int(repeated_pairs.index.get_level_values(0).nunique())

    if sessions_with_dupes > 0:
        print(f'total session_id/view_count : {total_sessions}')
        print(f'sessions_with_duplicates : {sessions_with_dupes}')
        print(f'percentage_affected : {round((sessions_with_dupes / total_sessions) * 100, 2)}')

In [None]:
-- User-engagement-summary events to validate: Production rows for the NextWeb
-- app whose eventdate is yesterday (CURRENT_DATE - 1).
-- Each quoted "ea ..." source column is aliased to a plain identifier; the
-- Python cells below read them as upper-case pandas columns (e.g. VIEW_COUNT).
-- NOTE(review): a later cell reads `ues_query`; presumably that is the name of
-- this SQL cell — confirm.
SELECT 
    "ea content_sections_total" as content_sections_total,
    "ea content_sections_viewed" as content_sections_viewed,
    "ea content_sections_viewed_percent" as content_sections_viewed_percent,
    "ea device_family" as device_family,
    "ea device_screen_area" as device_screen_area,
    "ea device_screen_height" as device_screen_height,
    "ea device_screen_width" as device_screen_width,
    "ea device_user_agent" as device_user_agent,
    "ea device_user_agent_bot" as device_user_agent_bot,
    "ea scroll_depth" as scroll_depth,
    "ea scroll_percentage" as scroll_percentage,
    -- Abbreviated alias: the dtype-cast cell refers to it as SCROLL_PERCE_ARTICLE,
    -- so keep both names in sync.
    "ea scroll_percentage_article" as scroll_perce_article,
    "ea view_count" as view_count,
    "ea view_duration" as view_duration,
    "ea view_height" as view_height,
    "ea view_name" as view_name,
    "ea content_words_viewed" as content_words_viewed,  
    "ea content_word_count" as content_word_count,
    "ea content_words_per_minute" as content_words_per_minute,
    "ea view_url" as view_url,
    -- SESSION_ID (from "ea session_id") is the key used by the duplicate
    -- view-count check; SESSIONID is selected in addition to it.
    "ea session_id" as session_id,
    SESSIONID
-- Debugging alternative to the column list above:
--select top 10 *
FROM  PROD_DB_RAW.MPARTICLE.MP_VW_EVENT_OTHER_USERENGAGEMENTSUMMARY
WHERE appenvironment = 'Production'
AND   eventdate = CURRENT_DATE - 1
AND   "ea app_name" = 'NextWeb';

In [None]:
# Materialise the query result as a pandas DataFrame.
# NOTE(review): `ues_query` is not defined in any visible Python cell; presumably
# it is the result handle of the SQL cell above — confirm the cell name.
ues = ues_query.to_pandas()
# Show (rows, columns) as the cell output.
ues.shape

In [None]:
# Drop double-quote characters from the object (text) columns before the dtype casts below.
ues = remove_double_quotes_from_object_columns(ues)

In [None]:
# Cast the metric columns to numeric dtypes, one column at a time.
# Each entry is (column, target dtype, value used to fill nulls before the cast);
# a fill value of None means the column is cast as-is.
ues_cast_plan = [
    ('VIEW_COUNT', 'int64', None),
    ('VIEW_DURATION', 'int64', None),
    ('DEVICE_SCREEN_WIDTH', 'int64', None),
    ('DEVICE_SCREEN_HEIGHT', 'int64', None),
    ('DEVICE_SCREEN_AREA', 'int64', None),
    ('CONTENT_SECTIONS_VIEWED', 'int64', None),
    # Nulls in the section total become -1 so the column can be stored as int64.
    ('CONTENT_SECTIONS_TOTAL', 'int64', -1),
    ('CONTENT_SECTIONS_VIEWED_PERCENT', 'float64', None),
    ('SCROLL_DEPTH', 'int64', None),
    ('SCROLL_PERCENTAGE', 'float64', None),
    ('SCROLL_PERCE_ARTICLE', 'float64', None),
    ('VIEW_HEIGHT', 'int64', None),
]
for cast_column, cast_dtype, cast_null_fill in ues_cast_plan:
    cast_values = ues[cast_column]
    if cast_null_fill is not None:
        cast_values = cast_values.replace([None], np.nan).fillna(cast_null_fill)
    ues[cast_column] = cast_values.astype(cast_dtype)


In [None]:
# Step 1: derive outlier cut-offs from percentiles of the loaded data.
# 95th percentile of VIEW_COUNT marks unusually high page views; the 5th and
# 95th percentiles of VIEW_DURATION mark very short and very long views.
view_count_threshold = ues['VIEW_COUNT'].quantile(0.95)
view_duration_low_threshold = ues['VIEW_DURATION'].quantile(0.05)
view_duration_high_threshold = ues['VIEW_DURATION'].quantile(0.95)
print(
    f'view_count_threshold --> {view_count_threshold}',
    f'view_duration_low_threshold --> {view_duration_low_threshold}',
    f'view_duration_high_threshold --> {view_duration_high_threshold}',
    sep='\n',
)

In [None]:
# Report rows whose VIEW_DURATION is 3600 or more.
view_duration_validation(ues)

In [None]:
# Report zero and out-of-range content-section counts and percentages.
content_sections_validation(ues)

In [None]:
# Report out-of-range SCROLL_DEPTH and SCROLL_PERCENTAGE values.
scroll_validation(ues)

In [None]:
# Report VIEW_HEIGHT values below 300 or above 2,000,000.
height_validation(ues)

In [None]:
# Report non-positive view counts and those above the 95th-percentile threshold computed earlier.
view_count_validation(ues, view_count_threshold)

In [None]:
# Report null and zero word metrics for viewed 'Article' / 'List Article' rows.
words_validation(ues)

In [None]:
# Report sessions in which the same VIEW_COUNT value appears more than once.
check_duplicate_view_counts(ues)