# process web visits

The web_visits dataset has unneeded columns. The title and description columns are just more informative versions of the URL: when a specific URL is visited, it always has the same endpoint.

In [None]:
import pandas as pd
from IPython.display import display

# Forward slashes keep these relative paths valid on Windows, macOS and Linux;
# the backslash-separated form only resolves on Windows.
train_path = "../data/train/web_visits.csv"
test_path = "../data/test/test_web_visits.csv"
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
# Stack the train and test rows into one frame with a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Splitting "https://<domain>/<category>/<id>" on "/" puts the category at position 3.
df["url_category"] = df["url"].str.split("/").str.get(3)
df["url_category"].unique()

array(['chronic', 'tech', 'heart', 'gaming', 'stress', 'sleep',
       'wellness', 'aerobic', 'diabetes', 'strength', 'hypertension',
       'movies', 'cardio', 'nutrition', 'sports', 'fitness',
       'mindfulness', 'pets', 'cars', 'finance', 'travel', 'recipes',
       'weight'], dtype=object)

In [None]:
def check_consistency(df, col_1, col_2):
    """Report whether each value of ``col_1`` maps to exactly one value of ``col_2``.

    Prints the number of distinct ``col_1`` values and how many of them are
    associated with more than one distinct ``col_2`` value. When such
    inconsistent values exist, displays the (up to) 20 with the most distinct
    ``col_2`` values, followed by a reproducible sample of up to 10 of them
    with their distinct ``col_2`` values listed. When there are none, only
    the summary lines are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col_1 : str
        Name of the grouping (key) column.
    col_2 : str
        Name of the column whose distinct values are counted per key.

    Returns
    -------
    None
        Results are printed / displayed only.
    """
    # Number of distinct col_2 values per col_1 value, largest first.
    distinct_counts = df.groupby(col_1)[col_2].nunique().sort_values(ascending=False)
    inconsistent = distinct_counts[distinct_counts > 1]

    print(f"Total distinct {col_1}s: {distinct_counts.shape[0]}")
    print(f"{col_1.capitalize()}s with >1 distinct {col_2}: {inconsistent.size}")

    if inconsistent.empty:
        print(f"Every {col_1} has a single (identical) {col_2}.")
        # Nothing is inconsistent, so skip the examples table instead of
        # displaying an empty frame.
        return

    # Keys with the most distinct col_2 values.
    display(inconsistent.head(20))

    # For every inconsistent key, list the distinct col_2 values it maps to.
    sample_examples = (
        df[df[col_1].isin(inconsistent.index)]
        .groupby(col_1)[col_2]
        .unique()
        .apply(list)
        .reset_index()
    )
    # Fixed random_state keeps the displayed sample stable across re-runs.
    display(sample_examples.sample(n=min(10, sample_examples.shape[0]), random_state=0))

In [None]:
# Check the mapping in both directions: title -> description, then description -> title.
for key_col, value_col in (("title", "description"), ("description", "title")):
    check_consistency(df, col_1=key_col, col_2=value_col)
    print("   -----------------------   ")

Total distinct titles: 26
Titles with >1 distinct description: 0
Every title has a single (identical) description.


Unnamed: 0,title,description


   -----------------------   
Total distinct descriptions: 26
Descriptions with >1 distinct title: 0
Every description has a single (identical) title.


Unnamed: 0,description,title


   -----------------------   


We can see that title and description map one-to-one: each title always has the same description, and each description always has the same title.
We can drop the description.

In [None]:
# description carries no information beyond title, so remove it.
df = df.drop(columns=["description"])

In [None]:
# How many distinct URLs fall under each url_category?
check_consistency(df, "url_category", "url")
print("   -----------------------   ")


Total distinct url_categorys: 23
Url_categorys with >1 distinct url: 23


url_category
cars            3600
travel          3600
movies          3600
sports          3600
tech            3600
nutrition       3599
sleep           3596
heart           3595
gaming          3491
hypertension    3457
fitness         3449
aerobic         3448
cardio          3448
mindfulness     3442
diabetes        3440
stress          3439
wellness        3438
recipes         3436
strength        3430
weight          3427
Name: url, dtype: int64

Unnamed: 0,url_category,url
11,movies,"[https://portal.site/movies/821, https://porta..."
10,mindfulness,"[https://living.better/mindfulness/758, https:..."
21,weight,"[https://living.better/weight/102, https://liv..."
14,recipes,"[https://guide.wellness/recipes/159, https://g..."
20,travel,"[https://media.hub/travel/490, https://world.n..."
1,cardio,"[https://care.portal/cardio/181, https://guide..."
13,pets,"[https://portal.site/pets/371, https://example..."
22,wellness,"[https://guide.wellness/wellness/814, https://..."
16,sports,"[https://portal.site/sports/267, https://media..."
8,heart,"[https://health.wellco/heart/792, https://guid..."


   -----------------------   


In [None]:
# Does every title belong to a single url_category?
check_consistency(df, "title", "url_category")
print("   -----------------------   ")
# And the reverse: does every url_category carry a single title?
check_consistency(df, "url_category", "title")

Total distinct titles: 26
Titles with >1 distinct url_category: 0
Every title has a single (identical) url_category.


Unnamed: 0,title,url_category


   -----------------------   
Total distinct url_categorys: 23
Url_categorys with >1 distinct title: 3


url_category
sleep        2
nutrition    2
heart        2
Name: title, dtype: int64

Unnamed: 0,url_category,title
2,sleep,"[Restorative sleep tips, Sleep hygiene]"
1,nutrition,"[Mediterranean diet, Cholesterol friendly foods]"
0,heart,"[Hypertension basics, Cardiometabolic health]"


We see that three url_category values (sleep, nutrition, heart) each cover two different titles, so we need to merge title and url_category to keep all the information about the category.

In [None]:
# Combined key "<url_category>_<title>" keeps the information from both columns.
df["category"] = df["url_category"].str.cat(df["title"], sep="_")

In [None]:
# Remove the two source columns and preview the first rows of the result.
df = df.drop(columns=["title", "url_category"])
display(df.head(10))

Unnamed: 0,member_id,url,timestamp,category
0,1,https://health.wellco/chronic/859,02/07/2025 22:38,chronic_Diabetes management
1,1,https://portal.site/tech/328,02/07/2025 11:30,tech_Gadget roundup
2,1,https://health.wellco/heart/792,14/07/2025 00:38,heart_Hypertension basics
3,2,https://example.com/gaming/674,07/07/2025 02:56,gaming_Game reviews
4,2,https://living.better/stress/325,02/07/2025 15:53,stress_Stress reduction
5,2,https://care.portal/sleep/928,02/07/2025 13:41,sleep_Restorative sleep tips
6,2,https://guide.wellness/wellness/814,01/07/2025 00:37,wellness_Healthy eating guide
7,2,https://care.portal/aerobic/384,07/07/2025 14:04,aerobic_Aerobic exercise
8,2,https://guide.wellness/heart/881,04/07/2025 16:29,heart_Hypertension basics
9,2,https://health.wellco/diabetes/194,01/07/2025 13:19,diabetes_HbA1c targets


When training the model, we will try both with and without the full URL as input, and possibly with more feature splitting.

In [None]:
# Splitting "https://<domain>/<category>/<id>" on "/" puts the domain at position 2.
df["domain"] = df["url"].str.split("/").str.get(2)
df["domain"].unique()

array(['health.wellco', 'portal.site', 'example.com', 'living.better',
       'care.portal', 'guide.wellness', 'world.news', 'media.hub'],
      dtype=object)

In [None]:
def get_web_feats(web_path):
    web = pd.read_csv(web_path)
    web["url_category"] = web["url"].str.split("/").str[3]
    web["domain"] = web["url"].str.split("/").str[2]
    web.drop(columns=["description"], inplace=True)
    web["category"]= web["url_category"] + "_" + web["title"]
    web.drop(columns=["title", "url_category"], inplace=True)
    web.