In [41]:
import pandas as pd
from custom_helpers_py.get_paths import get_out_folder
import os.path

# Folder that receives the analysis CSV files written by this notebook;
# its location is resolved by the project helper get_out_folder.
ANALYSIS_FOLDER_PATH = get_out_folder("analysis")

In [42]:
# Load both source extracts into DataFrames.
ph_extract_dir = get_out_folder("source_extracts")
ph_extract_filepath = os.path.join(ph_extract_dir, "ph_extract.csv")
ph_df = pd.read_csv(ph_extract_filepath, encoding="utf-8")

aift_extract_dir = get_out_folder("source_extracts")
aift_extract_filepath = os.path.join(aift_extract_dir, "aift_extract.csv")
aift_df = pd.read_csv(aift_extract_filepath, encoding="utf-8")

In [43]:
# Build the de-duplicated table of product URLs from both extracts.
master_df = pd.concat([ph_df, aift_df], axis=0)
urls_df = (
    master_df[["product_url"]]
    .rename(columns={"product_url": "url"})
    .drop_duplicates(subset="url")
    .reset_index(drop=True)
)
urls_df

Unnamed: 0,url
0,delighted.com
1,toggl.com
2,sketch.com
3,typeform.com
4,gumroad.com
...,...
92227,chat.openai.com/g/g-casevan8k-blog-writer
92228,apps.apple.com/us/app/sereneai-guided-sleep-au...
92229,meeko.ai
92230,ytube.ai


In [44]:
# Extract each URL's leading segment (the text before the first "/") and group URLs that share it
def get_start_string(in_str: str) -> str:
    """Return the part of ``in_str`` that precedes its first "/".

    A string that contains no "/" is returned unchanged; a string that
    starts with "/" yields the empty string.
    """
    leading_part, _separator, _remainder = in_str.partition("/")
    return leading_part

# Tag every URL with its leading segment. The column is added to urls_df
# itself (not to a copy) so that code reading urls_df afterwards sees it.
urls_df["start_string"] = urls_df["url"].apply(get_start_string)

# Number of URLs sharing each start string, broadcast back onto every row.
urls_per_start_string = urls_df.groupby("start_string")["start_string"].transform("size")

# One row per start string with its URL count, least frequent first.
start_string_duplicate_counts = (
    urls_df.assign(duplicate_count=urls_per_start_string)[["start_string", "duplicate_count"]]
    .drop_duplicates(subset="start_string")
    .sort_values(by="duplicate_count")
)

# All URLs ordered so that rows sharing a start string sit next to each other.
# A stable sort keeps the original relative order inside each group, which is
# the same ordering as concatenating the groupby groups one by one, but it
# avoids building one DataFrame per group and also works on an empty frame.
start_string_grouped = urls_df.sort_values("start_string", kind="stable")

# Save to csv
start_string_duplicate_counts_csv_filepath = os.path.join(ANALYSIS_FOLDER_PATH, "start_string_duplicate_counts.csv")
start_string_duplicate_counts.to_csv(start_string_duplicate_counts_csv_filepath, encoding="utf-8", index=False)

start_string_grouped_csv_filepath = os.path.join(ANALYSIS_FOLDER_PATH, "start_string_grouped.csv")
start_string_grouped.to_csv(start_string_grouped_csv_filepath, encoding="utf-8", index=False)

start_string_grouped

Unnamed: 0,url,start_string
55191,008agent.ai,008agent.ai
22223,01100010.wtf,01100010.wtf
52374,01supply.com,01supply.com
53441,04-x.com,04-x.com
55143,099.supply,099.supply
...,...,...
53392,zyrahost.com,zyrahost.com
60410,zyro.com,zyro.com
28731,zyrouge.github.io/symphony,zyrouge.github.io
39608,zytalinfo.com/service/seo,zytalinfo.com


In [52]:
# Extract the path segments of each URL, count the hyphen-separated tokens in every segment, keep multi-token segments and sort by that count

# Get path parameters
def get_path_params(in_url: str) -> list[str]:
    """Split ``in_url`` on "/" and return every piece after the first one.

    A URL without any "/" gives an empty list; a trailing "/" contributes an
    empty string as the last piece.
    """
    _leading_part, *path_params = in_url.split("/")
    return path_params

# Attach the list of path segments to a copy of the URL table.
urls_with_segments = urls_df.assign(path_params_list=urls_df["url"].apply(get_path_params))

# Keep only URLs that have at least one segment, then put each segment on its own row.
has_segments = urls_with_segments["path_params_list"].str.len() != 0
segment_rows = urls_with_segments[has_segments].explode("path_params_list")

# Count the hyphen-separated tokens of every segment.
hyphen_token_counts = segment_rows["path_params_list"].str.split("-").str.len()

# Keep segments made of more than one token, fewest tokens first.
path_params_df = (
    segment_rows.assign(len_path_params=hyphen_token_counts)
    .loc[lambda frame: frame["len_path_params"] > 1]
    .sort_values("len_path_params")
)

path_params_csv_filepath = os.path.join(ANALYSIS_FOLDER_PATH, "path_params.csv")
path_params_df.to_csv(path_params_csv_filepath, encoding="utf-8", index=False)

path_params_df

Unnamed: 0,url,start_string,path_params_list,len_path_params
63,buy.garmin.com/en-us/us/p/643260,buy.garmin.com,en-us,2
61078,github.com/carson-katri/dream-textures,github.com,dream-textures,2
61089,glasp.co/ai-writing,glasp.co,ai-writing,2
28554,winni.in/anniversary-gifts,winni.in,anniversary-gifts,2
28551,mailchi.mp/915b8037e67d/free-content,mailchi.mp,free-content,2
...,...,...,...,...
6171,carplaysmartbox.com/products/app-download-ai-b...,carplaysmartbox.com,app-download-ai-box-wireless-carplay-adapter-s...,25
7618,veefixindia.com/product/high-precision-pulse-h...,veefixindia.com,high-precision-pulse-heat-acf-big-cof-fog-flex...,28
55859,partsam.com/collections/hot-sale-1/products/pa...,partsam.com,partsam-12v-waterproof-square-led-trailer-ligh...,31
16975,dishapublication.com/disha-120-jee-main-online...,dishapublication.com,disha-120-jee-main-online-2022-2012-offline-20...,32
