In [1]:
# Needed libraries 

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver.common.actions.key_input import KeyInput
from selenium.webdriver.common.actions.pointer_input import PointerInput

# System requirements
import time
import random
import datetime as dt

# Data Management 
import pandas as pd

# Web scraping functions
import src.web_minning as wm

In [2]:
# Run configuration: every tunable value of the notebook lives in this cell.

# --- Search scope ---
CITY = "cdmx"
TAGS_LIST = wm.city_hashtags(CITY)  # hashtags to search, provided by the project helper
MAX_DEPTH = 5  # Iteration loop over each initial video
# TODO: MAX_SCROLLING is not defined yet.

# --- Output locations ---
RAW_DATA = "./data/raw/"
ROOT_FILE = "root_links.csv"

# The day file name embeds the run date (YYYY-MM-DD), so each day of scraping
# writes to (and later reads from) its own file.
_run_date = dt.datetime.today().strftime('%Y-%m-%d')
DAY_FILE = f"day_file_{CITY}_{_run_date}.csv"

# --- Filtering ---
DAYS_FRESHNESS = 300  # maximum age, in days, of the videos kept during cleaning

In [3]:
# Collect the root video links: for every hashtag, open TikTok in a fresh
# browser, run a search and store link / caption / date of each result tile.
results = []

for ht in TAGS_LIST:

    # Start from None so the cleanup in `finally` never touches an undefined
    # (or previous-iteration) driver when Chrome itself fails to launch.
    driver = None

    try:
        # Invoke driver object
        driver = webdriver.Chrome()
        wait = WebDriverWait(driver, 15)
        actions = ActionChains(driver)

        # Open main page
        driver.get("https://www.tiktok.com")

        # Look for the search box (waits up to 15 s for it to be present)
        search_box = wait.until(EC.presence_of_element_located((
            By.XPATH, "//*[@role='searchbox']"
        )))

        # Put the mouse in the search bar
        search_box.click()

        # Type the hashtag, with a short random pause before sending it
        actions.send_keys(ht)
        time.sleep(random.uniform(0.05, 0.15))
        actions.perform()

        # Press Enter to launch the search
        actions.send_keys("\n")
        actions.perform()

        time.sleep(random.uniform(1.5, 4.5))

        # Getting all immediate results.
        # NOTE(review): the two `wm` helpers presumably scroll the page so that
        # every result tile gets rendered -- confirm in src/web_minning.
        items = wm.scroll_until_no_new_items(driver)
        wm.scroll_through_elements(driver, items)
        # Re-query the tiles after scrolling so the final list is used.
        items = driver.find_elements(By.CSS_SELECTOR, "div[id^='grid-item-container-']")

        # Get data from results
        for container in items:
            try:
                href = container.find_element(
                    By.CSS_SELECTOR, "a[href*='/video/']"
                ).get_attribute("href")

                description = container.find_element(
                    By.CSS_SELECTOR, "[data-e2e='new-desc-span']"
                ).text.strip()

                date = container.find_element(
                    By.CSS_SELECTOR, "div[class*='DivTimeTag']"
                ).text.strip()

                results.append({
                    "href": href,
                    "description": description,
                    "date": date,
                    "hashtag_label": ht
                })

            except Exception:
                # Deliberate best-effort: a tile missing any of the three
                # elements is skipped instead of aborting the whole hashtag.
                continue

        print(f"Links search for {ht} ended.")

    except Exception as exc:
        # Report the failure instead of hiding it, then go on with the next
        # hashtag. KeyboardInterrupt is no longer swallowed.
        print(f"Links search for {ht} failed: {exc!r}")

    finally:
        # Single cleanup path: close the browser whether the search succeeded
        # or failed.
        if driver is not None:
            driver.quit()



Links search for #dondeircdmx ended.
Links search for #quehacercdmx ended.
Links search for #lugarescdmx ended.
Links search for #turismocdmx ended.
Links search for #lugaresbonitos #cdmx ended.
Links search for #lugaressecretos #cdmx ended.
Links search for #lugaresimperdibles #cdmx ended.
Links search for #lugaresrecomendados #cdmx ended.
Links search for #experiencias #cdmx ended.
Links search for #experienciasunicas #cdmx ended.
Links search for #lugaresbonitoscdmx ended.
Links search for #lugaressecretoscdmx ended.
Links search for #findesemana #cdmx ended.
Links search for #findesemanacdmx ended.
Links search for #planesdefin #cdmx ended.
Links search for #planperfecto  #cdmx ended.
Links search for #escapadacdmx ended.
Links search for #brunch #cdmx ended.
Links search for #cafes #cdmx ended.
Links search for #postres #cdmx ended.
Links search for #desayunos #cdmx ended.
Links search for #restaurantes #cdmx ended.
Links search for #brunchcdmx ended.
Links search for #cafescdmx e

In [4]:
# Build the root-link table; every row found through hashtag search is level 0.
df_raw = pd.DataFrame(results).assign(level=0)

In [5]:
# Persist the root links (without the index) so later sections can reload them.
df_raw.to_csv(f"{RAW_DATA}{ROOT_FILE}", index=False)

In [None]:
# Visit every unique root video, read its long description and date, and try
# to geolocate the place it talks about.

# Keep each link paired with ITS OWN caption: de-duplicate rows by href and
# take both columns from the same rows. De-duplicating the two columns
# independently yields lists of different length and order, so zip() would
# pair links with unrelated captions and drop the tail.
root_videos = df_raw.drop_duplicates(subset="href")
root_list = root_videos.href.to_list()
caption_list = root_videos.description.fillna("").to_list()

# Disclaimer appended to AI-generated descriptions; removed from the text.
IA_DISCLAIMER = "Esta información se generó por IA y puede presentar resultados que no son relevantes. No representa las opiniones o consejos de TikTok. Si tienes alguna duda, envíanosla a través de: Comentarios y ayuda: TikTok"


def _pick_address_label(text):
    """Return the best address/place label found in `text`, or '' if none.

    Tries `wm.extract_addresses` first (top match with score >= 2.5) and falls
    back to the first result of `wm.detect_places`. Each helper is called once.
    """
    if not isinstance(text, str) or text == "":
        return ""

    exact_matches = wm.extract_addresses(text, top_k=1, min_score=2.5)
    if len(exact_matches) > 0:
        return exact_matches[0].text

    places = wm.detect_places(text)
    if len(places) > 0:
        return places[0].text

    return ""


desc_results = []

for h, d in zip(root_list, caption_list):

    # None until Chrome starts, so `finally` can tell whether there is a
    # browser to close.
    driver = None

    try:

        driver = webdriver.Chrome()
        driver.get(h)

        time.sleep(random.uniform(2.5,3.5))

        # Expand the description. If the button is missing the lookup raises
        # and the video is skipped (reported by the outer handler).
        expand_button = driver.find_element(
            By.CSS_SELECTOR,
            "button[class*='ButtonExpand']"
        )

        expand_button.click()

        time.sleep(random.uniform(2.5,3.5))

        # Date extraction (best-effort: empty string when not found)
        try:
            raw_date = driver.find_element(
                By.CSS_SELECTOR,
                "span[class*='StyledTUXText']"
            ).text
            # Strip the middle-dot separator and the spaces around the date.
            date_text = raw_date.replace("\u00b7","").replace(" ","")
        except Exception:
            date_text = ""

        time.sleep(random.uniform(2.5,3.5))

        # Description extraction (best-effort: empty string when not found)
        try:
            raw_desc = driver.find_element(
                By.CSS_SELECTOR,
                "div[class*='DivCustomTDKContainer']"
            ).text
            desc_text = raw_desc.replace(IA_DISCLAIMER,"")
        except Exception:
            desc_text = ""

        time.sleep(random.uniform(0.5,1.5))

        # Get locations. Defaults describe "nothing found".
        ind_found = False
        lat = 0.0
        lon = 0.0
        osm_name_label = ""

        try:
            # The caption is the preferred source; the long description is
            # only used when the caption is empty.
            add_label = _pick_address_label(d if d != '' else desc_text)

            # Call the geocoding API only if we have a label
            if add_label != "":
                loc_osm_api = wm.geocode_osm(add_label)

                if loc_osm_api["found"]:
                    ind_found = True
                    lat = loc_osm_api["lat"]
                    lon = loc_osm_api["lon"]
                    osm_name_label = loc_osm_api["display_name"]

            # else:
            ### [COMPLETE WITH SPEECH TO TEXT OR VIDEO TO TEXT]

            print(f"Geolocation Process ended for {h}")

        except Exception:
            # Any failure in label extraction or geocoding resets the fields
            # to the "nothing found" values; the video is still recorded.
            ind_found = False
            lat = 0.0
            lon = 0.0
            osm_name_label = ""
            print(f"Geolocation Process Failed for {h}")

        # Update results
        desc_results.append({
            "href": h,
            "long_description": desc_text,
            "date_long": date_text,
            "ind_found": ind_found,
            "lat": lat,
            "lon": lon,
            "osm_name_label": osm_name_label
        })

        time.sleep(random.uniform(0.5,1.5))

    except Exception as exc:
        # The page could not be processed (browser start, navigation, missing
        # expand button...): report it and continue with the next video.
        print(f"Detail extraction skipped for {h}: {exc!r}")

    finally:
        # Single cleanup path for both success and failure.
        if driver is not None:
            driver.quit()

Geolocation Process ended for https://www.tiktok.com/@rutadeviajemx/video/7577239195727465736
Geolocation Process ended for https://www.tiktok.com/@soymiauoficial/video/7416159212868259078
Geolocation Process ended for https://www.tiktok.com/@moniksmr/video/7545214454246444306
Geolocation Process ended for https://www.tiktok.com/@falvindates/video/7589317902264700180
Geolocation Process ended for https://www.tiktok.com/@nylagame/video/7591132225698090260
Geolocation Process ended for https://www.tiktok.com/@elhilonegro/video/7557493636921888007
Geolocation Process ended for https://www.tiktok.com/@cocodidany/video/7590571924519013653
Geolocation Process ended for https://www.tiktok.com/@undiacondya/video/7515511709621079314
Geolocation Process Failed for https://www.tiktok.com/@moniksmr/video/7572281843647630610
Geolocation Process ended for https://www.tiktok.com/@undiacondya/video/7535855035767459079
Geolocation Process ended for https://www.tiktok.com/@carlos_arellanes/video/7543442

In [None]:
# One row per visited video: long description, date and geolocation fields.
df_raw_des = pd.DataFrame(data=desc_results)

In [None]:
# Persist today's detail table (without the index) next to the root links.
df_raw_des.to_csv(f"{RAW_DATA}{DAY_FILE}", index=False)

## Date and description cleaning

In [3]:
# Reload both scraped tables from disk so the cleaning section can run
# without repeating the scraping above.
# DAY_FILE embeds the run date, so this reads the file written on the same day.
df_raw = pd.read_csv(f"{RAW_DATA}{ROOT_FILE}")
df_raw_des = pd.read_csv(f"{RAW_DATA}{DAY_FILE}")

In [4]:
# Attach the per-video details to every root link (left join on the link),
# then keep only the rows for which a long-form date was scraped.
df = pd.merge(df_raw, df_raw_des, on='href', how='left')
df = df.dropna(subset=['date_long'])

In [5]:
# Get full description 
df = df.query(""" long_description != '' """)

# Get data 
df.date_long = pd.to_datetime(df.date_long.apply(lambda x: 
                   pd.to_datetime(dt.datetime.today()).strftime('%Y-%m-%d') if 'Hace' in x else x)
                , format='%Y-%m-%d')

date_limit = pd.to_datetime(
    dt.datetime.today()- dt.timedelta(days=DAYS_FRESHNESS)
    ).strftime('%Y-%m-%d')


df = df.query(f""" date_long > '{date_limit}'""")

df = df[~(df.description.isna() & df.long_description.isna())]

df['description'] = df['description'].fillna('')
df['long_description'] = df['long_description'].fillna('')

In [6]:
def get_address(string):
    """Return the best address or place label found in `string`.

    The text is first passed to `wm.extract_addresses` (top match only,
    minimum score 2.5); if that yields nothing, the first result of
    `wm.detect_places` is used. Each extractor is called once per string.

    Parameters
    ----------
    string : str
        Text to analyse (a caption or a long description).

    Returns
    -------
    str
        The `.text` of the selected match, or '' when nothing is found or
        when the input is not a non-blank string (e.g. NaN).
    """
    # Missing values and blank texts cannot contain an address.
    if not isinstance(string, str) or not string.strip():
        return ''

    exact_matches = wm.extract_addresses(string, top_k=1, min_score=2.5)
    if len(exact_matches) > 0:
        return exact_matches[0].text

    places = wm.detect_places(string)
    if len(places) > 0:
        return places[0].text

    return ''

In [7]:
# Extract one address/place label per text column: `add_long` comes from the
# long description, `add_desc` from the short caption.
df["add_long"] = df["long_description"].map(get_address)
df["add_desc"] = df["description"].map(get_address)

In [30]:
# Show the rows for which a label was extracted from the caption.
df.loc[df["add_desc"].ne('')]

Unnamed: 0,href,description,date,hashtag_label,level,long_description,date_long,add_long,add_desc
91,https://www.tiktok.com/@rivardo24/video/752754...,"Visita el Acuario Michin, el acuario m√°s grand...",2025-7-15,#dondeircdmx,0,"Visita el Acuario Michin en CDMX, el m√°s grand...",2025-07-15,"el Acuario Michin en CDMX, el m√°s grande de La...","el Acuario Michin, el acuario m√°s grande de La..."
92,https://www.tiktok.com/@kar_martinezg/video/75...,¬øCu√°nto cuesta ir a Six Flags M√©xico en Christ...,2025-12-26,#dondeircdmx,0,Precios para disfrutar Christmas In The Park e...,2025-12-26,Six Flags M√©xico,Christmas In The Park
94,https://www.tiktok.com/@valpina.mx/video/75173...,Quieres un PLAN DIFERENTE este fin de semana e...,2025-6-18,#dondeircdmx,0,,2025-06-18,,CDMX
95,https://www.tiktok.com/@whattodoinmexicocity/v...,"üéÑ‚ú® Navidad m√°gica en Banj√©rcito, CDMX ‚ú®üéÑ",2025-12-27,#dondeircdmx,0,"Navidad M√°gica en Banj√©rcito, CDMX: Plan Famil...",2025-12-27,"Banj√©rcito, CDMX: Plan Familiar Ideal","Banj√©rcito, CDMX ‚ú®üéÑ"
108,https://www.tiktok.com/@imgabycontre/video/753...,Conoces este lugar? Esta es la Privada Roja en...,2025-7-23,#lugarescdmx,0,,2025-07-23,,CDMX
...,...,...,...,...,...,...,...,...,...
1462,https://www.tiktok.com/@mitikahmall/video/7510...,Restaurantes escondidos en M√≠tikah,2025-5-30,#restaurantescdmx,0,Descubre Restaurantes Escondidos en M√≠tikah CD...,2025-05-30,M√≠tikah CDMX,M√≠tikah
1467,https://www.tiktok.com/@sisomosgemelos/video/7...,Un buen Restaurante Italiano en CDMX üçùüçïüáÆüáπ üìç4 M...,2025-7-4,#restaurantescdmx,0,Descubre 4 Mori: Un Restaurante Italiano en CD...,2025-07-04,CDMX,"üìç4 Mori, 3900 Vasco de Quiroga (Sante Fe)"
1474,https://www.tiktok.com/@camaronbuchonmx/video/...,¬øMariscos buenos en CDMX? ¬°Claro que si! En el...,2025-11-24,#restaurantescdmx,0,Los mejores mariscos en CDMX: Camar√≥n Buch√≥n\n...,2025-11-24,CDMX: Camar√≥n Buch√≥n,CDMX
1476,https://www.tiktok.com/@soyelarturito/video/75...,"Tacos Charly, CDMX! üá≤üáΩüåÆ Para muchos el mejor s...",2025-11-2,#restaurantescdmx,0,Tacos Charly: El Mejor Suadero de CDMX\nDescub...,2025-11-02,"CDMX, donde el suadero se convierte en una exp...",Tacos Charly | CDMX


In [23]:
# Spot check: full caption of row 1467.
df.at[1467, 'description']

'Un buen Restaurante Italiano en CDMX üçùüçïüáÆüáπ üìç4 Mori, 3900 Vasco de Quiroga (Sante Fe)'

In [27]:
# Spot check: up to three address candidates (score >= 2.5) for row 1467's caption.
wm.extract_addresses(df.at[1467, 'description'], min_score=2.5, top_k=3)

[AddressMatch(text='üìç4 Mori, 3900 Vasco de Quiroga (Sante Fe)', span=(42, 83), score=4.0, rule='emoji_soft')]

In [None]:
# Spot check: link of row 1467, to open the video and verify the address by hand.
df.at[1467, 'href']

'https://www.tiktok.com/@ely_guia/video/7587190357058735378'