In [1]:
import os
import pickle
import importlib

import utils

utils = importlib.reload(utils)

from utils import SearcherDriver, build_url_search, parse_text_number, clean_text

### Constants

In [2]:
# Pickle file used by save_cookie / load_cookie to persist the session cookies.
TIKTOK_COOKIE = 'tiktok_cookie.pkl'

# Path passed to SearcherDriver as its `file` argument.
FILE = './data/tiktok_violencia_guayaquil.csv'

# Search endpoint and the terms handed to build_url_search.
URL_BASE = 'https://www.tiktok.com/search'
KEYWORDS = [
  'robo', 'secuestro', 'inseguridad', 'extorsion',
  'guayaquil', 'guayas', 'ecuador',
]

# URL opened by the scraping functions at the start of a session.
url_search = build_url_search(URL_BASE, KEYWORDS)

### Scraping functions

In [3]:
def save_cookie(ds: SearcherDriver, path: str):
  """Pickle the cookies of the driver's current session into `path`.

  Args:
    ds: Wrapper whose `driver` attribute exposes `get_cookies()`.
    path: Destination file, written in binary mode (overwritten if present).
  """
  # Fetch the cookies BEFORE opening the file: opening with 'wb' truncates it,
  # so a failing driver call would otherwise leave an empty file behind that
  # still passes an `os.path.exists` check but cannot be unpickled.
  cookies = ds.driver.get_cookies()

  with open(path, 'wb') as f:
    pickle.dump(cookies, f)


def load_cookie(driver, path):
  """Add every cookie pickled in `path` to `driver`.

  Args:
    driver: Object exposing `add_cookie(cookie)`.
    path: Pickle file holding an iterable of cookies.

  Returns:
    `driver` once all cookies were added, or None (after printing a warning)
    when `path` does not exist.
  """
  if os.path.exists(path):
    with open(path, 'rb') as cookie_file:
      # NOTE(review): pickle.load can execute arbitrary code from the file;
      # only load cookie files produced by this notebook.
      for entry in pickle.load(cookie_file):
        driver.add_cookie(entry)

    return driver

  print('[WARNING] Cookie file not found')
  return None


def login(sd: SearcherDriver):
  """Click the login button found in the page header (looked up by its id)."""
  login_button = sd.get_element_by('xpath', '//button[@id="header-login-button"]')
  login_button.click()


def check_capcha(sd: SearcherDriver):
  """Look for the captcha container and, if present, ask the user to solve it.

  Returns:
    True only when a captcha was found and the user answered "y" (any case)
    at the prompt; False when no captcha was found or for any other answer.
  """
  container = sd.get_element_by(
    'xpath',
    '//div[@id="captcha-verify-container-main-page"]',
    timeout=2,
    none_is_ok=True,
    from_capcha=True,
  )

  if container is None:
    return False

  print('[WARNING] Capcha detected')
  answer = input('Please solve the capcha and press "y" to continue: ')
  return answer.lower() == 'y'


def go_next_video(sd: SearcherDriver):
  """Click the "Go to next video" button if it can be found.

  Returns:
    True when the button was found and clicked, False when the lookup
    returned None.
  """
  print('[INFO] Going to next video')
  next_button = sd.get_element_by(
    'xpath',
    '//button[@aria-label="Go to next video"]',
    none_is_ok=True,
  )

  if next_button is None:
    return False

  next_button.click()
  return True


def get_comments(sd: SearcherDriver, total_comments: int, comments_per_scroll = 40, max_scrolls = 12):
  """Scroll the comment list and return the visible comments as one string.

  Args:
    sd: Driver wrapper used for element lookups and script execution.
    total_comments: Comment count reported for the video; a falsy value
      (None or 0) results in no scrolling.
    comments_per_scroll: Divisor used to turn `total_comments` into a number
      of scrolls.
    max_scrolls: Upper bound on the number of scrolls (previously a
      hard-coded 12).

  Returns:
    The cleaned text of every matched comment joined with '|', or '' when the
    comment container or the comments themselves are not found.
  """
  print('[INFO] Scroll comments')
  container_xpath = '//div[contains(@class, "DivCommentListContainer")]'
  container = sd.get_element_by('xpath', container_xpath)

  if container is None:
    return ''

  # `or 0` keeps a missing count from raising on the floor division.
  iter_scrolls = min(int((total_comments or 0) // comments_per_scroll), max_scrolls)

  print(f'[INFO] Scrolling {iter_scrolls} times')
  for _ in range(iter_scrolls):
    # Jump to the bottom of the container.
    # NOTE(review): there is no wait between scrolls here - confirm that
    # further comments have time to load.
    sd.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', container)

  print('[INFO] Getting comments')
  comments_xpath = '//div[contains(@class, "DivCommentContentContainer")]/div[1]/p[contains(@class, "PCommentText")]'
  # With none_is_ok=True the lookup presumably may return None instead of a
  # list; fall back to an empty list so the join below yields ''.
  comments = sd.get_elements_by('xpath', comments_xpath, none_is_ok=True) or []

  return '|'.join(clean_text(c.text) for c in comments)


def _element_text(sd, xpath):
  """Return the cleaned text of the element at `xpath`, or '' when unavailable."""
  element = sd.get_element_by('xpath', xpath)
  return (element and clean_text(element.text)) or ''


def get_video_info(sd: SearcherDriver):
  """Collect the data of the currently opened video as one comma-joined row.

  The row holds, in order: URL, description, username, date, like count,
  comment count, share count and the '|'-joined comments. Fields whose
  element is not found are left as ''.

  NOTE(review): the fields are joined with ',' without any quoting; a comma
  surviving `clean_text` would shift the columns - confirm against
  `clean_text`.
  """
  url = sd.driver.current_url

  # Expand a truncated description first, when the "more" button is shown.
  btn_more_xpath = '//button[contains(@class, "ButtonExpand") and contains(text(), "more")]'
  btn_more = sd.get_element_by('xpath', btn_more_xpath)

  try:
    if btn_more is not None and btn_more.is_displayed():
      print('[INFO] Clicking more button')
      btn_more.click()
  except Exception as e:
    # Best effort: a failed click only means a shorter description.
    print('[WARNING] Clicking more button:', e)

  description = _element_text(sd, '//h1[@data-e2e="browse-video-desc"]')
  username = _element_text(sd, '//span[@data-e2e="browse-username"]')
  date_video = _element_text(sd, '//span[@data-e2e="browser-nickname"]/span[3]')
  like_count = _element_text(sd, '//strong[@data-e2e="browse-like-count"]')
  comment_count = _element_text(sd, '//strong[@data-e2e="browse-comment-count"]')
  share_count = _element_text(sd, '//strong[@data-e2e="undefined-count"]')

  # The comment count decides how far the comment list is scrolled.
  comments = get_comments(sd, parse_text_number(comment_count))

  return ','.join([
    url,
    description,
    username,
    date_video,
    like_count,
    comment_count,
    share_count,
    comments,
  ])


### Login and Run session

In [4]:
def browser_save_cookie(sd: SearcherDriver):
  """Open the search page, start the login flow and optionally save cookies.

  The cookies are written to TIKTOK_COOKIE only when the user answers "y"
  (any case) at the prompt.
  """
  sd.get(url_search)
  login(sd)

  answer = input('Enter "y" to save cookie: ')
  if answer.lower() != 'y':
    print('Cookie not saved')
    return

  save_cookie(sd, TIKTOK_COOKIE)
  print('Cookie saved')

def scraper_with_cookie(sd: SearcherDriver):
  """Scrape the search results video by video using the saved cookies.

  Loads the cookies, refreshes the page, opens the first result and then
  keeps moving to the next video until there is none left or an error
  occurs inside the loop. Videos already scraped are skipped.
  """
  sd.get(url_search)
  load_cookie(sd.driver, TIKTOK_COOKIE)
  sd.driver.refresh()

  sd.get_element_by('xpath', '//div[@mode="search-video-list"]/div[1]').click()

  added = len(sd.url_scraped)

  def record_if_new():
    # Store the current video unless it was scraped before; report whether
    # a new row was stored.
    nonlocal added
    if sd.is_url_scraped():
      return False

    sd.append_data(get_video_info(sd))
    added += 1
    print(f'[INFO] Videos added: {added}')
    return True

  # First video: only pause when something was actually scraped.
  if record_if_new():
    sd.random_sleep(3, 12)

  while True:
    try:
      if not go_next_video(sd):
        print('[INFO] No more videos')
        break

      was_new = record_if_new()

      # Shorter upper bound for the pause when the video was skipped.
      sd.random_sleep(2, 12 if was_new else 6)
    except Exception as e:
      print('[ERROR] Error in loop:', e)
      break


# Steps to scrape
If this is the first time you are running this notebook you will need to install selenium:
- `!pip install selenium`
You also need to download the chromedriver from [here](https://googlechromelabs.github.io/chrome-for-testing/), put it in the same folder as this notebook, and set the path to the chromedriver.

If you are using Linux or macOS, you can install chromedriver with Homebrew; in that case you don't need to set the path to the chromedriver explicitly:
- `brew install chromedriver`

# Login and Scrape
First we need to get the cookies of an active session. The code cell below detects whether the cookies are already saved in the file `tiktok_cookie.pkl`; if not, it opens a browser and asks you to log in to the site. After logging in, you must enter "y" at the input prompt shown in the terminal/VS Code, and the cookies will be saved to `tiktok_cookie.pkl`. Finally, restart the kernel and run all the code cells again; this time the cookies will be loaded from `tiktok_cookie.pkl`.

You can customize the search by changing the `KEYWORDS` variable.


In [5]:
searcher = SearcherDriver(file=FILE, check_captcha_fn=check_capcha)

# Scrape when a saved cookie file exists; otherwise run the login flow that
# creates it.
if os.path.exists(TIKTOK_COOKIE):
  searcher.run(scraper_with_cookie)
else:
  searcher.run(browser_save_cookie)

[DRIVER] Loaded 56 scraped URLs
[DRIVER] Running scraper_with_cookie
[DRIVER] https://www.tiktok.com/@henrybustamant/video/7345110706997546245?q=robo%20secuestro%20inseguridad%20extorsion%20guayaquil%20guayas%20ecuador%20 in ['https://www.tiktok.com/@guayaquilalrojovivo.com/video/7397151850799090950?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@guayacaenuk/video/7358960527378255136?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@enfoke_tv/video/7449160699567148294?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@ftabacchi/video/7456220985755307270?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@guayaquilalrojovivo.com/video/7384935592540327174?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@henrybustamant/video/7382947253817724165?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@extraec/video/7389404398830587142?q=violencia%20guayaquil%20', 'https://www.tiktok.com/@antonytve/video/7436463108895313208?q=violencia%20guayaquil%20', 'https://www.tiktok.c