### Import Libraries

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
from selenium import webdriver
import time

### Load Dataset

In [3]:
# load the csv of video urls produced by the first scraping run
df = pd.read_csv(filepath_or_buffer='../datasets/first_run_url.csv')

In [4]:
# never truncate cell contents when a dataframe is displayed, so that long
# text (urls, descriptions) can be read in full
pd.options.display.max_colwidth = None

## Extract Post Details (Account, Caption, Hashtags, Engagement Counts)

In [5]:
# empty post-level dataframe; the scraping cell appends one row per video to it
post_columns = [
    'id', 'url', 'account_name',
    'following_count', 'follower_count', 'total_like_count',
    'date', 'href', 'handle', 'description', 'hashtag',
    'like_count', 'bookmark_count', 'share_count', 'comment_count',
]
post_df = pd.DataFrame(columns=post_columns)

In [6]:
# Scrape the post-level details of one TikTok video (plus the poster's profile
# counters) into `video_info` and append them as a new row of `post_df`.
#
# Elements that cannot be found on the page are recorded as None instead of
# aborting the cell, so that one changed selector does not discard the whole row.

# seconds to wait for TikTok to answer; without a timeout requests.get can block forever
REQUEST_TIMEOUT = 30


def _find_tag(parent, name, attrs):
    """Return parent.find(name, attrs), or None when parent itself was not found."""
    return parent.find(name, attrs) if parent is not None else None


def _tag_text(tag):
    """Return the text inside a BeautifulSoup tag, or None when the tag was not found."""
    return tag.get_text() if tag is not None else None


# url of the first video in the dataset
url = df.loc[0,'0']
res = requests.get(url, timeout=REQUEST_TIMEOUT)
# stop here on an HTTP error instead of parsing an error page
res.raise_for_status()

# get the content from url
soup = BeautifulSoup(res.content,'lxml')

# instantiate dictionary
video_info = {}

# set an id for the post
num_id = 0
full_id = str(num_id) + '_post'
video_info['id'] = full_id

# save url
video_info['url'] = url

# since account name and date are in the same section:
account_info = soup.find('span', {'data-e2e': 'browser-nickname'})

# account name
video_info['account_name'] = _tag_text(
    _find_tag(account_info, 'span', {'class': 'tiktok-1xccqfx-SpanNickName e17fzhrb3'})
)

# get date: it sits in the only span of this section without class and style
post_date = _tag_text(_find_tag(account_info, 'span', {'class': None, 'style': None}))
if post_date is not None and re.match(r'^\d+-\d+$', post_date):
    # a date without a year is from the current year, so prefix the year in
    # which the page is scraped rather than a hard-coded one
    post_date = time.strftime('%Y') + '-' + post_date
video_info['date'] = post_date

# get href (relative link to the poster's profile)
href = soup.find('a', {'class': 'e17fzhrb4 tiktok-d1z8va-StyledLink-StyledLink er1vbsz0'})
video_info['href'] = href.get('href') if href is not None else None

# get handle
handle = soup.find('span', {'data-e2e': 'browse-username'})
video_info['handle'] = _tag_text(handle)

# since description and hashtags are in the same section
description_hashtag = soup.find('h1', {'data-e2e': 'browse-video-desc'})

# get description (None when the post has no caption text)
video_info['description'] = _tag_text(
    _find_tag(description_hashtag, 'span', {'class': 'tiktok-j2a19r-SpanText efbd9f0'})
)

# get hashtags
hashtag_list = []
if description_hashtag is not None:
    for link in description_hashtag.find_all('a'):
        link_href = link.get('href', '')
        # only '/tag/<name>' links are hashtags; any other link in the
        # description has no third path segment to extract
        if link_href.startswith('/tag/'):
            # clean up the hashtag. Eg.: '/tag/singapore' to 'singapore'
            hashtag_list.append(link_href.split('/')[2])
video_info['hashtag'] = hashtag_list

# get 'like' count
like_count = soup.find('strong', {'data-e2e': 'like-count'})
video_info['like_count'] = _tag_text(like_count)

# get 'comment' count
comment_count = soup.find('strong', {'data-e2e': 'comment-count'})
video_info['comment_count'] = _tag_text(comment_count)

# get 'bookmark' count (the page labels this counter 'undefined-count')
bookmark_count = soup.find('strong', {'data-e2e': 'undefined-count'})
video_info['bookmark_count'] = _tag_text(bookmark_count)

# get 'share' count
share_count = soup.find('strong', {'data-e2e': 'share-count'})
video_info['share_count'] = _tag_text(share_count)

# go to user's profile page (skipped when the profile link was not found)
soup_user_info = None
if video_info['href'] is not None:
    url_user_info = 'https://www.tiktok.com' + str(video_info['href'])
    res_user_info = requests.get(url_user_info, timeout=REQUEST_TIMEOUT)
    res_user_info.raise_for_status()
    soup_user_info = BeautifulSoup(res_user_info.content,'lxml')

# get user's following count
following = _find_tag(soup_user_info, 'strong', {'title': 'Following'})
video_info['following_count'] = _tag_text(following)

# get user follower count
follower = _find_tag(soup_user_info, 'strong', {'title': 'Followers'})
video_info['follower_count'] = _tag_text(follower)

# get user total like count
total_like_count = _find_tag(soup_user_info, 'strong', {'title': 'Likes'})
video_info['total_like_count'] = _tag_text(total_like_count)

# print video_info to check
print(video_info)

# add video_info to post level dataframe (dict keys are matched to column names)
post_df.loc[len(post_df)] = video_info
display(post_df)

num_id += 1

{'id': '0_post', 'url': 'https://www.tiktok.com/@montanadarby/video/7232388092764671258', 'account_name': 'Montana | Travels', 'date': '2023-5-12', 'href': '/@montanadarby', 'handle': 'montanadarby', 'description': 'The perfect 48 hour itinerary for Singapore! ', 'hashtag': ['singapore', 'singaporetravel', 'travel'], 'like_count': '35.6K', 'comment_count': '205', 'bookmark_count': '24.3K', 'share_count': '4849', 'following_count': '149', 'follower_count': '95.2K', 'total_like_count': '4.7M'}


Unnamed: 0,id,url,account_name,following_count,follower_count,total_like_count,date,href,handle,description,hashtag,like_count,bookmark_count,share_count,comment_count
0,0_post,https://www.tiktok.com/@montanadarby/video/7232388092764671258,Montana | Travels,149,95.2K,4.7M,2023-5-12,/@montanadarby,montanadarby,The perfect 48 hour itinerary for Singapore!,"[singapore, singaporetravel, travel]",35.6K,24.3K,4849,205


In [17]:
# show the video url that the browser cells below will open
url

'https://www.tiktok.com/@montanadarby/video/7232388092764671258'

In [50]:
# start a Chrome browser session through Selenium; `driver` is the handle the
# following cells use to load and scroll the page
driver = webdriver.Chrome()

In [51]:
# open the video page (`url`) in the Selenium-controlled browser
driver.get(url)

In [52]:
# Load ALL comments of the video: keep scrolling to the bottom of the page until
# its height stops growing (unbounded scrolling rather than a fixed number of scrolls)

scroll_pause_time = 1.5 # seconds to wait for new content after each scroll

# page height before the first scroll
last_scroll_height = driver.execute_script("return document.body.scrollHeight")

reached_bottom = False
while not reached_bottom:
    # jump to the current bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # give the page time to load more content
    time.sleep(scroll_pause_time)

    # an unchanged height means nothing new was loaded, which ends the loop
    new_scroll_height = driver.execute_script("return document.body.scrollHeight")
    reached_bottom = new_scroll_height == last_scroll_height
    last_scroll_height = new_scroll_height
    


In [53]:
# empty comment-level dataframe with its column layout
comment_columns = ['id', 'url', 'handle', 'comment_count', 'comment']
comment_df = pd.DataFrame(columns=comment_columns)

In [58]:
# inspect the comment dataframe (only the column headers so far, no rows added yet)
comment_df

Unnamed: 0,id,url,handle,comment_count,comment


## Extract Comments

In [10]:
# once it has scrolled to end, parse the DOM rendered by the browser: comments
# loaded while scrolling exist only in the Selenium session, not in the static
# `soup` that was downloaded earlier with requests
comment_soup = BeautifulSoup(driver.page_source, 'lxml')

comment_table = comment_soup.find('div',{'class':'tiktok-1ut45oj-DivCommentListContainer ekjxngi3'})

# no comment list on the page means there is nothing to iterate over
item_containers = []
if comment_table is not None:
    item_containers = comment_table.find_all('div',{'class': 'tiktok-16r0vzi-DivCommentItemContainer eo72wou0'})

for item_container in item_containers:
    # walk down the nested containers to the paragraph holding the top-level comment
    content_container_1 = item_container.find('div',{'class': 'tiktok-ex1vmp-DivCommentContentContainer e1g2efjf0'})
    content_container_2 = None
    if content_container_1 is not None:
        content_container_2 = content_container_1.find('div',{'class':'tiktok-1mf23fd-DivContentContainer e1g2efjf1'})
    comment_text = None
    if content_container_2 is not None:
        comment_text = content_container_2.find('p',{'data-e2e':'comment-level-1'})
    if comment_text is None:
        # item without a top-level comment paragraph: skip it
        continue

    # the comment text normally sits in a span
    comment = comment_text.find('span')
    if comment is None:
        # no span: fall back to the user-link element
        # (presumably a comment that consists only of an @mention - TODO confirm)
        comment = comment_text.find('a',{'class':'e1g2efjf11 tiktok-v7hd8w-StyledLink-StyledUserLinkContent er1vbsz0'})
    if comment is not None:
        print(comment.text)