In [None]:
%pip install requests
%pip install transformers
%pip install scipy
%pip install 'transformers[torch]'
%pip install pandas
%pip install matplotlib
%pip install wordcloud

## Scraping TikTok Comments

1. Select a video
2. Find the XHR API request using the browser developer tools
3. Imitate a request from the browser (use the headers provided by the browser, preferably Chrome)
4. Make the request and convert data to JSON format
5. Parse the data to make it readable
6. Loop through the pages and save all comments

Resources: 
https://www.youtube.com/watch?v=Tqnuhaaw738

In [None]:
import requests, json
import time
import os

class TitkTokCommentScraper:
    """
    A class for scraping comments from TikTok videos.

    Comments are requested page by page from the web comment-list endpoint,
    reduced to a small dictionary per comment, and saved as one JSON file per
    video. The saved files can be read back with `load_comment_list`.
    """

    # Comments requested per call. Must match the `count=20` query parameter
    # hard-coded in the request URL; the cursor advances by this amount.
    PAGE_SIZE = 20
    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Headers copied from a desktop Chrome session so the request looks
        # like it comes from the browser.
        self.headers = {
            "accept": "*/*",
            "accept-language": "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=1, i",
            "Referer": "https://www.tiktok.com/explore",
            "sec-ch-ua": '"Google Chrome";v="129", "Chromium";v="129", "Not=A?Brand";v="8"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
        }

    @staticmethod
    def _extract_post_id(post_url : str) -> str:
        """
        Args:
            post_url (str): The URL of the TikTok video.

        Returns:
            str: The last path segment of the URL, ignoring any query string,
                fragment or trailing slash.
        """
        path = post_url.split("?", 1)[0].split("#", 1)[0]
        return path.rstrip("/").split("/")[-1]

    def _req(self, post_id : str, page_number : int = 0) -> dict:
        """
        Args:
            post_id (str): The ID of the TikTok video.
            page_number (int, optional): The cursor (offset in comments) to request. Defaults to 0.

        Returns:
            dict: A dictionary containing the json response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            json.JSONDecodeError: If the response body is not valid JSON.
        """

        timestamp = int(time.time())
        # NOTE(review): msToken, X-Bogus, _signature, device_id and odinId are
        # hard-coded values captured from a browser session. They presumably
        # expire or get rejected over time -- confirm, and consider moving
        # them to configuration instead of keeping them in the source.
        request_url = f"https://www.tiktok.com/api/comment/list/?WebIdLastTime={timestamp}&aid=1988&app_language=en&app_name=tiktok_web&aweme_id={post_id}&browser_language=en-US&browser_name=Mozilla&browser_online=true&browser_platform=Win32&browser_version=5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x64%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Chrome%2F130.0.0.0%20Safari%2F537.36&channel=tiktok_web&cookie_enabled=true&count=20&cursor={page_number}&data_collection_enabled=false&device_id=7433621668948149766&device_platform=web_pc&focus_state=true&from_page=video&history_len=2&is_fullscreen=false&is_page_visible=true&odinId=7433621429462483973&os=windows&priority_region=&referer=&region=CO&screen_height=1080&screen_width=1920&tz_name=America%2FBogota&user_is_login=false&webcast_language=en&msToken=av6rBVq946q396gSsl28RTeNIUmHRmyh119dQRpJtEMHOOq2rgFPrl2gEauidZ7zCfjoh9P5PZ11FBS6sMQRg84JTh67SECufY32cARpgbp-DjPSWUt14aoeEih-J9fxjtpF4H38743oXcMlg8JHKA==&X-Bogus=DFSzswVYCD2ANapNtsxp9V6HQhO3&_signature=_02B4Z6wo00001HlBmxQAAIDBG-6YjndyIWR5QZ-AAHmB61"

        # A timeout keeps the notebook from hanging forever on a stalled
        # connection; the status check surfaces HTTP errors as such instead of
        # as a confusing JSON decoding failure.
        response = requests.get(request_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        return json.loads(response.text)

    def _parser(self, data : dict) -> list:
        """
        Args:
            data (dict): A dictionary containing the json response.

        Returns:
            list: A list of dictionaries containing the comments, each with the
                keys 'id' (author's unique_id), 'comment' (share_info.desc) and
                'text' (the comment text). Missing fields are None.
        """

        comments = []

        # A response whose 'comments' entry is missing or null is treated as
        # an empty page instead of raising.
        for comment in data.get('comments') or []:
            # `or {}` also covers fields that are present but null.
            share_info = comment.get('share_info') or {}
            user = comment.get('user') or {}

            comments.append({
                "id": user.get('unique_id', None),
                "comment": share_info.get('desc', None),
                "text": comment.get('text', None)
            })

        return comments

    def get_comments(self, post_url : str, start_page_number : int = 0, file_path : str = '') -> str:
        """
        Args:
            post_url (str): The URL of the TikTok video.
            start_page_number (int, optional): The start page number of comments to retrieve. Defaults to 0.
            file_path (str, optional): The directory to save the comments in. Defaults to '' (current directory).

        Returns:
            str: The path to the JSON file containing the list of dictionaries with comments.
        """

        comments = []
        cursor = start_page_number * self.PAGE_SIZE # Each page skips PAGE_SIZE comments
        post_id = self._extract_post_id(post_url)

        if file_path:
            os.makedirs(file_path, exist_ok=True)
        output_path = os.path.join(file_path, f"comments_for_{post_id}.json")

        while True:
            raw_data = self._req(post_id, cursor)
            page_comments = self._parser(raw_data)
            comments.extend(page_comments)

            # Stop when the API reports no further pages. An empty page also
            # ends the loop so a response that keeps claiming `has_more`
            # without returning comments cannot spin forever.
            if raw_data.get('has_more', False) != 1 or not page_comments:
                break

            time.sleep(1) # Be gentle with the endpoint between pages
            cursor += self.PAGE_SIZE

        # Overwrite instead of append: appending a second JSON array to an
        # existing file would make it unparseable for load_comment_list.
        with open(output_path, "w", encoding="utf-8") as output_file:
            json.dump(comments, output_file, ensure_ascii=False, indent=4)

        return output_path

    def load_comment_list(self, file_path : str, data_column : str = 'text') -> list:
        """
        Args:
            file_path (str): The path to the JSON file containing the list of dictionaries with comments.
            data_column (str, optional): The key to extract from every comment dictionary. Defaults to 'text'.

        Returns:
            list: The values stored under `data_column`, one per comment, or an
                empty list when `file_path` is not an existing file.

        Raises:
            KeyError: If a comment dictionary has no `data_column` key.
        """
        if not os.path.isfile(file_path):
            return []

        with open(file_path, "r", encoding="utf-8") as f:
            comment_data = json.load(f)

        return [data[data_column] for data in comment_data]

In [None]:
# Build the scraper; its constructor only prepares the browser-like request headers.
tiktok_scraper = TitkTokCommentScraper()

# Videos whose comments will be downloaded.
posts_url = [
    "https://www.tiktok.com/@burnxice/video/7390628837131914503", 
    "https://www.tiktok.com/@keemokazi/video/7430978142325509419",
    "https://www.tiktok.com/@itsqcp/video/7429806435183217963",    
    "https://www.tiktok.com/@fdontcare/video/7413091288263691553",    
    ]
# Start page passed to get_comments (0 = begin with the first page of comments).
cursor = 0

# get_comments writes one JSON file per video and returns that file's path.
for post_url in posts_url:
    comments_file = tiktok_scraper.get_comments(post_url, cursor)
    # comments_list = tiktok_scraper.load_comment_list(comments_file, 'text')

## RoBERTa Pre-Trained Model
- Use a model pre-trained on a large corpus of data.
- A transformer model accounts not only for the words themselves but also for their context relative to the surrounding words.
- Define a general sentiment for the data.

Resources: https://www.youtube.com/watch?v=QpzMWQvxXWk

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import pandas as pd

In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
# Both objects are module-level globals read by polarity_scores_roberta.
# NOTE(review): from_pretrained presumably downloads the checkpoint on first
# use and caches it locally -- confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def polarity_scores_roberta(text : str) -> dict:
    """
    Analyzes the sentiment of a text using the Roberta model.

    Relies on the module-level `tokenizer` and `model` objects.

    Args:
        text (str): The text to analyze.

    Returns:
        dict: A dictionary with the keys 'negative', 'neutral' and 'positive'
            mapping to the softmax scores of the model output, or None when
            the text could not be processed (a message is printed instead).
    """
    # Scraped comments can carry a missing text (None); the tokenizer rejects
    # non-string input, so report it the same way as any other failure.
    if not isinstance(text, str):
        print('Unable to process the text')
        return None

    try:
        # Truncate explicitly: without a limit, a very long comment produces
        # more tokens than the model has positions for and the forward pass
        # fails. NOTE(review): 512 is the usual RoBERTa-base limit -- confirm
        # against model.config if MODEL is changed.
        encoded_text = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        output = model(**encoded_text)
        # output[0] holds the logits; [0] selects the single input of the batch.
        scores = softmax(output[0][0].detach().numpy())
        return dict(zip(['negative', 'neutral', 'positive'], scores))
    except RuntimeError:
        print('Unable to process the text')
        return None

def create_dataframe(comments_list : list) -> pd.DataFrame:
    """
    Creates a dataframe from a list of comments.

    Every comment is scored once with `polarity_scores_roberta`. Comments that
    cannot be scored (the scorer returns None) are kept, with a None sentiment
    and NaN scores, so the rows stay aligned with `comments_list`.

    Args:
        comments_list (list): A list of comments.

    Returns:
        pd.DataFrame: A dataframe with the columns 'comment', 'sentiment'
            (label with the highest score), 'negative', 'neutral' and
            'positive'. An empty input yields an empty dataframe with the
            same columns.
    """
    columns = ['comment', 'sentiment', 'negative', 'neutral', 'positive']
    missing = float('nan')
    records = []

    for comment in comments_list:
        scores = polarity_scores_roberta(comment)

        if scores is None:
            # Scoring failed; keep the row but mark the values as missing.
            records.append({
                'comment': comment,
                'sentiment': None,
                'negative': missing,
                'neutral': missing,
                'positive': missing,
            })
            continue

        records.append({
            'comment': comment,
            'sentiment': max(scores, key=scores.get),
            'negative': scores['negative'],
            'neutral': scores['neutral'],
            'positive': scores['positive'],
        })

    # Passing `columns` keeps the schema stable even when there are no records.
    return pd.DataFrame(records, columns=columns)

def evaluate_general_sentiment(df : pd.DataFrame) -> str:
    """
    Evaluates the general sentiment of a dataframe.

    The 'negative', 'neutral' and 'positive' score columns are summed and the
    label with the strictly largest total wins. Ties involving the largest
    total, and an empty dataframe, resolve to 'neutral'.

    Args:
        df (pd.DataFrame): A dataframe containing the comments and their sentiment scores.

    Returns:
        str: A sentence naming the general sentiment of the dataframe.
    """
    # Totals rank the labels exactly like averages would, because every
    # column has the same number of rows.
    total_negative = df['negative'].sum()
    total_neutral = df['neutral'].sum()
    total_positive = df['positive'].sum()

    if total_positive > total_neutral and total_positive > total_negative:
        general_sentiment = 'positive'
    elif total_negative > total_neutral and total_negative > total_positive:
        general_sentiment = 'negative'
    else:
        general_sentiment = 'neutral'

    return f'The general sentiment is: {general_sentiment}'
        

In [None]:
# Find the comment files written by the scraper in the working directory
import os

tiktok_scraper = TitkTokCommentScraper()
file_path = os.getcwd()

# get_comments names its output "comments_for_<post_id>.json"; matching that
# pattern keeps unrelated JSON files in the folder out of the analysis.
# Sorting makes the processing order (and therefore which dataframe is left
# in `df` after the loop) reproducible, since os.listdir order is arbitrary.
comment_files = sorted(
    file for file in os.listdir(file_path)
    if file.startswith("comments_for_") and file.endswith(".json")
)

for file in comment_files:
    comments_list = tiktok_scraper.load_comment_list(file, 'text')

    # Nothing to score: report it instead of labelling an empty file.
    if not comments_list:
        print(f"no comments found in {file}")
        print("----------------------------------")
        continue

    df = create_dataframe(comments_list)
    print(f"general sentiment for {file}")
    print(evaluate_general_sentiment(df))
    print("----------------------------------")

## Word Cloud
- Re-arrange the data to use all comments from the videos
- Use interpolation 'bilinear' for simplicity

Resources: https://www.youtube.com/watch?v=X59oBuevKVA

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import string

# Single letters carry no meaning in the cloud, so they are treated as stopwords.
alphabet = list(string.ascii_lowercase[::-1])

# Chat abbreviations and single letters on top of the stopwords shipped with wordcloud.
aditional_stopwords = ["ur", "thats", "ok", ] + alphabet + list(STOPWORDS)

# `df` only holds the comments of the last file processed above, so the text
# is gathered again from every comment file to cover all videos. Entries that
# are not strings (e.g. a missing comment text) are skipped.
all_comments = [
    comment
    for file in comment_files
    for comment in tiktok_scraper.load_comment_list(file, 'text')
    if isinstance(comment, str)
]

word_cloud = WordCloud(width=800, height=600, background_color='black', max_words = 20, stopwords=aditional_stopwords).generate(' '.join(all_comments))
word_cloud.to_image()