<a href="https://colab.research.google.com/github/frasercrichton/data-investigation-conspiracy-aotearoa/blob/main/analysis/Pattern_of_Life.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clean and Extract Data


In [1]:
import pandas as pd
from deep_translator import GoogleTranslator
videos_df = pd.read_json('../data/source/video-2023-09-14.json', convert_dates=['createTime'])

## Parse the Videos

- Extract:
  - Comments
  - Likes
- Translate any text

In [160]:
videos_df = videos_df.drop(columns=[
                                    'author',
                                    'challenges',
                                    'collected',
                                    'contents', 
                                    'digged', 
                                    'duetDisplay', 
                                    'forFriend',
                                    'itemCommentStatus', 
                                    'privateItem', 
                                    'secret', 
                                    'shareEnabled', 
                                    'stitchDisplay', 
                                    'officalItem',
                                    'originalItem',
                                    'duetEnabled',
                                    'stitchEnabled',
                                    ])
def translate_text(text):
    translation = GoogleTranslator(source='it', target='en').translate(text)
    return translation  
    # return text.upper()

def parse_desc(desc):
    return translate_text(desc)

def parse_stats(stats):
    return pd.Series([stats['collectCount'], stats['commentCount'], stats['diggCount'], stats['playCount'], stats['shareCount']])

def parse_text_extra(extra_text):
    if extra_text is None:
        return pd.Series([extra_text, extra_text])
    
    extra_text_as_json = pd.json_normalize(extra_text)    
    hashtags = ', '.join(extra_text_as_json['hashtagName'].values)
    return pd.Series([hashtags, translate_text(hashtags)])

def parse_warn_info(warn_info):
    if warn_info is None:
        return warn_info
    
    return warn_info[0]['text']

def parse_video(video):    
    return video['zoomCover']['960']

# desc: translate the video description into English
videos_df['desc_en'] = videos_df.apply(lambda row: parse_desc(row['desc']), axis=1)

# textExtra: extract a list of hashtags
videos_df[['textExtra', 'textExtra_en']] = videos_df.apply(lambda row: parse_text_extra(row['textExtra']), axis=1)

# stats: turn the like, comment, etc. counts into columns
videos_df[['collectCount', 'commentCount', 'diggCount', 'playCount', 'shareCount']] = videos_df.apply(lambda row: parse_stats(row['stats']), axis=1)
videos_df = videos_df.drop(columns=['stats'])

# warnInfo: extract any content warning text 
videos_df['warnInfo'] = videos_df.apply(lambda x: parse_warn_info(x['warnInfo']), axis=1)

# Videos - zoomCover: get the URL of the cover image
videos_df['coverImage'] = videos_df.apply(lambda row: parse_video(row['video']), axis=1)

videos_df


Unnamed: 0,createTime,desc,id,music,video,textExtra,warnInfo,desc_en,textExtra_en,collectCount,commentCount,diggCount,playCount,shareCount,coverImage
0,2023-09-13 18:57:03,"La mia intervista di questa sera a ""Cinque Min...",7278386520275373056,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1178262, 'bitrateInfo': [{'Bitrate...",,,"My interview this evening on ""Cinque Minuti"", ...",,840,2669,14800,435800,438,https://p16-sign-useast2a.tiktokcdn.com/tos-us...
1,2023-09-12 16:05:41,Stiamo dando alla Nazione una strategia che no...,7277971263958666240,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1021716, 'bitrateInfo': [{'Bitrate...",,,We are giving the Nation a strategy that it ha...,,1050,6426,21200,622700,598,https://p16-sign-useast2a.tiktokcdn.com/tos-us...
2,2023-09-11 09:02:24,"Grazie India, complimenti per il successo del ...",7277491085071387648,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1374581, 'bitrateInfo': [{'Bitrate...",g20,,"Thank you India, congratulations on the succes...",g20,313,468,7158,137500,224,https://p16-sign-useast2a.tiktokcdn.com/tos-us...
3,2023-09-03 07:28:10,A 41 anni dal brutale attentato mafioso che ha...,7274498125660703744,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 627490, 'bitrateInfo': [{'Bitrate'...",,,41 years after the brutal mafia attack which c...,,277,274,4874,141400,204,https://p16-sign-useast2a.tiktokcdn.com/tos-us...
4,2023-08-12 06:05:11,Salario minimo: punto stampa dopo l’incontro c...,7266312532665584640,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1077066, 'bitrateInfo': [{'Bitrate...",,,Minimum wage: press point after the meeting wi...,,2847,7144,52600,1800000,1388,https://p16-sign-useast2a.tiktokcdn.com/tos-us...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,2022-02-20 17:31:35,Il risultato del #greenpass è stato solo quell...,7066845965105318912,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 998202, 'bitrateInfo': [{'Bitrate'...",greenpass,,The result of the #greenpass was only to manag...,greenpass,2460,2020,54800,977100,4925,https://p16-sign-va.tiktokcdn.com/tos-maliva-p...
220,2022-02-18 17:00:34,Noi vogliamo difendere l’Italia 🇮🇹,7066095800291576832,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 710139, 'bitrateInfo': [{'Bitrate'...",,,We want to defend Italy 🇮🇹,,30,41,2052,37700,37,https://p16-sign-va.tiktokcdn.com/tos-maliva-p...
221,2022-02-11 14:53:48,Sia chiaro a tutti: noi il governo con il PD e...,7063465536109235200,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 401098, 'bitrateInfo': [{'Bitrate'...",,,Let it be clear to everyone: we will never for...,,22,40,1482,33000,21,https://p16-sign-va.tiktokcdn.com/tos-maliva-p...
222,2022-02-10 19:07:13,Cittadini e imprese schiacciati dalla crisi e ...,7063159758185827328,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 326414, 'bitrateInfo': [{'Bitrate'...",,,Citizens and businesses crushed by the crisis ...,,28,202,2240,47300,126,https://p16-sign-va.tiktokcdn.com/tos-maliva-p...


Write out the results to a CSV file. 

In [159]:
videos_df.to_json('../data/processed/videos-translated.json')
videos_df.to_csv('../data/processed/videos-translated.csv', sep=',')

## Download Resources

In [None]:
def download_file_and_save(url: str, filepath: Path):
    """Download a file from a specified URL and write its contents to a file"""

    r = _get(url=url)
    if r.status_code == 403:
        return
    ext = r.headers["Content-Type"].split("/")[-1]
    path_with_ext = filepath.with_suffix(f".{ext}")
    with open(path_with_ext, "wb") as f:
        f.write(r.content)
        logger.debug(f"Saved file to: {path_with_ext}")

## Download Videos

In [15]:
import logging
import os
import json
from pathlib import Path
from urllib.error import HTTPError
import yt_dlp
from yt_dlp.utils import ExtractorError, DownloadError
from typing import List, Dict, Optional

logger = logging.getLogger(__name__)

def download_videos(url_to_download=None, output_dir=None):
    if len(url_to_download) > 0:
        print(f"Downloading media {url_to_download}")
        logger.info(f"Downloading media for hashtag {url_to_download}")

    ydl_opts = {
        "outtmpl": os.path.join(output_dir, "%(id)s.%(ext)s"),
        "ignore_errors": True,
        "quiet": logger.getEffectiveLevel() > logging.DEBUG,
    }
    
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([url_to_download])
        except (HTTPError, TypeError, ExtractorError, DownloadError) as e:
            logger.warning(
                f"Encountered error {e} when attempting to download url: {url_to_download}"
            )

for video in videos_df['video']:
    user_id = '7057902765381534725'
    url = f"https://www.tiktok.com/@{user_id}/video/{video['id']}"
    print(url)
    download_videos(url_to_download=url, output_dir='../data/processed/videos')


https://www.tiktok.com/@7057902765381534725/video/7278386520275373344
Downloading media https://www.tiktok.com/@7057902765381534725/video/7278386520275373344
https://www.tiktok.com/@7057902765381534725/video/7277971263958666529
Downloading media https://www.tiktok.com/@7057902765381534725/video/7277971263958666529
https://www.tiktok.com/@7057902765381534725/video/7277491085071387937
Downloading media https://www.tiktok.com/@7057902765381534725/video/7277491085071387937
https://www.tiktok.com/@7057902765381534725/video/7274498125660704033
Downloading media https://www.tiktok.com/@7057902765381534725/video/7274498125660704033
https://www.tiktok.com/@7057902765381534725/video/7266312532665584928
Downloading media https://www.tiktok.com/@7057902765381534725/video/7266312532665584928
https://www.tiktok.com/@7057902765381534725/video/7265735771271728416
Downloading media https://www.tiktok.com/@7057902765381534725/video/7265735771271728416
https://www.tiktok.com/@7057902765381534725/video/72

[download] Got error: Downloaded 6996982 bytes, expected 38107595 bytes
[download] Got error: Downloaded 6996982 bytes, expected 38107595 bytes when attempting to download url: https://www.tiktok.com/@7057902765381534725/video/7226821746425466139


https://www.tiktok.com/@7057902765381534725/video/7223471702020246810
Downloading media https://www.tiktok.com/@7057902765381534725/video/7223471702020246810
https://www.tiktok.com/@7057902765381534725/video/7222372308235603206
Downloading media https://www.tiktok.com/@7057902765381534725/video/7222372308235603206
https://www.tiktok.com/@7057902765381534725/video/7222341088596610309
Downloading media https://www.tiktok.com/@7057902765381534725/video/7222341088596610309
https://www.tiktok.com/@7057902765381534725/video/7221262230988262661
Downloading media https://www.tiktok.com/@7057902765381534725/video/7221262230988262661
https://www.tiktok.com/@7057902765381534725/video/7221065295304117509
Downloading media https://www.tiktok.com/@7057902765381534725/video/7221065295304117509
https://www.tiktok.com/@7057902765381534725/video/7219014997781925146
Downloading media https://www.tiktok.com/@7057902765381534725/video/7219014997781925146
https://www.tiktok.com/@7057902765381534725/video/72

KeyboardInterrupt: 