<a href="https://colab.research.google.com/github/frasercrichton/data-investigation-conspiracy-aotearoa/blob/main/analysis/Pattern_of_Life.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clean Data

In [14]:
import pandas as pd
import shutil
import cv2
import os
import re
from deep_translator import GoogleTranslator
processed_dir = '../data/processed/'
transcription_dir_location = '../data/processed/transcription/'
os.makedirs(transcription_dir_location, exist_ok = True)
os.makedirs(transcription_dir_location + 'original', exist_ok = True)
videos_original_df = pd.read_json('../data/source/video-2023-09-14.json', convert_dates=['createTime'])

def translate_text(text):
    return GoogleTranslator(source='it', target='en').translate(text)


## Clean the TikTok posts

Once this data was retrieved the following 'cleaning' process was applied:

- TikTok Posts metadata
- Remove data not required for analysis 
- Date - add a formatted date column based on creationDateTime
- Engagement counts - parse out the engagement counts into individual columns, create formatted comma delimited engagement counts columns and calculate engagement ratios. 
- Videos - extract ids, urls for videos, covers, etc.
- Extract warnInfo into a column
- Hashtags  - parse out the comma delimited strings hashtags column into seperate hashtag files including counts of hashtags

In [15]:
videos_df = videos_original_df.drop(columns=[
                                    'author',
                                    'challenges',
                                    'collected',
                                    'contents', 
                                    'digged', 
                                    'duetDisplay', 
                                    'forFriend',
                                    'itemCommentStatus', 
                                    'privateItem', 
                                    'secret', 
                                    'shareEnabled', 
                                    'stitchDisplay', 
                                    'officalItem',
                                    'originalItem',
                                    'duetEnabled',
                                    'stitchEnabled',
                                    ])

def parse_desc(desc):
    return translate_text(desc)

def parse_stats(stats):
    return pd.Series([stats['collectCount'], stats['commentCount'], stats['diggCount'], stats['playCount'], stats['shareCount']])

def add_formatted_stats(data_frame, column_name):
    data_frame[column_name + 'Formatted'] = data_frame[column_name].map('{:,}'.format) 

# Hashtag text
def parse_text_extra(extra_text):
    if extra_text is None:
        return pd.Series([extra_text, extra_text])
    
    extra_text_as_json = pd.json_normalize(extra_text)    
    hashtags = ', '.join(extra_text_as_json['hashtagName'].values)
    return pd.Series([hashtags, hashtags])

def parse_warn_info(warn_info):
    if warn_info is None:
        return warn_info
    
    return warn_info[0]['text']

# desc: translate the video description into English
videos_df['desc_en'] = videos_df.apply(lambda row: parse_desc(row['desc']), axis=1)

# textExtra: extract a list of hashtags
videos_df[['textExtra', 'textExtra_en']] = videos_df.apply(lambda row: parse_text_extra(row['textExtra']), axis=1)

# stats: turn the like, comment, etc. counts into columns
videos_df[['collectCount', 'commentCount', 'diggCount', 'playCount', 'shareCount']] = videos_df.apply(lambda row: parse_stats(row['stats']), axis=1)
videos_df = videos_df.drop(columns=['stats'])
add_formatted_stats(videos_df, 'collectCount')
add_formatted_stats(videos_df, 'commentCount')
add_formatted_stats(videos_df, 'diggCount')
add_formatted_stats(videos_df, 'playCount')
add_formatted_stats(videos_df, 'shareCount')

# Calculate engagement ratios
videos_df['shareCountRatio'] = videos_df['shareCount'] / videos_df['playCount']
videos_df['commentCountRatio'] = videos_df['commentCount'] / videos_df['playCount']
videos_df['diggCountRatio'] = videos_df['diggCount'] / videos_df['playCount']

# warnInfo: extract any content warning text 
videos_df['warnInfo'] = videos_df.apply(lambda x: parse_warn_info(x['warnInfo']), axis=1)

# Videos - zoomCover: get the URL of the cover image
# videos_df['coverImage'] = videos_df.apply(lambda row: parse_video(row['video']), axis=1)

# format createTime as a date
videos_df['createDate'] = videos_df['createTime'].dt.strftime('%Y-%m-%d')

# Extract the video URL id
videos_df['videoId'] = pd.json_normalize(videos_df['video'])['id']

videos_df.head(5)

Unnamed: 0,createTime,desc,id,music,video,textExtra,warnInfo,desc_en,textExtra_en,collectCount,...,collectCountFormatted,commentCountFormatted,diggCountFormatted,playCountFormatted,shareCountFormatted,shareCountRatio,commentCountRatio,diggCountRatio,createDate,videoId
0,2023-09-13 18:57:03,"La mia intervista di questa sera a ""Cinque Min...",7278386520275373056,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1178262, 'bitrateInfo': [{'Bitrate...",,,"My interview this evening on ""Cinque Minuti"", ...",,840,...,840,2669,14800,435800,438,0.001005,0.006124,0.033961,2023-09-13,7278386520275373344
1,2023-09-12 16:05:41,Stiamo dando alla Nazione una strategia che no...,7277971263958666240,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1021716, 'bitrateInfo': [{'Bitrate...",,,We are giving the Nation a strategy that it ha...,,1050,...,1050,6426,21200,622700,598,0.00096,0.01032,0.034045,2023-09-12,7277971263958666529
2,2023-09-11 09:02:24,"Grazie India, complimenti per il successo del ...",7277491085071387648,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1374581, 'bitrateInfo': [{'Bitrate...",g20,,"Thank you India, congratulations on the succes...",g20,313,...,313,468,7158,137500,224,0.001629,0.003404,0.052058,2023-09-11,7277491085071387937
3,2023-09-03 07:28:10,A 41 anni dal brutale attentato mafioso che ha...,7274498125660703744,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 627490, 'bitrateInfo': [{'Bitrate'...",,,41 years after the brutal mafia attack which c...,,277,...,277,274,4874,141400,204,0.001443,0.001938,0.03447,2023-09-03,7274498125660704033
4,2023-08-12 06:05:11,Salario minimo: punto stampa dopo l’incontro c...,7266312532665584640,"{'authorName': 'Giorgia Meloni', 'coverLarge':...","{'bitrate': 1077066, 'bitrateInfo': [{'Bitrate...",,,Minimum wage: press point after the meeting wi...,,2847,...,2847,7144,52600,1800000,1388,0.000771,0.003969,0.029222,2023-08-12,7266312532665584928


Save the results to a file.

In [16]:
videos_df.to_json(processed_dir + 'TikTok-posts-metadata.json')

## Extract Transcribed Text

Tiktok metadata allows us to download video files separately and extract audio files to AWS Transcribe (see the VideoDownload Jupyter Notebook and the AWSTranscribe.py script).  

The transcribed files need cleaned and translated.   

## Clean

Extract just the transcription text from the AWS Transcribe formatted files and insert metadata from the TikTok metadata.

The inserted data includes:

- id
- creation time (which is used by the topic modelling classification) 

In [30]:
def merge_create_time(transcript_df: pd.DataFrame):
    metadata_df = videos_df[['createTime', 'videoId']]
    metadata_df = metadata_df.rename(columns={'videoId':'id'})    
    return pd.merge(metadata_df,transcript_df,on=['id'])

def extract_transcript_text(file_id: str, transcriptions_df: pd.DataFrame):
    transcripts_df = transcriptions_df['results']['transcripts']
    if len(transcripts_df) > 1:
        print('error')
        raise
    # convert the list [{'transcript': 'chi va M design, yo.'}] to a dataframe 
    transcript_df = pd.DataFrame.from_records(transcripts_df,index=['0']) 
    transcript_df['id'] = file_id
    return transcript_df

from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize

def split_and_translate(text: str):
    sentence_list = sent_tokenize(text)
    result = []
    for sentence in sentence_list:
        result.append(translate_text(sentence))
    return ' '.join(result) 

def translate_transcripts(file_name: str):
    if file_name == '.DS_Store':
        return
    print(f'file_name: {file_name}')
    file_id = file_name.removeprefix('TranscribeTikTokAudio').removesuffix('.json')
    transcriptions_df = pd.read_json(transcription_dir_location + 'original/it/' + file_name)
    # Extract text from transcribed files
    transcripts_df = extract_transcript_text(file_id, transcriptions_df)
    # translate text
    transcripts_df['transcript'] = transcripts_df['transcript'].apply(lambda x: split_and_translate(x))
    # merge in the creationDateTime and Id
    result_df = merge_create_time(transcripts_df)
    result_df.to_json(f'{transcription_dir_location}/original/en/{file_name}')
    return result_df    

for file_name in os.listdir(transcription_dir_location + 'original/it'):
    translate_transcripts(file_name)

# x = translate_transcripts(file_name='TranscribeTikTokAudio7144023384853171462.json')    
# x[['transcript']]

# split_and_translate('tenere una legge proporzionale perch. Presidente, a differenza di buona parte delle persone che sono qui dentro, noi i. random on the end')

file_name: TranscribeTikTokAudio7216786331005930778.json
file_name: TranscribeTikTokAudio7177010908625980678.json
file_name: TranscribeTikTokAudio7178163802578455813.json
file_name: TranscribeTikTokAudio7194909784926063877.json
file_name: TranscribeTikTokAudio7092079134247046405.json
file_name: TranscribeTikTokAudio7146670520505847045.json
file_name: TranscribeTikTokAudio7168946059706764550.json
file_name: TranscribeTikTokAudio7147206448232074502.json
file_name: TranscribeTikTokAudio7234480028908555547.json
file_name: TranscribeTikTokAudio7207816601217944837.json
file_name: TranscribeTikTokAudio7109083244611587333.json
file_name: TranscribeTikTokAudio7217889363739315462.json
file_name: TranscribeTikTokAudio7266312532665584928.json
file_name: TranscribeTikTokAudio7274498125660704033.json
file_name: TranscribeTikTokAudio7233290952285244698.json
file_name: TranscribeTikTokAudio7133867159503097093.json
file_name: TranscribeTikTokAudio7092750010739215621.json
file_name: TranscribeTikTokAudi

Copy the transcript files from the original directory to the appropriate working directory.  

In [89]:
current = {file_name for file_name in os.listdir(transcription_dir_location + 'en')}
prospective  = {file_name for file_name in os.listdir(transcription_dir_location + 'original/en') if file_name in current}
print('Current files to overwrite: ', len(current))
print('Prospective files to copy: ', len(prospective))

for file_name in prospective:
    src_file_path = transcription_dir_location + 'original/en/' + file_name
    shutil.copy(src_file_path, transcription_dir_location + 'en')

Current files to overwrite:  186
Prospective files to copy:  186


## Update with audio duration

In [None]:
def get_audio(id:str):
    filename = f'TranscribeTikTokAudio{id}.json'
    if filename == 'TranscribeTikTokAudio7060519835788250373.json':
        return 'missing'    
    transcript_df = pd.read_json(processed_dir + 'transcription/original/en/' + filename)
    print(transcript_df)
    x = transcript_df[0][0]
    # transcript_df[transcript_df[0][0]].copy()

    # print(x)
    # ignore if doesn't exist?

    return x

engagement_df = videos_df[['createTime',
        'id', 
        'videoId',
        'desc_en', 
        'textExtra_en', 
        'warnInfo',
        'collectCount', 
        'commentCount', 
        'diggCount',
        'playCount', 
        'shareCount', 
        'collectCountFormatted',
        'commentCountFormatted', 
        'diggCountFormatted',
        'playCountFormatted',
        'shareCountFormatted', 
        'coverImage', 
        'createDate', 
        'shareCountRatio',
        'commentCountRatio', 
        'diggCountRatio' ]]

engagement_df['audio'] = engagement_df.apply(lambda x: get_audio(x['videoId']), axis=1) 
engagement_df.to_csv(processed_dir + 'temp.csv')

Write out the results to a file. 

In [173]:
videos_df.to_json(processed_dir + 'clean-tiktok-metadata.json')

## Clean and Extract Hashtags

Extract hashtags and save to a file.

### Hashtags in Italian

In [181]:
hastags_dir = processed_dir + '/hashtags/'

hash_tags_it_list = [val.strip() for sublist in videos_df['textExtra'].dropna().str.split(",").tolist() for val in sublist]
hash_tags_it_count = pd.DataFrame(hash_tags_it_list,columns=['textExtra']).value_counts().reset_index().rename(columns={0:'count'})
hash_tags_it_count.to_csv(hastags_dir + 'it_hashtags.csv')
hash_tags_it_count.head(10)

Unnamed: 0,textExtra,count
0,fratelliditalia,14
1,gliappuntidigiorgia,11
2,votafdi,5
3,governo,4
4,lavoro,3
5,draghi,3
6,redditodicittadinanza,3
7,bufala,2
8,energiadaliberare,2
9,greenpass,2


### Hashtags in English

In [183]:
def translate_hashtag(text):
        text_en = translate_text(text)
        # translation introduces white space and hashtags have no whitespace
        return text_en.replace(' ', '-')

hash_tags_en_list = []
for item in hash_tags_it_list:
        hash_tags_en_list.append(translate_hashtag(item))
hash_tags_en_count = pd.DataFrame(hash_tags_en_list,columns=['textExtra_en']).value_counts().reset_index().rename(columns={0:'count'})
hash_tags_en_count.to_csv(hastags_dir + 'en_hashtags.csv')
hash_tags_en_count

Unnamed: 0,textExtra_en,count
0,Brothers-of-Italy,14
1,giorgia's-notes,11
2,votefdi,5
3,government,4
4,dragons,3
...,...,...
68,ecological-transition,1
69,elections,1
70,energy-crisis,1
71,examination,1


## Translate transcript text to English

In [None]:
import re

tiktok_df = pd.read_json(processed_dir + 'video-2023-09-14.json')
  
for file_name in os.listdir(transcription_dir_location + 'it'):
    transcriptions_df = translate_transcripts(file_name)

    transcriptions_df.to_json(transcription_dir_location + 'en/' + file_name)


In [None]:
def get_audio(id:str):
    filename = f'TranscribeTikTokAudio{id}.json'
    if filename == 'TranscribeTikTokAudio7060519835788250373.json':
        return 'missing'    
    transcript_df = pd.read_json(processed_dir + 'transcription/original/en/' + filename)
    print(transcript_df)
    x = transcript_df[0][0]
    # transcript_df[transcript_df[0][0]].copy()

    # print(x)
    # ignore if doesn't exist?

    return x

engagement_df = videos_df[['createTime',
        'id', 
        'videoId',
        'desc_en', 
        'textExtra_en', 
        'warnInfo',
        'collectCount', 
        'commentCount', 
        'diggCount',
        'playCount', 
        'shareCount', 
        'collectCountFormatted',
        'commentCountFormatted', 
        'diggCountFormatted',
        'playCountFormatted',
        'shareCountFormatted', 
        'coverImage', 
        'createDate', 
        'shareCountRatio',
        'commentCountRatio', 
        'diggCountRatio' ]]

engagement_df['audio'] = engagement_df.apply(lambda x: get_audio(x['videoId']), axis=1) 
engagement_df.to_csv(processed_dir + 'temp.csv')