In [101]:
import json
import glob
import os
import re
import spacy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import combinations
from spacymoji import Emoji
from typing import Tuple


# Module-level spaCy pipeline used by preprocessing.clean_text ('it_core_news_sm' is spaCy's small Italian model).
nlp = spacy.load('it_core_news_sm')
# Adds the "emoji" component at the front of the pipeline; clean_text reads `token._.is_emoji`.
# NOTE(review): the "emoji" factory is presumably registered by the `spacymoji` import above — confirm.
nlp.add_pipe("emoji", first=True)
# Folder holding the video spreadsheet and the per-video comment files (trailing slash is required,
# paths are built by plain string concatenation).
_data_path = "./TikTok/Tikapi/data/"
video_ids_file = _data_path + "video_list.csv"

In [33]:
# load spreadsheet of videos
df = pd.read_csv(video_ids_file)
# empty cells become "" so the string operations and comparisons below work on every row
df = df.fillna("")

# create additional columns
# id: the 19-digit number that follows "video/" in the link. str.extract returns NaN for a
# link without such a part (e.g. a blank spreadsheet row) instead of raising IndexError like
# re.findall(...)[0] did; those rows get an empty id and so never match a comment file.
df["id"] = (
    df["Link"]
    .astype(str)
    .str.extract(r"video/([0-9]{19})", expand=False)
    .fillna("")
)
# name: the politician if given, otherwise the influencer/tiktoker; spaces replaced by "_"
df["name"] = np.where(
    df["Politician"] == "",
    df["Influencer/tiktoker"].replace(" ", "_", regex=True),
    df["Politician"].replace(" ", "_", regex=True),
)
# file: expected location of the comment file, "<name>_com_<id>.json" inside _data_path
df["file"] = _data_path + df["name"] + "_com_" + df["id"] + ".json"

In [34]:
# load comments with meta data into dictionary (key = video_id)
dic = {}

for i, row in df.iterrows():
    # videos whose comment file is not on disk are skipped, so dic may hold fewer videos than df
    if os.path.isfile(row["file"]):
        # read as UTF-8 explicitly: without it open() uses the platform default encoding,
        # which can fail or garble non-ASCII text (accents, emoji) on some systems
        with open(row["file"], encoding="utf-8") as infile:
            comments = json.load(infile)
        dic[row["id"]] = {"raw_comments": comments["comments"], "meta": comments["meta"]}

In [36]:
print(f"Currently we have comments for {len(dic)} videos.")

Currently we have comments for 50 videos.


# Define Preprocessing functions

In [150]:
class preprocessing:
    """
    Helpers that turn raw comments into a weighted word co-occurrence edge list.

    The class keeps no state. clean_text relies on the module-level spaCy pipeline `nlp`,
    whose tokens must provide the `token._.is_emoji` extension.
    """

    def __init__(self) -> None:
        pass

    # NOTE: the `[]` defaults below are only read, never mutated, so sharing them between calls is harmless.
    def clean_text(self, text: str, search_words: list = []) -> list:
        """ 
        Cleans a string: tokenises it with `nlp` and returns the lemmas of the tokens that are kept.

        Dropped are stop words, punctuation, URLs, e-mail addresses, whitespace, emoji and every
        token whose text is listed in search_words. '@' is stripped from the remaining lemmas;
        a lemma that is empty after stripping is dropped as well.

        text: Input string like a sentence
        search_words: Words that were used to query for the input data and should therefore be removed (compared with the exact token text)
        output: bag of words (lemmas in order of appearance)
        """
        bag = []

        for token in nlp(text):
            # a single guard instead of a pyramid of nested ifs; the checks run in the same order
            if (
                token.is_stop
                or token.is_punct
                or token.like_url
                or token.like_email
                or token.is_space
                or token.lemma_ in ('\n', ' ')
                or token._.is_emoji
                or str(token) in search_words
            ):
                continue

            lemma = token.lemma_.replace('@', '')
            if lemma:
                bag.append(lemma)

        return bag

    def create_comment_list(self, dic: dict, search_words: list = []) -> list:
        """ 
        Creates a flat list of cleaned comments from a dictionary.

        dic: dictionary of videos; each value holds its comments under "raw_comments", each comment its text under "text"
        search_words: search words passed on to self.clean_text
        output: list of strings, one per comment, the kept lemmas joined by a single space
        """
        # search_words is forwarded here; it used to be accepted but silently ignored
        return [
            " ".join(self.clean_text(com["text"], search_words))
            for video in dic.values()
            for com in video["raw_comments"]
        ]

    def create_edge_dictionary(self, comments: list) -> dict:
        """ 
        Creates edges between two words that appear in the same comment. A weight is assigned according to the number of occurrences of an edge in the dataset.

        Edges are ordered pairs (earlier word, later word) as produced by itertools.combinations,
        so ("a", "b") and ("b", "a") are counted as different keys. Pairs of identical words
        (self loops) are skipped.

        comments: list of comments where each comment is a string of whitespace-separated words
        output: dictionary with edges (2-tuples of words) as keys and weights as values
        """
        weights = {}

        for com in comments:
            for edge in combinations(com.split(), 2):
                # if not self loop
                if edge[0] != edge[1]:
                    weights[edge] = weights.get(edge, 0) + 1

        return weights

    def edges_to_dataframe(self, edges: dict) -> pd.DataFrame:
        """ 
        Creates data frame of edges from an edge dictionary (self.create_edge_dictionary). This can be saved to csv to use in gephi. In python better use the dictionary as it is much faster.

        edges: dictionary of edges (key: pair of words, value: weight)
        output: pd.DataFrame with one row per edge and the columns "source", "target", "weigth"
        """
        # Build the frame in one go; concatenating one row per edge copies the whole frame each
        # time and becomes quadratic in the number of edges.
        # NOTE(review): "weigth" is a misspelling of "weight" but is kept because it is the
        # column name existing consumers of this frame / the exported csv see — confirm before renaming.
        rows = [(edge[0], edge[1], weight) for edge, weight in edges.items()]

        return pd.DataFrame(rows, columns=["source", "target", "weigth"])



# Create comment lists

In [151]:
# create instance of our preprocessing class (it holds no state; it only groups the helper methods)
prepro = preprocessing()

In [63]:
# loop over all videos loaded into dic, clean their comments and collect them in one flat list

com_list_all = prepro.create_comment_list(dic)

In [64]:
# define a subdictionary via some filter criteria from the spreadsheet (e.g. name)
# dic only holds the videos whose comment file exists on disk, so ids from the spreadsheet
# that are missing in dic are skipped instead of raising KeyError
subdic_meloni = {k: dic[k] for k in df.loc[df["name"]=="Meloni", "id"] if k in dic}
com_list_meloni = prepro.create_comment_list(subdic_meloni)

# Create & save edges

In [119]:
# create weighted edges. Two words are connected if they are in the same comment;
# the weight of an edge is the number of times that word pair occurs over all comments
edges_all = prepro.create_edge_dictionary(com_list_all)

In [153]:
# create dataframe of edges (columns "source", "target", "weigth") to be saved to csv for gephi
edges_all_df = prepro.edges_to_dataframe(edges_all)

In [155]:
# save edges to csv (written next to the input data in _data_path)
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first column by default;
# pass index=False if the consumer (e.g. gephi) should only see the edge columns — confirm.
edges_all_df.to_csv(_data_path + "edges_coms_all.csv")

<p style="text-align:center"> <i><b>Fin</b></i> </p>