In [1]:
import os

import pandas as pd
from langdetect import detect

import src.mongos as mg

# Constants

In [2]:
# Database name passed to mg.read_mongo for every read in this notebook.
db_name = 'youtube_comment'

# Video ids to analyse. Each id is also used as the name of the collection
# that is read for that video (see get_df_from_vid).
vids = [  # vid (video id) doubles as the collection name
    'XsX3ATc3FbA',  # "A Poem for Small Things" (Korean title, translated)
]

# Load comments

In [3]:
def get_df_from_vid(vid):
    """Load the stored comment pages for one video.

    Reads the collection named ``vid`` from the ``db_name`` database through
    ``mg.read_mongo`` and returns that helper's result unchanged. The result
    is expected to carry the columns ``comments``, ``nextPageToken`` and
    ``savedDateTime``.
    """
    pages = mg.read_mongo(db_name, vid)
    return pages

In [4]:
# Load the stored comment pages of the first configured video.
raw = get_df_from_vid(vids[0])
raw.head()  # preview the first rows

Unnamed: 0,comments,nextPageToken,savedDateTime
0,"[{'id': 'UgwoSFJSmi55kK1I3eF4AaABAg', 'author'...",EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,2019-04-17 16:32:26.501
1,"[{'id': 'Ugx_VBzgjApKX6mdFmx4AaABAg', 'author'...",EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,2019-04-17 16:32:27.907
2,"[{'id': 'Ugz6oK4sAG1SByC4In14AaABAg', 'author'...",EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,2019-04-17 16:32:29.237
3,"[{'id': 'Ugxvo93EDTtFen6i9_14AaABAg', 'author'...",EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,2019-04-17 16:32:31.018
4,"[{'id': 'Ugx4l1eRrz_7XpWhiOV4AaABAg', 'author'...",EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,2019-04-17 16:32:32.830


In [5]:
# Summarise the raw frame: row count, per-column dtype / non-null count, memory.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29942 entries, 0 to 29941
Data columns (total 3 columns):
comments         29942 non-null object
nextPageToken    29942 non-null object
savedDateTime    29942 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 701.8+ KB


# Transform comments

In [6]:
def add_token_and_date_to_comments(row):
    """Flatten one stored page into a list of stamped comment dicts.

    ``row`` must provide ``comments`` (a list of comment dicts),
    ``nextPageToken`` and ``savedDateTime``. Every comment, and every reply
    found under a comment's optional ``replies`` key, is copied with the
    page's ``nextPageToken`` and ``savedDateTime`` added.

    Returns a list with all replies first (in comment order), followed by the
    top-level comments. Top-level comments keep their ``replies`` entry.
    The input dicts are not modified.
    """
    top_level = row['comments']  # list of comment dicts
    # Fields copied from the page onto every comment and reply.
    stamp = {
        'nextPageToken': row['nextPageToken'],
        'savedDateTime': row['savedDateTime'],
    }

    stamped_replies = [
        {**reply, **stamp}
        for parent in top_level
        for reply in parent.get('replies', [])
    ]
    stamped_comments = [{**entry, **stamp} for entry in top_level]
    return stamped_replies + stamped_comments


def get_comments_from_series_list_cmt(series_list_cmt):
    """Concatenate an iterable of comment lists into one DataFrame.

    ``series_list_cmt`` yields lists of comment dicts (for example the Series
    produced by applying ``add_token_and_date_to_comments`` row-wise). All
    dicts are collected in order and turned into a frame with one row each.
    """
    flattened = []
    for comment_list in series_list_cmt:
        flattened.extend(comment_list)
    return pd.DataFrame(flattened)
    

# series_list_cmt = sample.apply(lambda x: add_token_and_date_to_comments(x), axis=1);series_list_cmt
# raw_comments = get_comments_from_series_list_cmt(series_list_cmt); raw_comments

# replies = raw_comments['replies'].loc[raw_comments['replies'].notna()]
# comments_from_replies = get_comments_from_series_list_cmt(replies)

# Load langs

In [7]:
# Language reference table; add_langs joins it on the '639-1' column.
langs = mg.read_mongo('youtube_comment', 'languages')
langs.head()  # preview the first rows

Unnamed: 0,639-1,639-2/B,639-2/T,639-3,ISO language name,Language family,Native name (endonym),Notes
0,ab,abk,abk,abk,Abkhazian,Northwest Caucasian,"аҧсуа бызшәа, аҧсшәа",also known as Abkhaz
1,aa,aar,aar,aar,Afar,Afro-Asiatic,Afaraf,
2,af,afr,afr,afr,Afrikaans,Indo-European,Afrikaans,
3,ak,aka,aka,aka + 2,Akan,Niger–Congo,Akan,"macrolanguage, Twi is [tw/twi], Fanti is [fat]"
4,sq,alb,sqi,sqi + 4,Albanian,Indo-European,Shqip,"macrolanguage, ""Albanian Phylozone"" in 639-6"


# Add langs

In [8]:
from langdetect.lang_detect_exception import LangDetectException

# Maps the numeric ``code`` of a LangDetectException to a readable label.
# The position in the tuple is the code (0, 1, 2, ...).
# NOTE(review): presumably mirrors langdetect's own error-code numbering;
# verify against the installed langdetect version.
error_codes = dict(enumerate((
    'NoTextError',
    'FormatError',
    'FileLoadError',
    'DuplicateLangError',
    'NeedLoadProfileError',
    'CantDetectError',
    'CantOpenTrainData',
    'TrainDataFormatError',
    'InitParamError',
)))

def detect_lang(row):
    """Detect a comment's language, falling back to the author name.

    ``row`` must provide ``text`` and ``author``. ``detect`` is tried on the
    text first. If that raises ``LangDetectException`` or ``TypeError``, it is
    tried on the author name instead.

    Returns the value from ``detect``. If the fallback also fails, returns the
    ``error_codes`` label for the exception's ``code``, or the string
    ``'TypeError'`` when the author value raised ``TypeError``.
    """
    text, author = row['text'], row['author']

    try:
        return detect(text)
    except (LangDetectException, TypeError):
        pass  # text was unusable; fall through to the author name

    try:
        return detect(author)
    except LangDetectException as err:
        return error_codes[err.code]
    except TypeError:
        return 'TypeError'

def detect_text(t):
    """Detect the language of a comment text.

    Returns the value from ``detect(t)``. When detection raises
    ``LangDetectException``, returns the ``error_codes`` label for the
    exception's ``code``. When ``t`` makes ``detect`` raise ``TypeError``
    (a non-string value), returns the string ``'TypeError'``.
    """
    try:
        return detect(t)
    except LangDetectException as e:
        return error_codes[e.code]
    except TypeError:
        # Same convention as detect_lang: one non-string cell must not abort
        # a whole ``Series.apply`` run.
        return 'TypeError'

def detect_author(a):
    """Detect the language of an author name.

    Returns the value from ``detect(a)``. When detection raises
    ``LangDetectException``, returns the ``error_codes`` label for the
    exception's ``code``. When ``a`` makes ``detect`` raise ``TypeError``
    (a non-string value), returns the string ``'TypeError'``.
    """
    try:
        return detect(a)
    except LangDetectException as e:
        return error_codes[e.code]
    except TypeError:
        # Same convention as detect_lang: one non-string cell must not abort
        # a whole ``Series.apply`` run.
        return 'TypeError'
    

def add_langs(raw_comments):
    """Return a copy of ``raw_comments`` with detected-language columns added.

    Added columns:

    * ``639-1`` - result of ``detect_lang`` for each row's ``text`` and
      ``author`` (a language code, or an error label when detection failed).
    * ``ISO language name`` - looked up by ``639-1`` in the module-level
      ``langs`` table (left join). Rows without a match fall back to the raw
      ``639-1`` value, so the column has no missing entries.

    The input frame is left unmodified.
    """
    comments = raw_comments.copy()

    comments['639-1'] = comments[['text', 'author']].apply(detect_lang, axis=1)
    comments = comments.merge(langs[['ISO language name', '639-1']], on='639-1', how='left')
    # Assign the filled column back. ``fillna(..., inplace=True)`` on a column
    # selection is a chained in-place update, which does not write through to
    # the frame under pandas Copy-on-Write.
    comments['ISO language name'] = comments['ISO language name'].fillna(comments['639-1'])
    return comments

# comments['lang_text'] = comments['text'].apply(detect_text)
# comments['lang_author'] = comments['author'].apply(detect_author)
# comments = comments.merge(langs[['ISO language name', '639-1']], left_on='lang_text', right_on='639-1', how='left')

# comments = add_langs(raw_comments)

# print(len(comments))
# comments

# Transform Columns

In [9]:
def rename_columns(comments):
    """Return a copy of ``comments`` with the language columns renamed.

    ``639-1`` becomes ``langCode`` and ``ISO language name`` becomes
    ``langName``. Other columns are left as they are.
    """
    new_names = {
        '639-1': 'langCode',
        'ISO language name': 'langName',
    }
    return comments.rename(columns=new_names)


def drop_columns(comments):
    """Return a copy of ``comments`` without the ``replies`` column.

    A frame that has no ``replies`` column is returned unchanged (as a copy)
    instead of raising ``KeyError``. The column is absent when none of the
    loaded comments carried replies, or when this step is run a second time
    on an already-cleaned frame.
    """
    return comments.drop(columns='replies', errors='ignore')

# Calculate date time

In [10]:
from datetime import timedelta
import re


# Seconds per unit word (singular) as used in relative timestamps such as
# "4 minutes ago". Months and years are approximated as 30 and 365 days.
_SECONDS_PER_UNIT = {
    'second': 1,
    'minute': 60,
    'hour': 60 * 60,
    'day': 24 * 60 * 60,
    'week': 7 * 24 * 60 * 60,
    'month': 30 * 24 * 60 * 60,
    'year': 365 * 24 * 60 * 60,
}


def add_date_time(row):
    """Estimate when a comment was posted from its relative timestamp.

    ``row`` must provide ``savedDateTime`` (the moment the page was saved) and
    ``time`` (text such as ``"9 seconds ago"`` or ``"2 hours ago (edited)"``).
    The elapsed time is subtracted from ``savedDateTime`` and nine hours are
    added to express the result in Korean time.
    NOTE(review): the nine-hour shift assumes ``savedDateTime`` is stored in
    UTC - confirm against the writer of the collection.

    Returns the estimated posting time, or ``pd.NaT`` when ``time`` is not a
    string or contains no "<number> <known unit>" pair.
    """
    saved = row['savedDateTime']
    time = row['time']

    # Number followed by the unit word; any surrounding text is ignored.
    parsed = re.search(r'(\d+)\s*([a-z]+)', time.lower()) if isinstance(time, str) else None
    if parsed is None:
        return pd.NaT

    amount = int(parsed.group(1))
    unit = parsed.group(2).rstrip('s')  # "minutes" -> "minute"
    if unit not in _SECONDS_PER_UNIT:
        return pd.NaT

    elapsed = timedelta(seconds=amount * _SECONDS_PER_UNIT[unit])
    return saved - elapsed + timedelta(hours=9)  # Korean Time

In [94]:
# Scratch preview of add_date_time on the comment frame.
# NOTE(review): `comments` is only assigned in the "Execution" section further
# down, so this cell fails on a fresh kernel when the notebook is run top to
# bottom.
comments[['savedDateTime', 'time']].apply(add_date_time, axis=1)

0     2019-04-17 16:32:17.501
1     2019-04-17 16:32:15.501
2     2019-04-17 16:32:13.501
3     2019-04-17 16:32:11.501
4     2019-04-17 16:32:08.501
5     2019-04-17 16:32:07.501
6     2019-04-17 16:32:03.501
7     2019-04-17 16:31:54.501
8     2019-04-17 16:31:46.501
9     2019-04-17 16:31:46.501
10    2019-04-17 16:31:42.501
11    2019-04-17 16:31:42.501
12    2019-04-17 16:31:39.501
13    2019-04-17 16:31:38.501
14    2019-04-17 16:31:37.501
15    2019-04-17 16:31:37.501
16    2019-04-17 16:31:34.501
17    2019-04-17 16:31:32.501
18    2019-04-17 16:31:26.501
19    2019-04-17 16:31:26.501
20    2019-04-17 16:31:47.907
21    2019-04-17 16:32:00.907
22    2019-04-17 16:31:27.907
23    2019-04-17 16:31:27.907
24    2019-04-17 16:31:27.907
25    2019-04-17 16:31:27.907
26    2019-04-17 16:31:27.907
27    2019-04-17 16:31:27.907
28    2019-04-17 16:31:27.907
29    2019-04-17 16:31:27.907
                ...          
89    2019-04-17 16:29:31.018
90    2019-04-17 16:29:31.018
91    2019

In [88]:
# Scratch check of the pattern used in add_date_time: the first group captures
# the leading number as a string.
re.search(r'(\d+)\s', '4 minutes ago').group(1)

'4'

# Execution

In [None]:
# First rows of the raw frame, kept as a small input for trying out the
# transform functions.
sample = raw.head()

In [11]:
# For each stored page, build the list of its comments and replies stamped
# with that page's token and save time.
series_list_cmt = raw.apply(add_token_and_date_to_comments, axis=1)
series_list_cmt

0        [{'id': 'UgwoSFJSmi55kK1I3eF4AaABAg', 'author'...
1        [{'id': 'UgwdsFKPLkKI9PSLe_B4AaABAg.8tplTwmFs8...
2        [{'id': 'Ugw2SWiQ9L-WM81arPd4AaABAg.8tplStq-sN...
3        [{'id': 'Ugxvo93EDTtFen6i9_14AaABAg.8tplMaNTBy...
4        [{'id': 'UgyN-1ghJRP1QRZbsKB4AaABAg.8tpl8mMiYb...
5        [{'id': 'UgzGqIbsRkfmgv5Nekh4AaABAg.8tpl58tBbz...
6        [{'id': 'UgxQqdDUIPraI_zGhOV4AaABAg', 'author'...
7        [{'id': 'UgxUIrlB1mv9Jc6HoT54AaABAg.8tpkuZA45i...
8        [{'id': 'Ugzi6mK4ziW3Thw6xEZ4AaABAg.8tpkq11B0h...
9        [{'id': 'UgxQvQs6H9IoJb8YdDN4AaABAg.8tpkjtUFc7...
10       [{'id': 'Ugy4A6FvZI7VNgSTZq14AaABAg.8tpkdCdfTx...
11       [{'id': 'UgwlhswzYfGqZ3iYSZJ4AaABAg.8tpkaDbiKU...
12       [{'id': 'Ugzz782TlEfeyAPv1IJ4AaABAg.8tpkUoQMYR...
13       [{'id': 'UgyN7RqqtdhPcG9P8Fd4AaABAg.8tpkMSsyH5...
14       [{'id': 'UgzZ-R46wpm1gtMMliB4AaABAg.8tpkJeTgZb...
15       [{'id': 'UgwV-JLN7LF6VF2nKhd4AaABAg.8tpkDiuHT0...
16       [{'id': 'UgyZRRHtNnfmreLx6O94AaABAg.8tpk9UgHhX.

In [13]:
# Flatten the per-page lists into one frame with a row per comment or reply.
raw_comments = get_comments_from_series_list_cmt(series_list_cmt)
raw_comments.head()  # preview the first rows

Unnamed: 0,author,authorLink,authorThumb,edited,hasReplies,id,likes,nextPageToken,numReplies,replies,repliesToken,savedDateTime,text,time,timestamp
0,Nhung Nguyen,/channel/UC1PS8rESewcXAQaILjwp5Rw,https://yt3.ggpht.com/-LqedvkhhNgM/AAAAAAAAAAI...,False,False,UgwoSFJSmi55kK1I3eF4AaABAg,0,EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,,,,2019-04-17 16:32:26.501,보라해\n 방탄소년단×아미\n베트남,9 seconds ago,1555519000000.0
1,DAMMAR OY,/channel/UCemVva3rTGyw9UukIm-UfaQ,https://yt3.ggpht.com/-ERau5tSnPmk/AAAAAAAAAAI...,False,False,UgwQXZq0Gu8M4x2KJuR4AaABAg,1,EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,,,,2019-04-17 16:32:26.501,Beat the world record;),11 seconds ago,1555519000000.0
2,Nhu Minie,/channel/UCCAVkBY3T95R158d-fHEZDg,https://yt3.ggpht.com/-gRt33JzS8rI/AAAAAAAAAAI...,False,False,UgxmN6-QLDFCl2rPyKx4AaABAg,0,EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,,,,2019-04-17 16:32:26.501,"well, the number is gradually slowing down :( ...",13 seconds ago,1555519000000.0
3,JUNG KOOKiE,/channel/UCu76hhfYZFEz1TH5UrpuGUg,https://yt3.ggpht.com/-EQBxzBIi6Z8/AAAAAAAAAAI...,False,False,UgwTXeJvgWg38VczBWx4AaABAg,1,EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,,,,2019-04-17 16:32:26.501,To those who streams.\n•DON'T clear YT history...,15 seconds ago,1555519000000.0
4,CHUBBY JK,/channel/UCr_oUM5imlQqOoERdUJUfnA,https://yt3.ggpht.com/-SSsjOlr80CA/AAAAAAAAAAI...,False,False,Ugx66bCBBj5LSxG-fNZ4AaABAg,0,EiYSC1hzWDNBVGMzRmJBwAEAyAEA4AEBogINKP________...,,,,2019-04-17 16:32:26.501,Love is nothing stronger than the bond between...,18 seconds ago,1555519000000.0


In [None]:
# Attach the detected language code and language name to every comment.
comments = add_langs(raw_comments)
comments.head()  # preview the first rows

In [None]:
# Rename '639-1' / 'ISO language name' to 'langCode' / 'langName'.
comments = rename_columns(comments)

In [None]:
# Remove the nested 'replies' column.
comments = drop_columns(comments)

In [None]:
# Derive each comment's posting time from the page save time and the relative
# 'time' text (see add_date_time).
comments['dateTime'] = comments[['savedDateTime', 'time']].apply(add_date_time, axis=1)

In [None]:
# Row count of the final frame, followed by the frame itself.
print(len(comments))
comments

# ...

# Groupby language

In [80]:
# Number of comments per detected language name.
comments.groupby('langName')[['id']].count()

Unnamed: 0_level_0,id
langName,Unnamed: 1_level_1
Afrikaans,1
Albanian,1
Danish,2
"Dutch, Flemish",1
English,61
German,3
Hungarian,1
Indonesian,2
Italian,2
Korean,2


In [81]:
# Sanity check: count comments left without a language name (expected 0,
# because add_langs falls back to the raw code).
comments['langName'].isna().sum()

0