# Refining raw data for Tableau use

In [1]:
# Importing relevant packages
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Relative locations of the three raw .csv exports
channels_csv, chat_csv, superchat_csv = (
    Path("raw_data/channels.csv"),
    Path("raw_data/chat_stats.csv"),
    Path("raw_data/superchat_stats.csv"),
)

# Read every export into its own DataFrame (same order as the paths above)
channels, chat, superchat = (
    pd.read_csv(csv_path) for csv_path in (channels_csv, chat_csv, superchat_csv)
)

## Filling NaN in channels DataFrame and adjusting group

In [3]:
# A NaN "group" means the channel is not affiliated with any group/generation,
# so replace it with a more descriptive label.
channels["group"] = channels["group"].fillna("No Specific Group/Generation")

# These VTubers are wrongly assigned in the raw data and belong to Generation 0.
gen_zero = ["Tokino Sora", "Roboco-san", "Sakura Miko", "Hoshimachi Suisei", "AZKi"]

# A single vectorized mask selects every row whose englishName is in gen_zero,
# instead of issuing one .loc assignment per name in a Python loop.
channels.loc[channels["englishName"].isin(gen_zero), "group"] = "Generation 0"

## Creating nonMemberPercentage column for the chat DataFrame

In [4]:
# Add the percentage of unique chatters who are not members, then discard the
# two columns that are not used afterwards.
chat = (
    chat
    .assign(
        nonMemberPercentage=lambda frame: (
            (frame["uniqueChatters"] - frame["uniqueMembers"]) / frame["uniqueChatters"]
        ) * 100
    )
    .drop(columns=["bannedChatters", "deletedChats"])
)

## Cleaning up superchat DataFrame

In [5]:
# Remove the message-length, currency and color columns from the superchat table
superchat = superchat.drop(
    [
        "totalMessageLength",
        "averageMessageLength",
        "mostFrequentCurrency",
        "mostFrequentColor",
    ],
    axis="columns",
)

## Final result

In [6]:
# Preview the first five rows of the channels table (head() defaults to 5)
channels.head()

Unnamed: 0,channelId,name,englishName,affiliation,group,subscriptionCount,videoCount,photo
0,UCJFZiqLMntJufDCHc6bQixg,hololive ホロライブ - VTuber Group,Hololive VTuber Group,Hololive,No Specific Group/Generation,1790000,509,https://yt3.ggpht.com/ytc/AMLnZu-FamPA8ofQShmC...
1,UCfrWoRGlawPQDQxxeIDRP0Q,hololive Indonesia,Hololive Indonesia,Hololive,No Specific Group/Generation,339000,69,https://yt3.ggpht.com/ytc/AMLnZu8aac2EJS9DCkeC...
2,UCotXwY6s8pWmuWd_snKYjhg,hololive English,Hololive English,Hololive,No Specific Group/Generation,516000,34,https://yt3.ggpht.com/ytc/AMLnZu8qlq9NWNSS3MkV...
3,UCWsfcksUUpoEvhia0_ut0bA,holostars ホロスターズ - VTuber Group,Holostars Official,Hololive,No Specific Group/Generation,193000,150,https://yt3.ggpht.com/5Jn_OHkOZhZssXXwWopUQZVw...
4,UCp6993wxpyDPHUpavwDFqgg,SoraCh. ときのそらチャンネル,Tokino Sora,Hololive,Generation 0,1000000,667,https://yt3.ggpht.com/ytc/AMLnZu-Uc3gukr4oNSY0...


In [7]:
# Preview the first five rows of the chat table (head() defaults to 5)
chat.head()

Unnamed: 0,channelId,period,chats,memberChats,uniqueChatters,uniqueMembers,nonMemberPercentage
0,UC--A2dwZW7-M2kID0N6_lfA,2021-03,32116,7874,1112,141,87.320144
1,UC-hM6YJuNYVAmUWxeIr9FeA,2021-03,1569241,378133,60399,4959,91.789599
2,UC-o-E6I3IC2q8sAoAuM6Umg,2021-03,220362,25920,13101,307,97.656667
3,UC01gb86Qdlkh23Nqk3A1OLQ,2021-03,30686,12753,1082,130,87.985213
4,UC0Owc36U9lOyi9Gx9Ic-4qg,2021-03,446411,178018,7413,710,90.422231


In [8]:
# Preview the first five rows of the superchat table (head() defaults to 5)
superchat.head()

Unnamed: 0,channelId,period,superChats,uniqueSuperChatters,totalSC,averageSC
0,UCFKOVgVbGmX65RxO3EtH3iw,2021-03,5552,1043,3482098,627
1,UCV5ZZlLjk5MKGg3L0n0vbzw,2021-03,220,121,199004,904
2,UCvaTdHTWBGv3MKj3KVqJVCw,2021-03,2653,796,1576726,594
3,UCNVEsYbiZjH5QLmGeSgTSzg,2021-03,166,124,205611,1238
4,UCl_gCybOJRIgOXw6Qb4qJzQ,2021-03,3468,835,3773715,1088


## Saving into .csv files

In [9]:
# to_csv does not create missing folders, so make sure the output directory
# exists; otherwise a fresh checkout fails here on Restart & Run All.
Path("processed_data").mkdir(parents=True, exist_ok=True)

# index=False keeps the pandas row index out of the saved files.
channels.to_csv('processed_data/channel.csv', index=False)
chat.to_csv('processed_data/chat.csv', index=False)
superchat.to_csv('processed_data/superchat.csv', index=False)