In [3]:
import pandas as pd #basically the engine for the whole analysis. 
import matplotlib.pyplot as plt #for plotting our data.
import glob #a nice library for iterating through multiple files.
import networkx as nx #we need this to construct and export network graphs.
import seaborn as sns; sns.set() #for plotting
import csv #for reading and writing csv's when we are not using the pandas library.
import re
import sys

csv.field_size_limit(sys.maxsize)

%matplotlib inline

In [2]:
# Data locations - better to set these constants in a separate config file and import them here.
# All three data sets sit under the same DataCollection output folder, so the
# shared prefix is spelled out once and the per-set folder is appended to it.

output_root = '/home/dim/Documents/projecten/extremisme/youtube/yt/YouTubeExtremism/DataCollection/output/'

# Set path to NL data.

path_nl = output_root + 'NL/'

# Set path to the right-wing data (the 'right/' folder).

path_right = output_root + 'right/'

# Set path to the left-wing data (the 'left/' folder).
# NOTE(review): presumably this is the control group the analysis compares against - confirm.

path_left = output_root + 'left/'

# Questions

So we're all set up. Before we dive in, what kind of questions do we want to answer? 

1. What kind of content is being watched by viewers? (The producers)
2. Who is commenting on the videos in the far right information network? How are commenters interacting? (The users)
3. How do political parties compare in terms of content, marketing strategies and reach? (Comparison and strategies)
4. How does the far right information network compare to other information networks (like far left and center)? (Whataboutism)
5. What content is harmful, hateful, or illegal, in other words, when are lines being crossed? (Morality, the Platform)


# Question #1: The producers

For this we need:
1. Statistics on videos, channels and recommendations.
2. Topics of videos (by tags or through topic modelling)

Let's start by looking at the channels.

## Channels

### Show channel development over time

Socialblade.com provides a range of statistics on YouTube channels, like daily views and subscription info. I've run the list of channels through [socialblade.com](https://www.socialblade.com). I want to try to get a sense of the growth of the far right network in recent years, maybe in a bubble flow chart. It would make a great comparison with other information networks on YouTube. We can use four axes for that:
- x = monthly_views
- y = monthly_subscriptions
- z = monthly_comments (z is size of the bubble)
- plus time

The only constraint is that the oldest data is from early 2015, so it's not that old.

I'll prepare the data for use in [gapminder](https://www.gapminder.org/tools/), an easy way to explore this kind of data.

In [16]:
# Load the socialblade statistics for the right-wing channels.
# (For the left-wing run, build the location from path_left instead of path_right.)

social_blade_csv = path_right + 'other_platforms/social_blade_stats.csv'
channel_history = pd.read_csv(social_blade_csv)

In [17]:
# Extract all the dates and values of two columns: daily views and total subs.
# Every match is one 'yyyy-mm-dd,value' pair. The pattern is a raw string so that
# \d reaches the regex engine as-is instead of being an invalid string escape.

pattern = re.compile(r'(\d{4}-\d{2}-\d+,\d+)')

# And create two new columns with lists of dates and values found

channel_history['daily_views'] = channel_history['Date_Daily_Views'].str.findall(pattern)
channel_history['daily_subs'] = channel_history['Date_Total_Subs'].str.findall(pattern)


def _stack_date_values(history, list_column, value_name):
    """Turn a column holding lists of 'date,value' strings into a long table.

    Parameters
    ----------
    history : DataFrame with a 'User' column and the list column.
    list_column : name of the column that holds the lists of 'date,value' strings.
    value_name : name to give the extracted value column.

    Returns
    -------
    DataFrame with the columns 'channel_id' (taken from 'User'), 'date' and
    `value_name`. Date and value are still strings at this point.
    """
    # One row per (channel, pair): spread each list over columns, then stack the
    # columns back into rows and drop the positional level that stack() adds.
    pairs = (
        history.set_index('User')[list_column]
        .apply(pd.Series)
        .stack()
        .reset_index(level=-1, drop=True)
        .reset_index()
    )
    # expand=True returns the two halves as columns; unpacking the .str accessor
    # of the split result is not supported by current pandas.
    halves = pairs[0].str.split(',', n=1, expand=True)
    pairs['date'] = halves[0]
    pairs[value_name] = halves[1]
    return pairs[['User', 'date', value_name]].rename(columns={'User': 'channel_id'})


# Stack them, so all the dates and values are linked to the channels and
# we are getting rid of the messy lists.

daily_views = _stack_date_values(channel_history, 'daily_views', 'views')
daily_subs = _stack_date_values(channel_history, 'daily_subs', 'subs')

# And bring it all together in a dataframe called daily_stats.
# Left join: every subscription row is kept, views are added where the same
# channel and date have one.

daily_stats = pd.merge(daily_subs, daily_views, how='left', on=['channel_id', 'date'])


In [18]:
# Enrich the daily statistics with the channel data (like channel_title, etc.).
# (For the left-wing run, read path_left + 'channels_left.csv' instead.)

channels_int = pd.read_csv(path_right + 'channels_right.csv')

# Left-join the channel data on channel_id, then discard every row that still
# has a missing value in any column.

int_channel_daily_stats = (
    daily_stats
    .merge(channels_int, on='channel_id', how='left')
    .dropna()
)

In [19]:
# Prepare the columns needed to average views and subs per month.

# The date column is still text at this point, so parse it first.
parsed_dates = pd.to_datetime(int_channel_daily_stats['date'])

# - date:      replaced by the parsed timestamps
# - yearmonth: monthly period (yyyy-mm) derived from the parsed date
#              (separate year / month columns are not created)
# - subs/views: cast to integers, which the aggregation further down relies on
int_channel_daily_stats = int_channel_daily_stats.assign(
    date=parsed_dates,
    yearmonth=parsed_dates.dt.to_period('M'),
    subs=int_channel_daily_stats['subs'].astype('int'),
    views=int_channel_daily_stats['views'].astype('int'),
)

In [None]:
# Then it's time to get the comments and average out the comments per month
# (or should we sum them? Let's try both)

# Import the comments using an iterator (the comments file is 4.5GB).
# csv and sys come from the import cell at the top of the notebook; they are
# not imported a second time here.

# Lift the csv module's field size limit before reading the comments file.
csv.field_size_limit(sys.maxsize)

# Column labels handed to read_csv as names=, in file order.
columns = ['video_id',
           'comment_id',
           'comment_id2',
           'author_display_name',
           'author_image',
           'author_channel_url',
           'author_channel_id',
           'comment_text',
           'number_of_replies',
           'comment_date'
          ]
cols_to_keep = ['video_id', 'comment_date']

# Read 20000 rows at a time and keep just the two needed columns of each chunk
# before concatenating the chunks into one frame.
comment_chunks = pd.read_csv(path_right + 'comments_right.csv',
                             names=columns,
                             chunksize=20000,
                             engine='python')
comments_we_need = pd.concat([chunk.loc[:, cols_to_keep] for chunk in comment_chunks])

In [None]:
# Attach the channel title of each video to its comments.
# NOTE(review): here videos_right.csv is read as tab-separated, while the
# 'Topics info' cell reads the same file with sep='¶' and quotechar='þ' -
# confirm which of the two matches the file on disk.

videos = pd.read_csv(path_right + 'videos_right.csv', low_memory=False, index_col=None, sep='\t')
comments_channels_to_clean = (
    comments_we_need
    .merge(videos[['video_id', 'video_channel_title']], on='video_id')
    .dropna()
)

# Both inputs are no longer needed; release them to make room in memory.

del videos, comments_we_need

# Parse the comment dates (unparseable values become NaT) and derive the
# year, month and yyyy-mm period from them; finally rename the title column
# to channel_title.

comment_dates = pd.to_datetime(comments_channels_to_clean['comment_date'], errors='coerce')
comments_channels_to_clean = (
    comments_channels_to_clean
    .assign(comment_date=comment_dates,
            year=comment_dates.dt.year,
            month=comment_dates.dt.month,
            yearmonth=comment_dates.dt.to_period('M'))
    .rename(columns={'video_channel_title': 'channel_title'})
)



In [20]:
# Prepare the data for merging: keep only the columns that go into the export.
# TODO: the code is still quite messy - clean it up, make it more pythonic,
# maybe write a function.

export_columns = ['channel_title', 'subs', 'views', 'yearmonth']
int_channel_daily_stats = int_channel_daily_stats.loc[:, export_columns]

# Disabled for now: count the comments per channel_title / yearmonth and
# left-merge those counts onto the daily statistics.

#comments_channels_to_clean = comments_channels_to_clean.groupby([comments_channels_to_clean.channel_title, 
#                                                                 comments_channels_to_clean.yearmonth ]) \
#                                                               .agg('count')

#comments_channels_to_clean = comments_channels_to_clean \
#                            .rename(columns = {'video_id':'comments'}) \
#                            .reset_index()

#comments_channels_to_clean = comments_channels_to_clean[['channel_title', 'yearmonth', 'comments']]
#

#merged_comments = pd.merge(int_channel_daily_stats, 
#                           comments_channels_to_clean, 
#                          on=['channel_title', 'yearmonth'], 
#                           how='left')

#subset_for_graph = int_channel_daily_stats[['channel_id', 
#                                            'channel_title', 
#                                            'yearmonth', 
#                                            'subs', 
#                                            'views']]

In [15]:
# And bring it all finally together: reshape to long format (one row per
# channel / month / variable) and take the mean of the values within each group.
# NOTE(review): this cell labels its output 'left', but the loading cells above
# currently read from path_right (their path_left lines are commented out).
# Point those cells at the left data before running this one, otherwise right
# data ends up under the left label.

id_columns = ['channel_title', 'yearmonth']

df1 = int_channel_daily_stats.melt(id_vars=id_columns)

df2 = (
    df1
    .groupby(id_columns + ['variable'])
    .mean()
    .assign(type='left')
)

# Write it to csv for use in Gapminder

df2.to_csv('forgapminder_left.csv')

In [21]:
# Same reshaping as for the left data: long format (one row per channel /
# month / variable), mean of the values per group, labelled 'right'.

id_columns = ['channel_title', 'yearmonth']

df3 = int_channel_daily_stats.melt(id_vars=id_columns)

df4 = (
    df3
    .groupby(id_columns + ['variable'])
    .mean()
    .assign(type='right')
)

# Write it to csv for use in Gapminder

df4.to_csv('forgapminder_right.csv')

In [24]:
# Stack the left and right tables on top of each other.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# row index of both inputs is kept, exactly as append did.
df = pd.concat([df2, df4])

In [25]:
# Export the combined left + right table for the visualisation step.
combined_csv = 'output/for_viz/for_gapminder_left_and_right.csv'
df.to_csv(combined_csv)

In [None]:
# Show the combined table without 'year' / 'month' columns.
# The combined frame built above only has 'value' and 'type' columns, so a plain
# drop raises KeyError; errors='ignore' skips labels that are not present.
# drop() returns a new frame for display - df itself is left untouched.
df.drop(columns=['year', 'month'], errors='ignore')

In [23]:
# Preview the first rows only instead of rendering the whole table.
df4.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value,type
channel_title,yearmonth,variable,Unnamed: 3_level_1,Unnamed: 4_level_1
#TommyIsFree TOMMY ROBINSON NEWS,2018-01,subs,39.500000,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-01,views,687.500000,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-02,subs,522.739130,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-02,views,3617.086957,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-03,subs,2143.125000,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-03,views,15962.916667,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-04,subs,4213.055556,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-04,views,6114.444444,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-05,subs,5251.068966,right
#TommyIsFree TOMMY ROBINSON NEWS,2018-05,views,10116.827586,right


### Topics info

This still needs some work. The tags are malformed, and I'm not so sure about the quality of the transcripts. I would say this doesn't have a high priority, so I'll leave this for later and focus on the users first.

In [4]:
# Column labels for videos_right.csv. They are passed as names=, so pandas
# does not take the labels from the file itself.
column_names = [
    'video_id',
    'video_published',
    'channel_id',
    'video_title',
    'video_description',
    'video_channel_title',
    'video_tags',
    'video_category_id',
    'video_default_language',
    'video_duration',
    'video_view_count',
    'video_comment_count',
    'video_likes_count',
    'video_dislikes_count',
    'video_topic_ids',
    'video_topic_categories',
]

# Fields are separated by '¶' and quoted with 'þ'.
videos = pd.read_csv(
    path_right + 'videos_right.csv',
    engine='python',
    sep='¶',
    quotechar='þ',
    names=column_names,
)

In [5]:
# Build a long table with one row per (video, tag).

vidtags = videos[['video_id', 'video_title', 'video_tags']]

# Strip the list punctuation ([, ], ') and hyphens from the raw tag string,
# lower-case it and split it into one column per tag.
# regex=True is passed explicitly: the pattern is a regular expression, and
# pandas >= 2.0 treats the pattern as a literal string unless told otherwise.
tag_columns = (
    vidtags['video_tags']
    .str.replace(r"\[|\]|\'|-", '', regex=True)
    .str.lower()
    .str.split(', ', expand=True)
)

# Re-attach video_id / video_title by index, then melt the tag columns into
# rows and drop every row that has a missing value.
video_tags = (
    tag_columns
    .merge(vidtags, right_index=True, left_index=True)
    .drop(["video_tags"], axis=1)
    .melt(id_vars=['video_id', 'video_title'], value_name="tag")
    .drop(['variable'], axis=1)
    .dropna()
)

# Drop the 'not set' placeholder tags and order the rows by tag. sort_values
# returns a new frame, which avoids sorting a filtered slice in place.
video_tags = video_tags[~video_tags['tag'].str.contains('not set')].sort_values('tag')

In [6]:
# Clean it up: map spelling variants of a tag onto one canonical spelling.
# Keys are regular expressions; unanchored keys replace the matching substring,
# keys wrapped in ^...$ only replace a tag that matches as a whole.
# Hyphens are stripped from the tags when video_tags is built, so the two keys
# that contain a hyphen use '-?' to match the tag with or without it.

replacements = {
    'fpoe': 'fpö',
    'rebelmedia': 'rebel media',
    r'^hc$': 'hc strache',
    'вмро-?дпмне': 'vmro-dpmne',
    r'^strache$': 'hc strache',
    'news today': 'news',
    r'^heinz-?christian strache$': 'hc strache',
    r'^trump$': 'donald trump',
    'република македонија': 'republic of macedonia',
    'oesterreichzuerst': 'österreich zuerst',
    r'^zuerst$': 'österreich zuerst',
    r'^afd$': 'alternative für deutschland',
    r'^amren$': 'american renaissance',
}

# Assign the replaced column back instead of calling replace(inplace=True) on
# video_tags.tag: in-place edits on a column pulled out of the frame are
# chained assignment and are not guaranteed to reach the frame itself.
video_tags = video_tags.assign(tag=video_tags['tag'].replace(replacements, regex=True))




In [7]:
# Collapse tag variants onto canonical tags.
# Each rule is (regular expression, canonical tag): every tag in which the
# expression is found is overwritten with the canonical tag. The rules run in
# the order listed and later rules see the result of earlier ones (e.g. a tag
# containing 'antifa' is relabelled before the 'fascis' rule runs), so do not
# reorder them.
#
# Fixes relative to the earlier one-line-per-rule version:
# - islam: 'mosqu' added ('mosqe' could never match "mosque").
# - eu: '^eu\s|-$' also matched any tag ending in a hyphen; it now matches a
#   tag that is 'eu' or starts with 'eu ' / 'eu-'.
# - national socialism: the expression was garbled ('neonazi\nazis',
#   'sgenational socia', 'nationasozi'); it now matches 'nazi',
#   'national socia', 'nationalsocia' and 'nationalsozi'.
# - euro: written as a raw string so \s is not an invalid string escape.
# - labels 'bahhar al-assad' and 'poltical correctness' were misspelled.
# NOTE(review): in 'bay.|bayeri' the dot is a regex wildcard, so this also
# matches tags such as 'ebay x' - confirm what 'bay.' was meant to catch.

tag_rules = [
    ('trump', 'donald trump'),
    ('ukip', 'ukip'),
    ('femini', 'feminism'),
    ('immigra|inmigra|migran|migrat', 'immigration'),
    ('islam|mosle|mosqu|mosqe|moskee|muhamma|mohamm|prophe|quran|koran|musli|mosli', 'islam'),
    (r'^eu(?:\s|-|$)|parlement eur|european uni|europai|europa|europe|europäi', 'eu'),
    ('obama', 'barack obama'),
    (r'^christ$|^christian$|christiani|catholi|jesus|jezus|protestan|katholie|pope', 'christianity'),
    ('electio|wahl|verkiezi', 'election'),
    ('activis', 'activism'),
    ('adolf', 'adolf hitler'),
    ('alt right|alt-righ', 'alt-right'),
    ('anarch', 'anarchism'),
    ('merkel', 'angela merkel'),
    (r'antifa|^afa$', 'antifa'),
    ('asyl|flücht|vluchtel|refugee', 'refugees'),
    ('atheis', 'atheism'),
    (r'^bank$|banking|banker|banks|bankr', 'banks'),
    ('assad', 'bashar al-assad'),
    ('bay.|bayeri', 'bavaria'),
    ('ben shapi', 'ben shapiro'),
    ('border', 'borders'),
    ('brexi', 'brexit'),
    ('bruxe|bruss|brüsse', 'eu'),
    ('warming|carbon|climate|emissions|energy|energie|environment', 'climate'),
    ('nationalis', 'nationalism'),
    ('clinton', 'clintons'),
    ('communis', 'communism'),
    ('conservativ|conservatis', 'conservatism'),
    ('debat', 'debate'),
    ('economi', 'economics'),
    ('macron', 'emmanuel macron'),
    ('esoteri', 'esotericism'),
    (r'euro\s|euro-k|eurok|euroz', 'euro'),
    ('famil', 'family'),
    ('farage', 'nigel farage'),
    ('fascis', 'fascism'),
    ('finan|fiscal', 'finance'),
    (r'^fox', 'fox news'),
    (r'^gay$|gay\s|gays|lesbi', 'homosexuality'),
    ('gender', 'gender'),
    ('glenn be|glen bec|glenbeck|glennbeck', 'glenn beck'),
    ('government', 'government'),
    (r'gun\s|^gun$|2nd am', 'guns'),
    ('generation ident|génération identitaire|bloc iden', 'generation identity'),
    ('health', 'health care'),
    ('jack posob', 'jack posobiec'),
    ('qanon', 'qanon'),
    ('jihad', 'jihad'),
    (r'^jew$|^jews$|jew\s|jewis|juden|judeobol|judaism', 'jews'),
    (r'^left$|left\swing|leftis|leftwing|^linke$|linksextr|linksradi', 'leftwing'),
    (r'^lega$|lega\snord', 'lega nord'),
    (r'^murray$|murray\sro|libertaria|libertär|ludwig von mi|^mises$|mises\sin', 'libertarianism'),
    (r'^male$|mgtow|male\s|manosph|mcinnes|men going their own way|cernovich', 'men rights'),
    (r'nazi|national socia|nationalsocia|nationalsozi|nationasozi', 'national socialism'),
    (r'^orban$|orbán', 'viktor orban'),
    (r'^patriot$|patriota|patrioti|patriote|patriots', 'patriotism'),
    ('political correc|politically cor|politically in', 'political correctness'),
    (r'^race$|rassism|race realism|race rela|race and iq|racism|racial|racist', 'race'),
    (r'^parl', 'parlement'),
    (r'^police|polizi|polizei', 'police'),
    (r'^social just|^social inju|sjw', 'sjw'),
]

for tag_regex, canonical_tag in tag_rules:
    video_tags.loc[video_tags['tag'].str.contains(tag_regex), 'tag'] = canonical_tag



In [11]:
# Count how often each tag occurs (value_counts lists the most frequent first)
# and save that overview to csv.
tags_for_csv = video_tags['tag'].value_counts()
tags_overview_csv = 'output/right/tags_overview_sorted_by_count.csv'
tags_for_csv.to_csv(tags_overview_csv)

In [9]:
# Keep only the tags that occur more than MIN_TAG_COUNT times.
# NOTE(review): the threshold was missing from this line (`len(x) > ` is a
# syntax error). 10 is the value used by the vidtags filter further down -
# confirm that this is the cut-off that was intended here.
MIN_TAG_COUNT = 10
video_tags = video_tags.groupby('tag').filter(lambda rows: len(rows) > MIN_TAG_COUNT)

# List the unique tags from position 4000 onwards.
video_tags.tag.unique()[4000:].tolist()

['slovenski narod',
 'small business',
 'small enterprises',
 'smart',
 'smartphone',
 'smes',
 'smith',
 'smoking',
 'smr',
 'snow',
 'snowden',
 'snp',
 'soccer',
 'social',
 'social injustice warrior',
 'social justice',
 'social justice warrior',
 'social justice warriors',
 'social media',
 'social media censorship',
 'social psychology',
 'social science',
 'social security',
 'socialism',
 'socialism (political ideology)',
 'socialist',
 'socialists',
 'society',
 'sociology',
 'sociopath',
 'socofilms',
 'socrates',
 'sodomy',
 'sofia',
 'software',
 'software tutorial',
 'solar',
 'soldier',
 'soldiers',
 'solidaridad',
 'solidarisme',
 'somalia',
 'sommergespräch',
 'song',
 'songs',
 'sony',
 'sorcery',
 'soros',
 'sotu',
 'soul',
 'sound',
 'south',
 'south africa',
 'south america',
 'south burlington',
 'south carolina',
 'south korea',
 'southern',
 'souveränität',
 'sovereignty',
 'soviet',
 'soviet bear',
 'soviet union',
 'soviet union (country)',
 'sozial',
 'soziale

In [None]:
# Keep the tags that are used more than 10 times.
# (The original note said 50, but the filter has always been > 10 - confirm
# which of the two is intended.)
# vidtags as built above has no 'tags' column (only video_id, video_title and
# video_tags), so grouping it by 'tags' raises KeyError. The per-tag rows live
# in video_tags under the column 'tag'; that column is renamed to 'tags' here
# because the cells that follow select a column called 'tags'.
# NOTE(review): confirm that video_tags is the frame this filter was meant for.

vidtags = (
    video_tags
    .rename(columns={'tag': 'tags'})
    .groupby('tags')
    .filter(lambda rows: len(rows) > 10)
)

In [None]:
# Then get the video data with these tags (left join: every tag row is kept).

videos_tags = pd.merge(vidtags, videos, on='video_id', how='left')

# Create a new dataframe with just the columns needed per tag.
# .copy() makes it an independent frame, so the column assignments below do not
# write into a slice of the merged frame (SettingWithCopyWarning).

videos_tags = videos_tags[['tags', 'video_channel_title','video_published', 'video_view_count', 'video_comment_count', 'video_likes_count']].copy()

# Remove some stuff (currently disabled)

#to_remove = ['Voice of Europe', 'Matthew & Doris', 'Al Stankard aka HAarlem VEnison']

#videos_tags = videos_tags[~videos_tags.video_channel_title.isin(to_remove)]

# Parse the publication date and add a year-month (yyyy-mm period) column.

videos_tags['video_published'] = pd.to_datetime(videos_tags['video_published'])
videos_tags['yearmonth'] = videos_tags['video_published'].dt.to_period('M')

In [None]:
# Number of rows in the tag/video table.
videos_tags.shape[0]

In [None]:
# Sum the view, comment and like counts per tag and month.
metric_columns = ['video_view_count', 'video_comment_count', 'video_likes_count']
tags_for_gm = videos_tags.groupby(['tags', 'yearmonth'])[metric_columns].sum()

In [None]:
# Save the per-tag monthly totals for use in Gapminder.
tags_gapminder_csv = path_right + 'temp_data/tags_for_gapminder.csv'
tags_for_gm.to_csv(tags_gapminder_csv)

In [10]:
# Show the column labels of the videos frame.
# NOTE(review): the output stored with this cell lists an 'Unnamed: 0' column
# that is not among the names passed to read_csv in the 'Topics info' cell -
# it looks like it came from a different load of the file; re-run to confirm.
videos.columns

Index(['Unnamed: 0', 'video_id', 'video_published', 'channel_id',
       'video_title', 'video_description', 'video_channel_title', 'video_tags',
       'video_category_id', 'video_default_language', 'video_duration',
       'video_view_count', 'video_comment_count', 'video_likes_count',
       'video_dislikes_count', 'video_topic_ids', 'video_topic_categories'],
      dtype='object')

Build a search function to get the videos associated with certain tags

In [41]:
# Search terms; a video is a hit when any of them occurs in its tags.
query = [
    'race realis', 'race and iq', 'race iq',
    'murray', 'bell curve', 'jared taylor',
    'eugenics', 'galton', 'genetics', 'dna',
]

# Escape each term so it is matched literally, then join the terms into one
# alternation.
esc_query = list(map(re.escape, query))
pattern = '|'.join(esc_query)

# Case-insensitive match on the tags; videos without tags count as no match.
mask = videos['video_tags'].str.contains(pattern, case=False, na=False)
result = videos.loc[mask]

# Number of matching videos.
len(result)

2065

In [42]:
# Number of tag matches per channel, most frequent first.
result['video_channel_title'].value_counts()

Alt-Right Tankie- Eurasianist    514
starisloven14                    191
American Renaissance             124
misesmedia                       117
Rebel Media                      105
Red Ice TV                        57
Styxhexenhammer666                50
Stefan Molyneux                   48
Thinking-Ape                      41
DCCI Ministries                   32
Sean Last                         32
The Patriotic Report              30
Jean-Francois Gariépy             28
Pilleater                         24
EuropeanWatchman2                 21
The Alternative Hypothesis        20
Martin Willett                    19
PJ Media                          19
RockingMrE                        19
Libertarian Realist               17
ReasonTV                          16
The Rubin Report                  15
Verbo Tempestas                   14
JFG Livestreams                   14
TheBlaze                          11
Argent                            11
The Daily Wire                    11
n

In [43]:
# Run the same search pattern against the video descriptions
# (case-insensitive; videos without a description count as no match).
descriptions = videos['video_description']
mask2 = descriptions.str.contains(pattern, case=False, na=False)
result2 = videos.loc[mask2]

# Number of matching videos.
len(result2)

1391

In [44]:
# Number of description matches per channel, most frequent first.
result2['video_channel_title'].value_counts()

Red Ice TV                       142
American Renaissance             109
The Alternative Hypothesis        98
Rebel Media                       72
misesmedia                        64
nemzeti1tv                        52
Stefan Molyneux                   44
ReasonTV                          41
starisloven14                     35
Jean-Francois Gariépy             30
The Rubin Report                  24
Libertarian Realist               22
EuropeanWatchman2                 21
Fatmir Alispahic                  20
PragerU                           17
Newstalk                          16
The Patriotic Report              16
Nordiska motståndsrörelsen        14
Alt-Right Tankie- Eurasianist     12
RockingMrE                        12
Thinking-Ape                      12
TheBlaze                          10
Survive the Jive                   9
FreedomWorks                       9
LaughingMan0X                      8
PJ Media                           8
Matthew Drake                      8
Q