In [17]:
import pandas as pd
import config
import csv
import re
from apiclient.discovery import build
from apiclient.errors import HttpError

In [12]:
YOUTUBE_API_SERVICE_NAME = config.YOUTUBE_API_SERVICE_NAME
YOUTUBE_API_VERSION = config.YOUTUBE_API_VERSION
DEVELOPER_KEY = config.DEVELOPER_KEY
PATH_TEMP_RIGHT = config.PATH_TEMP_RIGHT

In [6]:
seeds = pd.read_csv(PATH_TEMP_RIGHT + 'right_seeds.csv')

In [7]:
seeds.head()

Unnamed: 0,Id,Label,country,isseed,seedrank,subscribercount,videocount,viewcount(100s),Unnamed: 8,publishedat,daysactive
0,UC_0dwPeY0vQSJGVfRpFvGUg,Winteroffensief,not set,no,,32.0,4.0,1150.0,,2006-06-02T19:42:03.000Z,4256.0
1,UC_8WUrPbi8clO6sWt_FDvuA,Rand Paul,US,no,,9771.0,254.0,26128.0,,2011-07-05T15:52:45.000Z,2397.0
2,UC_bjx_30CFcJFlcjPtG7VHQ,Impact Theory Studios,US,no,,3760.0,15.0,237.0,,2010-10-13T22:38:04.000Z,2662.0
3,UC_FX5j6DnMjsocBYFCYu6rg,LDDpersdienst,not set,yes,8.0,27.0,63.0,372.0,,2006-08-07T21:41:31.000Z,4190.0
4,UC_I-bFxOYtpWLyF_CqlBUAw,Mag�nszf�ra,not set,no,,823.0,42.0,2964.0,,2013-03-30T02:56:38.000Z,1764.0


In [8]:
def get_channels(channel_id):
    '''Queries the youtube API and 
    gets a json in return'''
        
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    
    response = youtube.channels().list(
    part = 'snippet,contentDetails,topicDetails,statistics,brandingSettings',
    id = channel_id
    ).execute()
    #print('getting channel info for %s' % (channel_id))
    return response


def get_channel_data(response):
    '''Extracts the needed variables 
    from the returned json'''
    
    for channel in response['items']:
        channel_id = channel['id']
        channel_title = channel['snippet']['title']
        channel_description = channel['snippet']['description']
        try: #many channels do not set a language so we need to catch the exception
            channel_default_language = channel['snippet']['defaultLanguage']
        except:
            channel_default_language = 'not set'
        try:
            channel_country = channel['snippet']['country']
        except:
            channel_country = 'not set'
        channel_viewcount = channel['statistics']['viewCount']
        channel_commentcount = channel['statistics']['commentCount']
        channel_subscribercount = channel['statistics']['subscriberCount']
        channel_videocount = channel['statistics']['videoCount']
        try:
            channel_topic_ids = channel['topicDetails']['topicIds']
        except:
            channel_topic_ids = 'not set'
        try:
            channel_topic_categories = channel['topicDetails']['topicCategories']
        except:
            channel_topic_categories = 'not set'
            
        try:
            channel_branding_keywords = channel['brandingSettings']['channel']['keywords']
        except:
            channel_branding_keywords = 'not set'
        
        return (channel_id,
                channel_title,
                channel_description,
                channel_default_language,
                channel_country,
                channel_viewcount,
                channel_commentcount,
                channel_subscribercount,
                channel_videocount,
                channel_topic_ids,
                channel_topic_categories,
                channel_branding_keywords)

In [14]:
channel_output = PATH_TEMP_RIGHT + 'channels.csv'
channel_input = seeds
count = -1 # if there is an error, it's easier to find the index position from where to continue

with open(channel_output, "a") as csvFile:
    fieldnames = ['channel_id',
                  'channel_title',
                  'channel_description',
                  'channel_default_language',
                  'channel_country',
                  'channel_viewcount',
                  'channel_commentcount',
                  'channel_subscribercount',
                  'channel_videocount',
                  'channel_topic_ids',
                  'channel_topic_categories',
                  'channel_branding_keywords'                                 
                 ]

    writer = csv.DictWriter(csvFile, fieldnames=fieldnames)
    writer.writeheader()

    for channel in channel_input['Id']:
        try:
            response = get_channels(channel)
            variabelen = get_channel_data(response)
            (channel_id,
            channel_title,
            channel_description,
            channel_default_language,
            channel_country,
            channel_viewcount,
            channel_commentcount,
            channel_subscribercount,
            channel_videocount,
            channel_topic_ids,
            channel_topic_categories,
            channel_branding_keywords) = variabelen
        except:
            continue

        writer.writerow({'channel_id': channel_id, 
                         'channel_title': channel_title, 
                         'channel_description': channel_description, 
                         'channel_default_language': channel_default_language, 
                         'channel_country': channel_country,
                         'channel_viewcount': channel_viewcount,
                         'channel_commentcount': channel_commentcount,
                         'channel_subscribercount': channel_subscribercount,
                         'channel_videocount': channel_videocount,
                         'channel_topic_ids': channel_topic_ids,
                         'channel_topic_categories': channel_topic_categories,
                         'channel_branding_keywords': channel_branding_keywords  
                        })
        count += 1

        print('wrote data for ' + channel_title + ' and index is ' + str(count))

wrote data for Winteroffensief and index is 0
wrote data for Rand Paul and index is 1
wrote data for Impact Theory Studios and index is 2
wrote data for LDDpersdienst and index is 3
wrote data for Magánszféra and index is 4
wrote data for BaldTerror and index is 5
wrote data for Palóc Videotéka and index is 6
wrote data for Svegot and index is 7
wrote data for •CRITICAL CONDITION• and index is 8
wrote data for NPI / Radix and index is 9
wrote data for În Linie Dreaptă and index is 10
wrote data for Bat'ko and index is 11
wrote data for Caleb Maddix and index is 12
wrote data for Right To Rise PAC and index is 13
wrote data for NSDAP Clan l Hitler Jugend and index is 14
wrote data for Gaelic Neoreactionary and index is 15
wrote data for Esterreicherr and index is 16
wrote data for rinnenmaker and index is 17
wrote data for JUGENDproNRW and index is 18
wrote data for No To Challenge 25 and index is 19
wrote data for Donald Robertson and index is 20
wrote data for tihichat and index is 21

wrote data for SakSaret and index is 175
wrote data for Australian Defence League and index is 176
wrote data for The Daily Wire and index is 177
wrote data for Neder1and and index is 178
wrote data for ThegnOfNorthumbria and index is 179
wrote data for Baked Alaska and index is 180
wrote data for NPDThueringenTV and index is 181
wrote data for Sean O'Rourke and index is 182
wrote data for Golden Dawn Dubs & Subs and index is 183
wrote data for New Paradigm and index is 184
wrote data for AltCapRight and index is 185
wrote data for tihookeanist and index is 186
wrote data for NPD Ingolstadt and index is 187
wrote data for HALLVARD and index is 188
wrote data for Siim Land and index is 189
wrote data for RagingPapist and index is 190
wrote data for Młodzież Wszechpolska Wrocław and index is 191
wrote data for gaelgeseomra and index is 192
wrote data for C4Liberty and index is 193
wrote data for Blazing Catfur and index is 194
wrote data for Identitarian Future and index is 195
wrote dat

wrote data for The Rovertime Podcast and index is 347
wrote data for The Swan of Tuonela and index is 348
wrote data for Entsatzheerfuehrer and index is 349
wrote data for Stanata Radulov and index is 350
wrote data for Thomas Sheridan and index is 351
wrote data for Donar Van Holland and index is 352
wrote data for Lutz Bachmann and index is 353
wrote data for Domuitio Germaniae and index is 354
wrote data for Mundane Chats and index is 355
wrote data for Thug Nificent and index is 356
wrote data for ĽS Naše Slovensko v NR SR and index is 357
wrote data for Avatar of Hyperborea and index is 358
wrote data for novopress and index is 359
wrote data for Virginia Vota and index is 360
wrote data for Caolan Robertson and index is 361
wrote data for Make-A-Wish Australia and index is 362
wrote data for Red Ice TV and index is 363
wrote data for IRISHINFIDEL and index is 364
wrote data for Alt-Right Tankie- Eurasianist and index is 365
wrote data for Mister Metokur and index is 366
wrote dat

wrote data for NPDFraktionSachsen and index is 521
wrote data for Nephanor and index is 522
wrote data for xryshayghcom and index is 523
wrote data for Terrace Retro 3 and index is 524
wrote data for pvvlimburg and index is 525
wrote data for Gigs in the Streets - music, busking, cover songs and index is 526
wrote data for Jacquelin Doran and index is 527
wrote data for NPD KV RO/TS/BGL and index is 528
wrote data for George Whale and index is 529
wrote data for Jérôme ROBERT and index is 530
wrote data for Courageous Conservatives PAC and index is 531
wrote data for SrpskaElita and index is 532
wrote data for RANT HUB and index is 533
wrote data for WhiteRabbitRadioTV and index is 534
wrote data for No White Guilt and index is 535
wrote data for Génération ID Lorraine and index is 536
wrote data for Jordan B Peterson and index is 537
wrote data for Bastion Social and index is 538
wrote data for mdtb2008 and index is 539
wrote data for T Я U Σ Ð I L T O M and index is 540
wrote data fo

wrote data for Bogoljubskij and index is 693
wrote data for gman4edl03 and index is 694
wrote data for Ruch Narodowy and index is 695
wrote data for AbstractEntityJ and index is 696
wrote data for senatormikelee and index is 697
wrote data for Knooppunt Delta and index is 698
wrote data for AfrikanerWeerstand Beweging Nuus Kanaal and index is 699
wrote data for Vee Live Stream and index is 700
wrote data for Wolfsbloed Midgardr and index is 701
wrote data for securefreedom and index is 702
wrote data for notregme and index is 703
wrote data for Steve Franssen and index is 704
wrote data for Bronson Official and index is 705
wrote data for Kopf Stein Pflaster and index is 706
wrote data for plurk85 and index is 707
wrote data for По программе Дедушки Мороза and index is 708
wrote data for Ábel Bódi and index is 709
wrote data for SirOssisOfLiver and index is 710
wrote data for StadtkewitzTV and index is 711
wrote data for Paul Lawlor and index is 712
wrote data for RafaVideoart and inde

wrote data for RAC1914 and index is 864
wrote data for The Irish Megaphone and index is 865
wrote data for Bosanska Straza and index is 866
wrote data for WODHANAZSON and index is 867
wrote data for Verde nel cuore - istituzioni and index is 868
wrote data for Terrace Retro 2 and index is 869
wrote data for Pegida Nederland and index is 870
wrote data for Télé NATION Infos and index is 871
wrote data for Beatrix von Storch and index is 872
wrote data for Bosanski Pokret Nacionalnog Ponosa and index is 873
wrote data for Lega Salvini Premier and index is 874
wrote data for BloodAndHonour Hexagone and index is 875
wrote data for Hadraan Le Gaulois and index is 876
wrote data for OpenMind and index is 877
wrote data for Alexandre Del Valle and index is 878
wrote data for Rational Black Pill and index is 879
wrote data for BayernDieFreiheit and index is 880
wrote data for Ronan in Siam and index is 881
wrote data for Franc Studio and index is 882
wrote data for Nicolaus Fest and index is 8

In [15]:
channels = pd.read_csv(PATH_TEMP_RIGHT + 'channels.csv')

In [31]:
patreon = re.compile('patreon')

nonnan = channels.loc[channels.channel_description.notnull()]

In [34]:
filtered = nonnan[nonnan['channel_description'].str.contains(patreon)]

In [45]:
pd.options.display.max_colwidth = 200

In [48]:
filtered['patreon'] = filtered['channel_description'].str.extract('patreon\.com\/([a-zA-Z0-9_&]+)')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [50]:
filtered.patreon

126                       cjboling
141                   countdankula
190                       siimland
215                   thisiseuropa
226              Carol_dancingdove
227                   social3state
235              roamingmillennial
261                    aaronclarey
303          illustratedphilosophy
349                      angryswan
377                   RealityCalls
415                 VlogIdentitaer
455                    LautGedacht
538                jordanbpeterson
561                  theiconoclast
581              roamingmillennial
609                    blackpigeon
656                jordanbpeterson
669                           mbga
684     blondeinthebellyofthebeast
855                          gamba
991                 survivethejive
1005                     poseidon7
Name: patreon, dtype: object

In [59]:
columns = ['videoId', 'published', 'title', 'channel_id', 'channel_title']
videos = pd.read_csv(PATH_TEMP_RIGHT + 'videos.csv', names=columns)

In [60]:
videos.head()

Unnamed: 0,videoId,published,title,channel_id,channel_title
0,Dxg7Jxou3F4,2018-04-14T04:29:42.000Z,Alex Jones Lulz 27 (Fuck Trump Edition),UCJpdwEg7vrj_61e0Bfnx02w,Morrakiu
1,85HbztRoTUE,2016-11-09T21:18:46.000Z,In Hoc Trumpo Vinces,UCJpdwEg7vrj_61e0Bfnx02w,Morrakiu
2,sYwkXqL8LX0,2016-10-15T07:19:15.000Z,Thou Shalt Not Stump,UCJpdwEg7vrj_61e0Bfnx02w,Morrakiu
3,MNy5h3yC8pI,2016-11-11T06:21:27.000Z,"Liberal Tears, a 2016 vintage",UCJpdwEg7vrj_61e0Bfnx02w,Morrakiu
4,M589TmXgGD0,2016-07-22T05:29:29.000Z,Alex Jones Lulz 24 (Featuring the Fat Brown Buffalo),UCJpdwEg7vrj_61e0Bfnx02w,Morrakiu


In [53]:
metadata = pd.read_csv(PATH_TEMP_RIGHT + 'metadata.csv')

In [54]:
metadata.head()

Unnamed: 0,videoId,description,tags,views,likes,dislikes,comments
0,Gdztmp80LWk,Why do you hate goodness?,"['they', 'see', 'me', 'trollin', 'federal', 'departments', 'perry', 'romney', 'ron', 'paul', 'education', 'energy', 'commerce']",3030,123,6,24
1,nZy4gyBxVfo,"A parody of 'Only In America' by Brooks & Dunn, written right after ""the red line was crossed"" in Syria.\nProtected by Fair Use, assholes.\n\nLyrics: \nSun coming up over Aleppo city\nFarouq Briga...","['Syria (Country)', 'Bashar Al-Assad', 'Netanyahu', 'Vladimir Putin', 'Barack Obama (US President)']",7967,143,12,29
2,lsh28plGBl8,We're back where we started before the mediaeval festival at Burghausen in order to answer the question as to why so many Germans - and Europeans in general - are self-destructive Leftists. Why di...,"['Wanghausen', 'apocalypse', 'four horsemen', 'germany', 'bavaria', 'austria', 'mjolnir magazine', 'david yorkshire']",96,12,0,2
3,rcQAF50rEvM,"As part of my tour of Bavaria, I'm at Walhalla. What, the hall of fallen heroes? Yes, in a way.....","['Walhalla', '#Walhalla', 'Bavaria', 'David Yorkshire', 'Mjolnir Magazine']",592,25,0,18
4,HZ346fsMYk8,I continue with the Hollywood theme and take a look at the connection between the evil lifestyles of the Hollywood elite and the (im)morality they imprint on their films. I claim fair use for the ...,"['Howard Stern', 'Quentin Tarantino', 'Hollywood', 'child rape', 'Black Panther', 'Roman Polanski', 'David Yorkshire', 'Mjolnir Magazine']",538,19,0,7


In [55]:
nonnanmeta = metadata.loc[metadata.description.notnull()]

In [69]:
filtered_meta = nonnanmeta[nonnanmeta['description'].str.contains('patreon|litecoin|ethereum|paypal|makersupport|monacoin| ripple|hatreon', case=False)]

In [70]:
len(filtered_meta)

14168

In [71]:
lookup = pd.merge(filtered_meta, videos, on='videoId', how='left')

In [72]:
len(lookup)

15352

In [73]:
lookup.to_csv(PATH_TEMP_RIGHT + 'patreon_lookup.csv')

In [91]:
lookup['patreon'] = lookup['description'].str.extract('patreon\.com\/([a-zA-Z0-9_&]+)')

AttributeError: 'DataFrame' object has no attribute 'lower'

In [98]:
lookup.patreon.str.lower().unique().tolist()

['schattenmacher',
 'criticalcondition',
 nan,
 'sargon',
 'user',
 'zoe',
 'thesarkeesianeffect',
 'styxhexenhammer666',
 'argenttemplar',
 'rapalje_celtic_folk_music',
 'palaestramedia',
 'thewarskishow',
 'ryandawson',
 'raheemkassam',
 'join',
 'thealternativehypothesis',
 'cooltapes',
 'jordanbpeterson',
 'mrthriveandsurvive',
 'mrthrivandsurvie',
 'mrthrvandsurvive',
 'mrthrivrandsurvive',
 'mindingfreedom',
 'sijw',
 'coachredpill',
 'billwhittle',
 'dmjaurini',
 'cjboling',
 'janetbloomfield',
 'werenotsorry',
 'countdankula',
 'gentscofflaw',
 'alexmalenki',
 'mikecernovich',
 'themaskedarab',
 'bakedalaska',
 'anatofinnstark',
 'wlop',
 'lionsheartproductions',
 'mattipaalanen',
 'sandara',
 'siimland',
 'thisiseuropa',
 'carol_dancingdove',
 'social3state',
 'roamingmillennial',
 'kirstenlauryn',
 'aaronclarey',
 'semiogogue',
 'ontheoffensive',
 'bravingruin',
 'drsteveturley',
 'distributist',
 'blacksunphoenix',
 'illustratedphilosophy',
 'truthcomics',
 'timcast',
 'slej

In [84]:
pat

Unnamed: 0,videoId,description,tags,views,likes,dislikes,comments,published,title,channel_id,channel_title,patreon
