# Scraping journalists then using twint to get who they follow

In [38]:
import sys
import os
import time
import json
import csv
import threading
import queue
import asyncio
import nest_asyncio
nest_asyncio.apply()
import twint

sys.path.insert(1, '../')
from src.data import journalists as journos

## 1. Getting journalist Twitter handles according to topic

In [4]:
# Ask the project's journalists module for the handles filed under the keyword 'cyber',
# then print how many were returned.
journo_handles = journos.get_handles_by_keyword('cyber')
print(len(journo_handles))

12


In [5]:
# Show the handles themselves; these are the usernames queued for scraping further down.
print(journo_handles)

['jennystrasburg', 'dannsimmons', 'LeoKelion', 'gordoncorera', 'joetidy', '_lucyingham', 'dannyjpalmer', 'SophiaFurber', 'SCFGallagher', 'MsHannahMurphy', 'JesscaHaworth', 'Ad_Nauseum74']


## 2. Loop over journalists and get who they follow

### 2.2 Storing each threads results as csv files (USE THIS)

This seems to work quite nicely, and may even be more reliable than storing in memory.  

Note that retrying journalists whose data was missed has to be done manually: alternate between the last two code cells before section 3, first rerunning the check cell to rebuild the `failed` list and then running the retry cell, until `failed` is empty.

In [None]:
# Number of worker threads started by the scraping cells below.
# num_threads can be any amount, but only so many can be run in parallel, according to the number of cores you have.
# `import multiprocessing; multiprocessing.cpu_count()` can tell you how many cores Python can see.
# NOTE(review): the workers presumably spend most of their time waiting on network
# requests, so core count may not be the real limit here - confirm.
num_threads = 6

In [9]:
def get_friends(q, fp, persist, max_attempts=None):
    '''
    Worker loop: take usernames off a queue and save who each one follows.

    The loop never returns, so this is meant to be the target of a daemon
    thread. For every username taken from `q`, twint is asked for the accounts
    that user follows and told to write them to
    `fp + 'friends_' + username + '.csv'`.

    `q.task_done()` is called exactly once per username, even when the twint
    request raises, so a failing request can neither kill the worker nor leave
    `q.join()` waiting forever.

    Params
    ------
    q : a Queue instance
        Queue of usernames (strings) to process.
    fp : string
        Provides the path where the file should be saved.
    persist : bool
        If True, make a repeat attempt if attempt fails. An attempt counts as
        failed when the output file does not exist afterwards (note that a
        file left over from an earlier run therefore counts as success).
        If False, a single attempt is made and its outcome is not checked.
    max_attempts : int or None, optional
        Upper bound on the number of attempts per username when `persist` is
        True. None (the default) retries without limit.
    '''
    while True:
        username = q.get()
        try:
            filepath = fp+'friends_'+username+'.csv'
            attempts = 0
            success = False
            while not success:
                attempts += 1
                print('Attempting to get friends of @'+username)
                c = twint.Config()
                c.Username = username
                c.User_full = False
                c.Hide_output = True
                c.Output = filepath

                try:
                    twint.run.Following(c)
                except Exception as exc:
                    # Report and carry on: an uncaught exception here would end
                    # this thread and strand everything still in the queue.
                    print('Request for friends of @'+username+' raised '+repr(exc))

                if persist:
                    if os.path.exists(filepath):
                        success = True
                        print('Friends of @'+username+' saved.')
                    elif max_attempts is not None and attempts >= max_attempts:
                        print('Giving up on @'+username+' after '+str(attempts)+' attempts.')
                        break
                else:
                    success = True
        finally:
            # Always balance the q.get() above so q.join() can return.
            q.task_done()

In [10]:
# Unbounded queue (maxsize=0) of usernames for the workers to consume.
q = queue.Queue(maxsize=0)

# Start the worker pool. get_friends loops forever, so the threads are created
# as daemons (via the constructor; Thread.setDaemon() is deprecated) and do not
# keep the process alive once the work is done.
for _ in range(num_threads):
    # Change persist (the last argument) to True to tell the code to keep trying until results are written to file.
    worker = threading.Thread(
        target=get_friends,
        args=(q, '../data/raw/cyber_', False),
        daemon=True,
    )
    worker.start()

# Add the journalists' usernames to the queue.
for username in journo_handles:
    q.put(username)

# Block until task_done() has been called for every queued username.
q.join()

CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscrip

In [29]:
# Read back the per-journalist files written by the worker threads.
all_handles = []  # every followed handle, one entry per (journalist, friend) pair
all_users = []    # the journalist who follows the handle at the same index
failed = []       # journalists for whom no output file exists
for name in journo_handles:
    path = '../data/raw/cyber_friends_'+name+'.csv'
    if not os.path.exists(path):
        failed.append(name)
        continue
    with open(path, newline='') as f:
        # csv.reader yields [] for a blank line; skip those so row[0] cannot raise IndexError.
        handles = [row[0] for row in csv.reader(f) if row]
    all_handles.extend(handles)
    all_users.extend([name] * len(handles))
    print('@'+name+' follows '+str(len(handles))+' users.')

print('\nTotal number of handles pulled: '+str(len(all_handles)))

unique = len(set(all_handles))
print('Number of unique usernames: '+str(unique))

print('\nZero following in list for '+str(failed))

@jennystrasburg follows 2841 users.
@dannsimmons follows 295 users.
@LeoKelion follows 4765 users.
@gordoncorera follows 1270 users.
@joetidy follows 2714 users.
@_lucyingham follows 507 users.
@dannyjpalmer follows 888 users.
@SophiaFurber follows 4324 users.
@SCFGallagher follows 1555 users.
@MsHannahMurphy follows 1321 users.
@JesscaHaworth follows 968 users.
@Ad_Nauseum74 follows 365 users.

Total number of handles pulled: 21813
Number of unique usernames: 18926

Zero following in list for []


If some of the requests seem to have failed, you can try them again:

In [17]:
# Fresh unbounded queue (maxsize=0) for the usernames that need another attempt.
q = queue.Queue(maxsize=0)

# Start a new pool of workers. get_friends loops forever, so the threads are
# created as daemons (via the constructor; Thread.setDaemon() is deprecated).
for _ in range(num_threads):
    worker = threading.Thread(
        target=get_friends,
        args=(q, '../data/raw/cyber_', False),
        daemon=True,
    )
    worker.start()

# Queue only the journalists recorded in `failed`.
for username in failed:
    q.put(username)

# Block until task_done() has been called for every queued username.
q.join()

CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Interestingly, 18,926 of the 21,813 handles pulled are unique, so most handles must appear only once. This suggests that only a minority of accounts are followed by more than one of the journalists in this list.

## 3. Store the data in one csv file using pandas

In [30]:
import pandas as pd

In [32]:
# One row per (journalist, followed account) pair, built by pairing the two
# parallel lists position by position.
df = pd.DataFrame(
    data=list(zip(all_users, all_handles)),
    columns=['screen_name', 'friend'],
)

In [33]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,screen_name,friend
0,jennystrasburg,RobaHusseini
1,jennystrasburg,HashemOsseiran
2,jennystrasburg,liveanthony
3,jennystrasburg,EliseKapNM
4,jennystrasburg,adam_tooze


In [34]:
# Write the combined table to a single CSV; index=False leaves out the row-index column.
df.to_csv('../data/raw/cyber_journalist_friends_2.csv', index=False)

## 4. Quick inspection of data

In [35]:
from collections import Counter

In [36]:
# Tally how many times each handle occurs in all_handles.
counted = Counter(all_handles)

In [37]:
# The 50 handles with the highest counts, as (handle, count) pairs.
counted.most_common(50)

[('MalwareTechBlog', 9),
 ('briankrebs', 9),
 ('gcluley', 8),
 ('TaylorLorenz', 7),
 ('alexhern', 7),
 ('jimwaterson', 7),
 ('faisalislam', 7),
 ('BBCBreaking', 7),
 ('ProfWoodward', 7),
 ('DaveLeeFT', 7),
 ('TechCrunch', 7),
 ('ruskin147', 7),
 ('alexstamos', 7),
 ('jamesrbuk', 7),
 ('NCSC', 7),
 ('KimZetter', 7),
 ('GossiTheDog', 7),
 ('geoffwhite247', 7),
 ('bbclaurak', 7),
 ('mikko', 6),
 ('dnvolz', 6),
 ('elonmusk', 6),
 ('realDonaldTrump', 6),
 ('TheEconomist', 6),
 ('troyhunt', 6),
 ('jleyden', 6),
 ('DanRaywood', 6),
 ('oliviasolon', 6),
 ('WIRED', 6),
 ('drjessicabarker', 6),
 ('charlottejee', 6),
 ('campuscodi', 6),
 ('joetidy', 6),
 ('SecurityCharlie', 6),
 ('josephfcox', 6),
 ('carolecadwalla', 6),
 ('amolrajan', 6),
 ('SwiftOnSecurity', 6),
 ('DarkReading', 6),
 ('meghamohan', 6),
 ('e_kaspersky', 6),
 ('iblametom', 6),
 ('zackwhittaker', 6),
 ('Peston', 6),
 ('lorenzofb', 6),
 ('FSecure', 5),
 ('ericgeller', 5),
 ('kashhill', 5),
 ('rcallimachi', 5),
 ('noUpside', 5)]