# Scraping journalists' Twitter handles, then using twint to get who they follow

In [2]:
import sys
import os
import time
import json
import csv
import threading
import queue
import asyncio
import nest_asyncio
nest_asyncio.apply()
import twint

sys.path.insert(1, '../')
from src.data import journalists as journos

## 1. Getting journalist Twitter handles according to topic

In [11]:
# Topic keyword; it is passed to the handle lookup and also used in the output file names in later cells.
keyword = 'cybersec'
# Handles returned by the project helper (src.data.journalists) for this keyword.
journo_handles = journos.get_handles_by_keyword(keyword)
# Quick check of how many journalists came back.
print(len(journo_handles))

3


In [4]:
# Show the handles themselves, not just how many there are.
print(journo_handles)

['_lucyingham', 'JesscaHaworth', 'Ad_Nauseum74']


## 2. Loop over journalists and get who they follow

### 2.2 Storing each thread's results as csv files (USE THIS)

This seems to work quite nicely, and may even be more reliable than storing in memory.  

Note that retrying data that was missed has to be done manually: alternate between the last two code cells before section 3 — the first rebuilds the `failed` list from the files on disk, the second re-runs the scrape for the users in that list — until `failed` is empty.

In [5]:
num_threads = 6 # Number of worker threads to start. num_threads can be any amount, but only so many can be run in parallel, according to number of cores you have.
# NOTE(review): the workers presumably spend most of their time waiting on network requests, so the core count may not be the real limit here — confirm.
# import multiprocessing; multiprocessing.cpu_count() can tell you how many cores python can see.

In [6]:
def get_friends(q, fp, persist, max_attempts=5):
    '''
    Worker loop: take usernames off a queue and save who each one follows to a csv file.

    Runs forever, so it is meant to be the target of a daemon thread. Every
    item taken from the queue is always marked done, even if the request
    raises or the retry limit is reached, so that q.join() in the caller
    cannot hang on a username that keeps failing.

    Params
    ------
    q : a Queue instance
        Holds the usernames (strings) to process.
    fp : string
        Provides the path where the file should be saved. The file written
        is fp + 'friends_' + username + '.csv'.
    persist : bool
        If True, make a repeat attempt if attempt fails. An attempt counts
        as failed when the output file does not exist afterwards.
    max_attempts : int or None
        Upper bound on the number of attempts per username when persist is
        True. None means retry without limit.
    '''
    while True:
        username = q.get()
        try:
            filepath = fp+'friends_'+username+'.csv'
            attempts = 0
            while True:
                attempts += 1
                print('Attempting to get friends of @'+username)
                c = twint.Config()
                c.Username = username
                c.User_full = False
                c.Hide_output = True
                c.Output = filepath

                try:
                    twint.run.Following(c)
                except Exception as err:
                    # An uncaught error would kill this worker before task_done();
                    # report it and treat it like any other failed attempt.
                    print('Request for @'+username+' raised '+repr(err))

                if not persist:
                    # Single attempt only; the caller checks the files afterwards.
                    break
                # NOTE(review): a file left over from an earlier run also satisfies
                # this check, even if the current attempt wrote nothing — confirm
                # whether stale files should be removed before scraping.
                if os.path.exists(filepath):
                    print('Friends of @'+username+' saved.')
                    break
                if max_attempts is not None and attempts >= max_attempts:
                    # Without a limit, a username that never yields a file would retry forever.
                    print('Giving up on @'+username+' after '+str(attempts)+' attempts.')
                    break
        finally:
            # Always release the queue slot so q.join() can return.
            q.task_done()

In [7]:
q = queue.Queue(maxsize=0) # maxsize=0 makes the queue unbounded

# Start the workers first; they block on q.get() until usernames are added below.
# There is no point starting more workers than there are usernames to process.
for _ in range(min(num_threads, len(journo_handles))): # Loop to create threads
    # The last argument is persist: True tells the code to keep trying until results are written to file.
    worker = threading.Thread(
        target=get_friends,
        args=(q, '../data/raw/'+keyword+'_', True),
        daemon=True, # daemon threads do not block interpreter exit; replaces the deprecated setDaemon()
    )
    worker.start()

for username in journo_handles: # Loop to add journalists usernames to the queue
    q.put(username)

q.join() # Wait until a worker has marked every queued username as done

Attempting to get friends of @_lucyingham
Attempting to get friends of @JesscaHaworth
Attempting to get friends of @Ad_Nauseum74


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError


Friends of @_lucyingham saved.


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError


Attempting to get friends of @Ad_Nauseum74


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Friends of @Ad_Nauseum74 saved.


CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Friends of @JesscaHaworth saved.


In [14]:
all_handles = [] # every followed handle, one entry per (journalist, friend) pair
all_users = []   # the journalist each entry of all_handles belongs to (same length, same order)
failed = []      # journalists with no usable results: file missing, or file containing no handles
for name in journo_handles:
    filepath = '../data/raw/'+keyword+'_friends_'+name+'.csv'
    if not os.path.exists(filepath):
        failed.append(name)
        continue
    with open(filepath, newline='') as f:
        # csv.reader yields an empty list for a blank line; skip those so row[0] cannot raise IndexError.
        handles = [row[0] for row in csv.reader(f) if row]
    if not handles:
        # A file with no handles carries no data either, so list the user for a retry.
        failed.append(name)
        continue
    all_handles.extend(handles)
    all_users.extend([name] * len(handles))
    print('@'+name+' follows '+str(len(handles))+' users.')

print('\nTotal number of handles pulled: '+str(len(all_handles)))

unique = len(set(all_handles))
print('Number of unique twitter handles: '+str(unique))

print('\nZero following in list for users: '+str(failed))

@_lucyingham follows 507 users.
@JesscaHaworth follows 1936 users.
@Ad_Nauseum74 follows 365 users.

Total number of handles pulled: 2808
Number of unique twitter handles: 1702

Zero following in list for users: []


If some of the requests seem to have failed, you can try them again:

In [17]:
q = queue.Queue(maxsize=0) # maxsize=0 makes the queue unbounded

# Only start as many workers as there are failed usernames (none at all if the list is empty).
for _ in range(min(num_threads, len(failed))): # Loop to create threads
    # The last argument is persist: False means each failed username gets a single new attempt.
    worker = threading.Thread(
        target=get_friends,
        args=(q, '../data/raw/'+keyword+'_', False),
        daemon=True, # daemon threads do not block interpreter exit; replaces the deprecated setDaemon()
    )
    worker.start()

for username in failed: # Loop to add journalists usernames to the queue
    q.put(username)

q.join() # Wait until a worker has marked every queued username as done

CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Interestingly, most of the handles must appear only once, suggesting there are only a few users that are followed by more than one of the journalists in this list.

## 3. Store the data in one csv file using pandas

In [15]:
import pandas as pd

In [16]:
# One row per (journalist, followed handle) pair; the two lists are paired up position by position.
friend_rows = list(zip(all_users, all_handles))
df = pd.DataFrame(friend_rows, columns=['screen_name', 'friend'])

In [17]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,screen_name,friend
0,_lucyingham,SarahCAndersen
1,_lucyingham,JoeBelBruno
2,_lucyingham,Danny_D_Pearson
3,_lucyingham,jeremyscahill
4,_lucyingham,davidgraeber


In [18]:
# Write the combined table to a single csv named after the keyword; index=False leaves the row index out of the file.
df.to_csv('../data/raw/'+keyword+'_journalist_friends.csv', index=False)

## 4. Quick inspection of data

In [19]:
from collections import Counter

In [20]:
# Count how many times each handle appears in the combined list of followed handles.
counted = Counter(all_handles)

In [None]:
# The 50 handles that appear most often, with their counts.
counted.most_common(50)