# Scraping journalists then using twint to get who they follow

In [1]:
import sys
import os
import time
import json
import csv
import threading
import queue
import asyncio 
import nest_asyncio
nest_asyncio.apply()
import twint

sys.path.insert(1, 'C:/Users/Luca/Aug20_Ditchley')
from src.data import journalists as journos

### 1. Getting journalist twitter handles according to topic

In [2]:
# Topic used to select journalists; later cells also use it as a prefix for the files written under ../data/raw/.
keyword = 'politics'
# Handles returned by the project helper for this keyword.
journo_handles = journos.get_handles_by_keyword(keyword)
# Number of handles found.
print(len(journo_handles))

239


In [None]:
# Show every handle returned for the keyword.
print(journo_handles)

In [None]:
# Drop handles that cannot be scraped before queuing the work.
# The membership check makes the cell safe to re-run: a bare
# journo_handles.remove(...) raises ValueError once the handle is gone.
locked_handles = [
    'P_Madeley_Star',  # this journalist has a locked account
    # 'AndyRoocroft',  # left commented out, as in the original cell
]
for locked_handle in locked_handles:
    if locked_handle in journo_handles:
        journo_handles.remove(locked_handle)

## 2. Loop over journalists and get who they follow

### 2.2 Storing each thread's results as csv files (USE THIS)

This seems to work quite nicely, and may even be more reliable than storing in memory.  

Note that re-fetching data that was missed is a manual process: alternate between the last two cells before section 3, first rechecking which journalists ended up in the `failed` list and then re-running the scrape for those journalists.

In [None]:
# Number of worker threads started by the scraping cells below.
num_threads = 6 # num_threads can be any amount, but only so many can be run in parallel, according to number of cores you have.
# import multiprocessing; multiprocessing.cpu_count() can tell you how many cores python can see.

In [None]:
def get_friends(q, fp, persist, max_attempts=None):
    '''
    Worker loop: take usernames from a queue and save who each one follows.

    Runs forever, so it is meant to be the target of a daemon thread. For each
    username a twint ``Following`` request is made with its output directed to
    ``fp + 'friends_' + username + '.csv'``.

    ``q.task_done()`` is called exactly once per username taken from the queue,
    even when the request raises, so a caller blocked in ``q.join()`` cannot
    hang because of a failed request. An exception is reported and the username
    is abandoned (not retried); its missing output file is how later cells
    detect the failure.

    Params
    ------
    q : a Queue instance
        Source of usernames (strings).
    fp : string
        Provides the path where the file should be saved.
    persist : bool
        If True, make a repeat attempt if attempt fails, i.e. if the output
        file does not exist after the request.
    max_attempts : int or None, optional
        Upper bound on attempts per username when ``persist`` is True.
        None (the default) retries without limit, which never finishes for an
        account that cannot produce a file (e.g. a locked account).
    '''
    while True:
        username = q.get()
        try:
            filepath = fp+'friends_'+username+'.csv'
            attempts = 0
            while True:
                attempts += 1
                print('Attempting to get friends of @'+username)
                c = twint.Config()
                c.Username = username
                c.User_full = False
                c.Hide_output = True
                c.Output = filepath

                twint.run.Following(c)

                if not persist:
                    # Single attempt requested; do not check the result.
                    break
                if os.path.exists(filepath):
                    print('Friends of @'+username+' saved.')
                    break
                if max_attempts is not None and attempts >= max_attempts:
                    print('Giving up on @'+username+' after '+str(attempts)+' attempts.')
                    break
        except Exception as exc:
            # Keep the worker alive so the remaining usernames still get processed.
            print('Request for friends of @'+username+' failed: '+repr(exc))
        finally:
            q.task_done()

In [None]:
# Unbounded queue of usernames shared by all worker threads.
q = queue.Queue(maxsize=0)

for i in range(num_threads): # Loop to create threads
    # Change the last argument (persist) to True to tell the code to keep trying until results are written to file.
    worker = threading.Thread(
        target=get_friends,
        args=(q, '../data/raw/'+keyword+'_', False),
        daemon=True, # replaces the deprecated worker.setDaemon(True)
    )
    worker.start()

# This loop header had been commented out, which left q.put(username) inside
# the thread-creation loop above with `username` undefined.
for username in journo_handles: # Loop to add journalists usernames to the queue
    q.put(username)

# Block until a worker has called task_done() for every queued username.
q.join()

In [None]:
all_handles = []  # followed handles, one entry per (journalist, followed account) pair
all_users = []    # the journalist for the entry at the same position in all_handles
failed = []       # journalists with no output file on disk
for name in journo_handles:
    filepath = '../data/raw/'+keyword+'_friends_'+name+'.csv'
    if not os.path.exists(filepath):
        failed.append(name)
        continue
    with open(filepath, newline='') as f:
        # The handle is the first column of each row. csv.reader yields [] for a
        # blank line, which would make row[0] raise IndexError, so skip those.
        handles = [row[0] for row in csv.reader(f) if row]
    all_handles.extend(handles)
    all_users.extend([name] * len(handles))
    print('@'+name+' follows '+str(len(handles))+' users.')

print('\nTotal number of handles pulled: '+str(len(all_handles)))

unique = len(set(all_handles))
print('Number of unique twitter handles: '+str(unique))

print('\nZero following in list for users: '+str(failed))

If some of the requests seem to have failed, try them again:

In [None]:
# Fresh queue for the retry round; only the usernames in `failed` are queued.
q = queue.Queue(maxsize=0)

for i in range(num_threads): # Loop to create threads
    worker = threading.Thread(
        target=get_friends,
        args=(q, '../data/raw/'+keyword+'_', False),
        daemon=True, # replaces the deprecated worker.setDaemon(True)
    )
    worker.start()

for username in failed: # Loop to add journalists usernames to the queue
    q.put(username)

# Block until a worker has called task_done() for every queued username.
q.join()

Interestingly, most of the handles must appear only once, suggesting there are only a few users that are followed by more than one of the journalists in this list.

## 3. Store the data in one csv file using pandas

In [None]:
import pandas as pd

In [None]:
# Pair each journalist with one account they follow; the two lists are
# index-aligned, so zip gives one (screen_name, friend) row per pair.
friend_pairs = list(zip(all_users, all_handles))
df = pd.DataFrame(friend_pairs, columns=['screen_name', 'friend'])

In [None]:
# Preview the first rows of the (screen_name, friend) table.
df.head()

In [None]:
# Write the combined table to ../data/raw/<keyword>_journalist_friends.csv without the index column.
df.to_csv('../data/raw/'+keyword+'_journalist_friends.csv', index=False)

## 4. Quick inspection of data

In [None]:
from collections import Counter

In [None]:
# Count how many times each handle occurs in all_handles.
counted = Counter(all_handles)

In [None]:
# The 50 most frequently occurring handles, with their counts.
counted.most_common(50)

In [3]:
# Rebinds num_threads (set to 6 earlier in the notebook) for the tweet-scraping cells below.
num_threads = 7 # num_threads can be any amount, but only so many can be run in parallel, according to number of cores you have.
# import multiprocessing; multiprocessing.cpu_count() can tell you how many cores python can see.

In [4]:
### from here I try the same but for tweets

def get_tweets(q, fp, persist, max_attempts=None):
    '''
    Worker loop: take usernames from a queue and save each one's tweets as csv.

    Runs forever, so it is meant to be the target of a daemon thread. For each
    username a twint ``Search`` request is made for the window
    2019-08-01 .. 2020-08-01 with its output directed to
    ``fp + username + '.csv'``.

    ``q.task_done()`` is called exactly once per username taken from the queue,
    even when the request raises, so a caller blocked in ``q.join()`` cannot
    hang because of a failed request. An exception is reported and the username
    is abandoned (not retried); its missing output file is how later cells
    detect the failure.

    Params
    ------
    q : a Queue instance
        Source of usernames (strings).
    fp : string
        Provides the path where the file should be saved.
    persist : bool
        If True, make a repeat attempt if attempt fails, i.e. if the output
        file does not exist after the request.
    max_attempts : int or None, optional
        Upper bound on attempts per username when ``persist`` is True.
        None (the default) retries without limit, which never finishes for an
        account that produces no file (e.g. locked, or no tweets in the window).
    '''
    while True:
        username = q.get()
        try:
            filepath = fp +username+'.csv'
            attempts = 0
            while True:
                attempts += 1
                print('Attempting to get tweets of @'+username)
                c = twint.Config()
                c.Username = username
                #c.User_full = False
                c.Hide_output = True
                #c.Pandas =True
                #c.Store_object = True
                c.Limit = 10000000
                c.Until = '2020-08-01'
                c.Since = '2019-08-01'
                c.Profile_full = True
                c.Store_csv = True
                c.Retweets = True
                #c.Native_retweets = True
                c.Output = filepath

                twint.run.Search(c) # Profile

                if not persist:
                    # Single attempt requested; do not check the result.
                    break
                if os.path.exists(filepath):
                    print('Tweets of @'+username+' saved.')
                    break
                if max_attempts is not None and attempts >= max_attempts:
                    print('Giving up on @'+username+' after '+str(attempts)+' attempts.')
                    break
        except Exception as exc:
            # Keep the worker alive so the remaining usernames still get processed.
            print('Request for tweets of @'+username+' failed: '+repr(exc))
        finally:
            q.task_done()

In [None]:
# Unbounded queue of usernames shared by all worker threads.
q = queue.Queue(maxsize=0)

for i in range(num_threads): # Loop to create threads
    # Last argument is persist: True tells the code to keep trying until results are written to file.
    worker = threading.Thread(
        target=get_tweets,
        args=(q,'../data/raw/'+keyword +'_', True),
        daemon=True, # replaces the deprecated worker.setDaemon(True)
    )
    worker.start()

for username in journo_handles: # Loop to add journalists usernames to the queue
#for username in ['jennystrasburg',]:
    q.put(username)

# Block until a worker has called task_done() for every queued username.
q.join()

Attempting to get tweets of @paulcockerton
Attempting to get tweets of @DelModyAttempting to get tweets of @patrickwintour
Attempting to get tweets of @hugh_muir
Attempting to get tweets of @JohnDomokosAttempting to get tweets of @Davidmkeys


Attempting to get tweets of @elashton


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @hugh_muir saved.
Attempting to get tweets of @patrick_kidd


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @Davidmkeys saved.
Attempting to get tweets of @TimRoss_1


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @TimRoss_1 saved.
Attempting to get tweets of @nadiakhomami


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @JohnDomokos saved.
Attempting to get tweets of @JBeattieMirror


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @paulcockerton saved.
Attempting to get tweets of @NigelNelson


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @JBeattieMirror saved.
Attempting to get tweets of @RJPartington


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @NigelNelson saved.
Attempting to get tweets of @FinanceJames


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @DelMody saved.
Attempting to get tweets of @trevadavies


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @nadiakhomami saved.
Attempting to get tweets of @benglaze


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @trevadavies saved.
Attempting to get tweets of @christopherhope


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @patrickwintour saved.
Attempting to get tweets of @ladyhaja


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @RJPartington saved.
Attempting to get tweets of @LOS_Fisher


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @FinanceJames saved.
Attempting to get tweets of @martinbeckford


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping 

Tweets of @martinbeckford saved.
Attempting to get tweets of @danbloom1


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @elashton saved.
Attempting to get tweets of @darrendodd


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @benglaze saved.
Attempting to get tweets of @martinkettle


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @martinkettle saved.
Attempting to get tweets of @fperraudin


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @LOS_Fisher saved.
Attempting to get tweets of @mikeysmith


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @darrendodd saved.
Attempting to get tweets of @bbclaurak


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @fperraudin saved.
Attempting to get tweets of @sarah_bloch


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @sarah_bloch saved.
Attempting to get tweets of @chloefhayward


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @chloefhayward saved.
Attempting to get tweets of @BBCHughPym


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @BBCHughPym saved.
Attempting to get tweets of @gavinhewitt01


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @gavinhewitt01 saved.
Attempting to get tweets of @JPonpolitics


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @JPonpolitics saved.
Attempting to get tweets of @Jo_Coburn


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @Jo_Coburn saved.
Attempting to get tweets of @BBCBenWright


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping 

Tweets of @BBCBenWright saved.
Attempting to get tweets of @RobBurl


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @christopherhope saved.
Attempting to get tweets of @DarranMarshall


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @bbclaurak saved.
Attempting to get tweets of @SimonHareBBC


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @SimonHareBBC saved.
Attempting to get tweets of @patersonjon


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @patersonjon saved.
Attempting to get tweets of @samwdhouse


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @samwdhouse saved.
Attempting to get tweets of @mccaffertynaomi


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


Tweets of @mccaffertynaomi saved.
Attempting to get tweets of @GemmaLDillon


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping 

In [None]:
all_handles = []  # paths of the tweet files found on disk (the name is reused from the friends section)
all_users = []
failed = []       # journalists with no tweet file on disk
for name in journo_handles:
    filepath = '../data/raw/'+keyword +'_'+name+'.csv'
    # One existence check decides both outcomes, so a name lands in exactly one list.
    if os.path.exists(filepath):
        all_handles.append(filepath)
    else:
        failed.append(name)

In [None]:
# Display the journalists for whom no tweet file was found.
failed

In [None]:
# Handles that are expected to produce no tweet file, so they should not be retried.
# The membership checks make the cell safe to re-run: a bare failed.remove(...)
# raises ValueError once the handle is gone, and a bare append would queue
# 'mateirosca' a second time.
not_retryable = [
    'Tinglepolitics',  # no tweets in the target date period
    'P_Madeley_Star',  # locked tweets
    'LauraHusbo',      # no tweets in the target date period
    'AndyRoocroft',    # locked tweets
    'Simon_Vaughan',   # no tweets in the target date period
    'journomatei',     # no tweets in the target date period
]
for handle in not_retryable:
    if handle in failed:
        failed.remove(handle)

# NOTE(review): 'mateirosca' is queued for scraping here but is not in
# journo_handles, so cells that loop over journo_handles will not read its
# file. Presumably it replaces 'journomatei'; confirm.
if 'mateirosca' not in failed:
    failed.append('mateirosca')
#journo_handles.remove('AndyRoocroft')

In [None]:
# Fresh queue for the retry round; only the usernames in `failed` are queued.
q = queue.Queue(maxsize=0)

for i in range(num_threads): # Loop to create threads
    worker = threading.Thread(
        target=get_tweets,
        args=(q, '../data/raw/'+keyword+'_', True),
        daemon=True, # replaces the deprecated worker.setDaemon(True)
    )
    worker.start()

for username in failed: # Loop to add journalists usernames to the queue
    q.put(username)

# Block until a worker has called task_done() for every queued username.
q.join()

In [None]:
import pandas as pd
all_handles = []  # paths of the tweet files that were loaded
all_users = []
failed = []
tweet_frames = []  # one DataFrame per journalist file found on disk
for name in journo_handles:
    filepath = '../data/raw/'+keyword +'_'+name+'.csv'
    if os.path.exists(filepath):
        all_handles.append(filepath)
        tweet_frames.append(pd.read_csv(filepath))

# Concatenate once: calling pd.concat inside the loop re-copies every row
# collected so far on each iteration. pd.concat raises on an empty list, so
# fall back to an empty frame when no files were found.
all_tweets = pd.concat(tweet_frames) if tweet_frames else pd.DataFrame()

In [None]:
# Total number of rows across all loaded tweet files.
len(all_tweets)

In [None]:
# Replace the index with 0..n-1 so every row has a unique label.
all_tweets.index = range(len(all_tweets))


In [None]:
# NOTE(review): the file name says 'cyber' but keyword is set to 'politics' earlier in this notebook; confirm the intended output name.
# index = True writes the 0..n-1 index as the first column; the file goes to the current working directory.
all_tweets.to_csv('twint_cyber_16082020_inParallel_with_index.csv', index = True)