# Pulling data from Twitter with twint

In [1]:
# Run this block if you need to install the relevant modules

!pip3 install twint==2.1.20 --user

!pip3 install nest_asyncio==1.4.0 --user
# ^^ Need to install this for twint event loop to work in a jupyter notebook setting

Collecting twint==2.1.20
  Downloading twint-2.1.20.tar.gz (31 kB)
Collecting aiodns
  Downloading aiodns-2.0.0-py2.py3-none-any.whl (4.8 kB)
Collecting aiohttp
  Downloading aiohttp-3.6.2-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 9.3 MB/s eta 0:00:01
[?25hCollecting aiohttp_socks
  Downloading aiohttp_socks-0.5.3-py3-none-any.whl (32 kB)
Collecting cchardet
  Downloading cchardet-2.1.6-cp38-cp38-manylinux2010_x86_64.whl (244 kB)
[K     |████████████████████████████████| 244 kB 2.1 MB/s eta 0:00:01
[?25hCollecting elasticsearch
  Downloading elasticsearch-7.8.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 1.8 MB/s eta 0:00:01
[?25hCollecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Collecting geopy
  Downloading geopy-2.0.0-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 1.0 MB/s eta 0:00:01
[?25hCollecting googletransx
  Downloading googletransx-2.4.2.tar.gz (

Import nest_asyncio if needed for current environment

In [2]:
# This cell is needed for using twint within a Jupyter notebook. Without it,
# twint searches generate runtime errors because an event loop is already running.
try:
    ip = get_ipython()
except NameError:
    # get_ipython is only defined when running under IPython; outside of it
    # there is nothing to patch, so report that and carry on.
    print('Not working in an iPython environment, skipping this step.')
else:
    # Patch only when the shell is attached to a kernel (presumably the
    # notebook/lab case -- confirm for other front ends).
    if ip.has_trait('kernel'):
        # Imported here so that plain-Python runs do not require nest_asyncio.
        # A missing package now raises ImportError here instead of being
        # silently ignored and failing later inside twint.
        import nest_asyncio
        nest_asyncio.apply()

Import other modules and define some key variables:

In [4]:
import os
import sys
import pandas as pd

import twint
# Show which twint version is loaded.
print(f'twint: {twint.__version__}')

# Location for data to be saved, relative to the notebook location.
data_path = '../data/raw/'

# Show the current working directory.
print(os.getcwd())

twint: 2.1.20
/home/robert/host/Ditchley/Aug20_Ditchley/notebooks


## Extracting lists of followers

In [5]:
def get_followers(username, fp, full=False, suppress=True):
    '''
    Scrape the followers of a specific user into a twint database file.

    The file is written inside ``fp`` and is named
    ``followers_<username>.db`` when ``full`` is true, or
    ``followers_names_<username>.db`` otherwise.

    Parameters
    ----------
    username : str
        The twitter handle of the user whose followers to pull.
    fp : str
        Filepath to directory where data should be stored.  A trailing
        path separator is optional.
    full : bool
        If true, scrapes all user info; only name if false.
    suppress : bool
        If true, suppress the printed output of the scraping.

    Returns
    -------
    None
        Nothing is returned; the scraped data is handed to twint's
        database output and the path of that file is printed.
    '''
    prefix = 'followers_' if full else 'followers_names_'
    # os.path.join only inserts a separator when fp does not already end with
    # one, so both '../data/raw/' and '../data/raw' give the same file.
    full_path = os.path.join(fp, prefix + username + '.db')

    c = twint.Config()
    c.Username = username
    c.Hide_output = suppress
    c.User_full = full
    c.Database = full_path

    twint.run.Followers(c)

    print('Follower data saved to "'+full_path+'"')

In [6]:
# Save under the notebook-wide data_path instead of repeating the literal path.
get_followers('bobthephysicist', data_path, suppress=True)

[+] Inserting into Database: ../data/raw/followers_names_bobthephysicist.db


CRITICAL:root:twint.get:User:
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Follower data saved to "../data/raw/followers_names_bobthephysicist.db"


In [20]:
# NOTE(review): `followers` is not assigned by any cell shown in this notebook,
# and get_followers has no return statement, so it cannot supply the list. The
# recorded run of this cell failed with a TypeError because `followers` was None.
# TODO: load the follower names from the saved database before running this cell.
print(len(followers))
print(followers[:5])

TypeError: object of type 'NoneType' has no len()

In [6]:
def get_following(username, fp, full=False, suppress=True):
    '''
    Scrape the users followed by a specific user into a twint database file.

    The file is written inside ``fp`` and is named
    ``following_<username>.db`` when ``full`` is true, or
    ``following_names_<username>.db`` otherwise.

    Parameters
    ----------
    username : str
        The twitter handle of the user whose follows to pull.
    fp : str
        Filepath to directory where data should be stored.  A trailing
        path separator is optional.
    full : bool
        If true, scrapes all user info; only name if false.
    suppress : bool
        If true, suppress the printed output of the scraping.

    Returns
    -------
    None
        Nothing is returned; the scraped data is handed to twint's
        database output and the path of that file is printed.

    TODO: Abstract RAM storage of output
    '''
    prefix = 'following_' if full else 'following_names_'
    # os.path.join only inserts a separator when fp does not already end with
    # one, so both '../data/raw/' and '../data/raw' give the same file.
    full_path = os.path.join(fp, prefix + username + '.db')

    c = twint.Config()
    c.Username = username
    c.Hide_output = suppress
    c.User_full = full
    c.Database = full_path

    twint.run.Following(c)

    print('Follow data saved to "'+full_path+'"')

In [7]:
# Save under the notebook-wide data_path instead of repeating the literal path.
get_following('bobthephysicist', data_path, suppress=True)

[+] Inserting into Database: ../data/raw/following_names_bobthephysicist.db


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Follow data saved to "../data/raw/following_names_bobthephysicist.db"
