In [1]:
# Dependencies
from TwitterAPI import TwitterAPI, TwitterPager
import json
import pandas as pd

In [2]:
# Login using credentials stored in auth.json file

# Define authentication file path
auth_path = 'data/auth.json'

# Read auth.json file
with open(auth_path, 'r') as file:
    auth_dict = json.load(file)

# Execute login
api = TwitterAPI(**auth_dict)

# Verify credentials
req = api.request('account/verify_credentials')
assert req.status_code == 200, 'Authentication failed!'

In [3]:
# Retrieve tweets for @FeminismInIndia
res = TwitterPager(api, 'tweets/search/30day/:dev', {
    'query': '(from:FeminismInIndia OR from:UN_Women) lang:en',
    'fromDate': '201912100000',
    'toDate': '201912200000'
})

# Initialize a list of tweet objects
tweets = list()
# Iterate through each response item
for item in res.get_iterator():
    # Check if there is an error
    if 'message' in item and item['code'] == 88:
        print('SUSPEND, RATE LIMIT EXCEEDED: {:s}\n'.format(str(item['message'])))
        break
    # Case no error: save the tweet
    tweets.append(item)



In [4]:
# Show number retrieved tweets
len(tweets)

408

In [5]:
# Show retrieved tweets
tweets

[{'created_at': 'Thu Dec 19 23:35:02 +0000 2019',
  'id': 1207806578650468352,
  'id_str': '1207806578650468352',
  'text': 'These events around the world prompted important conversations about gender equality this year. #ThisHappened via… https://t.co/507Nc5SAN1',
  'source': '<a href="https://www.hootsuite.com" rel="nofollow">Hootsuite Inc.</a>',
  'truncated': True,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 17137628,
   'id_str': '17137628',
   'name': 'UN Women',
   'screen_name': 'UN_Women',
   'location': 'Worldwide',
   'url': 'http://www.unwomen.org',
   'description': "UN Women is the UN entity for gender equality and women's empowerment. Executive Director: @phumzileunwomen. Join us with #GenerationEquality!",
   'translator_type': 'none',
   'protected': False,
   'verified': True,
   'followers_count': 1718676,
   'friends_count':

In [6]:
# Get entities
entities = []
for i, tweet in enumerate(tweets):
    # Get tweet id (string)
    tweet_id = tweet['id_str']
    # Case the current tweet is of type extended
    if 'extended_tweet' in set(tweet.keys()):
        hashtags = tweet.get('extended_tweet').get('entities', {}).get('hashtags', [])
    # Case it is a standard tweet
    else:
        hashtags = tweet.get('entities', {}).get('entities', {})
    # Retrieve hashtags
    for j, hashtag in enumerate(hashtags):
        # Store retrieved hashtag
        entities.append({
            # Id of the current tweet
            'tweet_id_str': tweet_id,
            # Tweet type (e.g. hashtag)
            'type': 'hashtag',
            # Add actual hashtag text
            'text': hashtag.get('text', None)
        })
        
# Show entities
print('Entities retrieved:')
print(entities)

Entities retrieved:
[{'tweet_id_str': '1207806578650468352', 'type': 'hashtag', 'text': 'ThisHappened'}, {'tweet_id_str': '1207761285045260294', 'type': 'hashtag', 'text': 'WomenInSport'}, {'tweet_id_str': '1207714713326510081', 'type': 'hashtag', 'text': 'LokSabha'}, {'tweet_id_str': '1207714713326510081', 'type': 'hashtag', 'text': 'WomenInPolitics'}, {'tweet_id_str': '1207699612359749632', 'type': 'hashtag', 'text': 'IndiaRejectsCAA'}, {'tweet_id_str': '1207699612359749632', 'type': 'hashtag', 'text': 'IndiaRejectsNRC'}, {'tweet_id_str': '1207699612359749632', 'type': 'hashtag', 'text': 'PoliceStopViolence'}, {'tweet_id_str': '1207684513486245889', 'type': 'hashtag', 'text': 'sexism'}, {'tweet_id_str': '1207670688338890754', 'type': 'hashtag', 'text': 'GenerationEquality'}, {'tweet_id_str': '1207669414050570249', 'type': 'hashtag', 'text': 'CAA'}, {'tweet_id_str': '1207669414050570249', 'type': 'hashtag', 'text': 'IndiaRejectsCAA'}, {'tweet_id_str': '1207669414050570249', 'type': 'h

In [7]:
# Save entities into Pandas DataFrame object
entities = pd.DataFrame(entities)
entities.head()

Unnamed: 0,tweet_id_str,type,text
0,1207806578650468352,hashtag,ThisHappened
1,1207761285045260294,hashtag,WomenInSport
2,1207714713326510081,hashtag,LokSabha
3,1207714713326510081,hashtag,WomenInPolitics
4,1207699612359749632,hashtag,IndiaRejectsCAA


In [8]:
# Save entities to file
entities.to_csv('data/database/entities.csv')

In [9]:
# Define attributes which will be kept from retrieved tweets
kept_attr = ('created_at', 'id_str', 'text', 'truncated', 'geo', 'coordinates',
             'place', 'retweet_count', 'favourite_count', 'in_reply_to_status_id_str', 
             'in_reply_to_user_id_str', 'lang')

# Apply filter
for i, tweet in enumerate(tweets):
    # Take out the extended tweet attributes
    tweet = {**tweet, **tweet.get('extended_tweet', {})}
    # Handle text
    tweet['text'] = tweet['full_text' if 'full_text' in set(tweet.keys()) else 'text']
    # Substitute i-th tweet
    tweets[i] = {k: tweet.get(k, None) for k in kept_attr}
    # Show i-th tweet
    print(tweets[i])
    print()

{'created_at': 'Thu Dec 19 23:35:02 +0000 2019', 'id_str': '1207806578650468352', 'text': 'These events around the world prompted important conversations about gender equality this year. #ThisHappened via @GlblCtzn https://t.co/f6sXRxM0YO', 'truncated': True, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 53, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Thu Dec 19 21:48:10 +0000 2019', 'id_str': '1207779681627447296', 'text': "RT @wef: Women's pay equality has slipped back 50 years. What can we do? https://t.co/n5HYhgJkIf #gendergap20 @UN_Women https://t.co/28Go3Z…", 'truncated': False, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 0, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Thu Dec 19 21:42:14 +0000 2019', 'id_str': '1207778191173201920', 'text': "Happy birthday\xa0@Alyssa_Milano!🎈 Thank

In [10]:
# Store tweets into Pandas DataFrame object
tweets = pd.DataFrame(tweets)
tweets.head()

Unnamed: 0,created_at,id_str,text,truncated,geo,coordinates,place,retweet_count,favourite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,lang
0,Thu Dec 19 23:35:02 +0000 2019,1207806578650468352,These events around the world prompted importa...,True,,,,53,,,,en
1,Thu Dec 19 21:48:10 +0000 2019,1207779681627447296,RT @wef: Women's pay equality has slipped back...,False,,,,0,,,,en
2,Thu Dec 19 21:42:14 +0000 2019,1207778191173201920,Happy birthday @Alyssa_Milano!🎈 Thank you for ...,False,,,,191,,,,en
3,Thu Dec 19 20:35:04 +0000 2019,1207761285045260294,“This is my charge to everyone:\n\nWe have to ...,True,,,,88,,,,en
4,Thu Dec 19 18:41:46 +0000 2019,1207732772451889152,8 Posters At The CAA Protests In Delhi That Sc...,False,,,,7,,,,en


In [11]:
# Save tweets to file
tweets.to_csv('data/database/tweets.csv', index=False)