In [1]:
# Dependencies
from TwitterAPI import TwitterAPI, TwitterPager
import json
import pandas as pd

In [2]:
# Login using credentials stored in auth.json file

# Define authentication file path
auth_path = 'data/auth.json'

# Read auth.json file
with open(auth_path, 'r') as file:
    auth_dict = json.load(file)

# Execute login
api = TwitterAPI(**auth_dict)

# Verify credentials
req = api.request('account/verify_credentials')
assert req.status_code == 200, 'Authentication failed!'

In [3]:
# Initialize a list of tweet objects
tweets = list()

In [4]:
# Retrieve tweets for @FeminismInIndia
res = TwitterPager(api, 'tweets/search/fullarchive/:dev', {
    'query': 'from:UN_Women lang:en',
    'fromDate': '201804010000',
    'toDate': '201807010000'
})

# Iterate through each response item
for item in res.get_iterator():
    # Check if there is an error
    if 'message' in item and item['code'] == 88:
        print('SUSPEND, RATE LIMIT EXCEEDED: {:s}\n'.format(str(item['message'])))
        break
    # Case no error: save the tweet
    tweets.append(item)

In [5]:
# Show number retrieved tweets
print('Number of tweets downloaded: {:d}'.format(len(tweets)))

Number of tweets downloaded: 1573


In [6]:
# Get entities
entities = []
for i, tweet in enumerate(tweets):
    # Get tweet id (string)
    tweet_id = tweet['id_str']
    # Case the current tweet is of type extended
    if 'extended_tweet' in set(tweet.keys()):
        hashtags = tweet.get('extended_tweet').get('entities', {}).get('hashtags', [])
    # Case it is a standard tweet
    else:
        hashtags = tweet.get('entities', {}).get('entities', {})
    # Retrieve hashtags
    for j, hashtag in enumerate(hashtags):
        # Store retrieved hashtag
        entities.append({
            # Id of the current tweet
            'tweet_id_str': tweet_id,
            # Tweet type (e.g. hashtag)
            'type': 'hashtag',
            # Add actual hashtag text
            'text': hashtag.get('text', None)
        })
        
# Show entities
print('Entities retrieved:')
print(entities)

Entities retrieved:
[{'tweet_id_str': '1013189209488797697', 'type': 'hashtag', 'text': 'PrideMonth'}, {'tweet_id_str': '1013128815575470081', 'type': 'hashtag', 'text': 'WorldParliamentDay'}, {'tweet_id_str': '1013098625768009728', 'type': 'hashtag', 'text': 'SMDay'}, {'tweet_id_str': '1013075989885341697', 'type': 'hashtag', 'text': 'SMDay'}, {'tweet_id_str': '1013053315935064064', 'type': 'hashtag', 'text': 'MeToo'}, {'tweet_id_str': '1013053315935064064', 'type': 'hashtag', 'text': 'SMDay'}, {'tweet_id_str': '1012985372064780289', 'type': 'hashtag', 'text': 'SMDay'}, {'tweet_id_str': '1012940062709960704', 'type': 'hashtag', 'text': 'SMDay'}, {'tweet_id_str': '1012940062709960704', 'type': 'hashtag', 'text': 'MeToo'}, {'tweet_id_str': '1012940062709960704', 'type': 'hashtag', 'text': 'TimeIsNow'}, {'tweet_id_str': '1012917408238063618', 'type': 'hashtag', 'text': 'Planet5050'}, {'tweet_id_str': '1012917408238063618', 'type': 'hashtag', 'text': 'MP'}, {'tweet_id_str': '1012917408238

In [7]:
# Save entities into Pandas DataFrame object
entities = pd.DataFrame(entities)
entities.head()

Unnamed: 0,tweet_id_str,type,text
0,1013189209488797697,hashtag,PrideMonth
1,1013128815575470081,hashtag,WorldParliamentDay
2,1013098625768009728,hashtag,SMDay
3,1013075989885341697,hashtag,SMDay
4,1013053315935064064,hashtag,MeToo


In [8]:
# Save entities to file
entities.to_csv('data/database/entities.csv')

In [9]:
# Define attributes which will be kept from retrieved tweets
kept_attr = ('created_at', 'id_str', 'text', 'truncated', 'geo', 'coordinates',
             'place', 'retweet_count', 'favourite_count', 'in_reply_to_status_id_str', 
             'in_reply_to_user_id_str', 'lang')

# Apply filter
for i, tweet in enumerate(tweets):
    # Take out the extended tweet attributes
    tweet = {**tweet, **tweet.get('extended_tweet', {})}
    # Handle text
    tweet['text'] = tweet['full_text' if 'full_text' in set(tweet.keys()) else 'text']
    # Substitute i-th tweet
    tweets[i] = {k: tweet.get(k, None) for k in kept_attr}
    # Show i-th tweet
    print(tweets[i])
    print()

{'created_at': 'Sat Jun 30 22:35:06 +0000 2018', 'id_str': '1013189209488797697', 'text': "#PrideMonth may be ending today, but we will continue to advocate for everyone's right to live as their true self every day, everywhere.\n\nJoin us: https://t.co/rm4EvjQRyk https://t.co/kbrhXRXL7P", 'truncated': True, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 73, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Sat Jun 30 19:35:07 +0000 2018', 'id_str': '1013143915308953606', 'text': '“This disaster leaves us in extreme poverty..." -- Magdalena Sutamul, after the volcanic eruption in Guatemala.\n\nWe\'re delivering humanitarian assistance based on women’s needs &amp; priorities: https://t.co/BWzaXJ0H0z https://t.co/IcJK5eA6K1', 'truncated': True, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 28, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str'

{'created_at': 'Thu Apr 12 14:47:57 +0000 2018', 'id_str': '984443008509718528', 'text': 'RT @UNFPA: #DidYouKnow: Migration is a feminist issue?! Here are 5 reasons why: https://t.co/Ji2hO0TGlq\n\n#CPD51', 'truncated': False, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 0, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Thu Apr 12 14:28:41 +0000 2018', 'id_str': '984438159558758401', 'text': 'RT @unwomenasia: What has the Swedish ambassador @StaffHerrst to say about #genderequality in business? \nExactly one month ago @UN_Women in…', 'truncated': False, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 0, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Thu Apr 12 14:25:23 +0000 2018', 'id_str': '984437327513649152', 'text': "RT @UN: Investing in women &amp; girls helps build better lives &amp; create

{'created_at': 'Tue Apr 03 15:32:39 +0000 2018', 'id_str': '981192766385459200', 'text': 'RT @unwomenpacific: Our thoughts go out to everyone affected by #TCJosie floods in Fiji, and especially women market vendors who have been…', 'truncated': False, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 0, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Tue Apr 03 14:56:04 +0000 2018', 'id_str': '981183556616630272', 'text': 'RT @UNDP: 84% of Yemeni women give birth at home &amp; very few households can afford professional healthcare. Since the start of the conflict,…', 'truncated': False, 'geo': None, 'coordinates': None, 'place': None, 'retweet_count': 0, 'favourite_count': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id_str': None, 'lang': 'en'}

{'created_at': 'Tue Apr 03 13:35:10 +0000 2018', 'id_str': '981163198501085184', 'text': '“Women in rural communities want to be recogniz

In [10]:
# Store tweets into Pandas DataFrame object
df_tweets = pd.DataFrame(tweets)
df_tweets.head()

Unnamed: 0,created_at,id_str,text,truncated,geo,coordinates,place,retweet_count,favourite_count,in_reply_to_status_id_str,in_reply_to_user_id_str,lang
0,Sat Jun 30 22:35:06 +0000 2018,1013189209488797697,"#PrideMonth may be ending today, but we will c...",True,,,,73,,,,en
1,Sat Jun 30 19:35:07 +0000 2018,1013143915308953606,“This disaster leaves us in extreme poverty......,True,,,,28,,,,en
2,Sat Jun 30 18:35:07 +0000 2018,1013128815575470081,We want more women on ballots 🗳️ when we go ou...,True,,,,56,,,,en
3,Sat Jun 30 16:35:09 +0000 2018,1013098625768009728,“The idea that sexism &amp; misogyny in online...,True,,,,107,,,,en
4,Sat Jun 30 15:05:12 +0000 2018,1013075989885341697,"When women are targeted online, the abuse is m...",True,,,,209,,,,en


In [11]:
# Save tweets to file
df_tweets.to_csv('data/database/tweets.csv', index=False)

In [12]:
df_tweets.shape

(1573, 12)