# Tweepy to Extract Female MP Tweets

The code below adapts https://gist.github.com/nicolewhite/167828e51d8f2b6fad75 to extract up to the 1000 most recent tweets from each female MP in the UK (the list of MPs is as of the June 2017 general election).

In [3]:
import tweepy
from tweepy import Cursor
import unicodecsv
from unidecode import unidecode
import pandas as pd
import os

## Setup Twitter

Twitter can be a pain to set up. The main problem I had was the URL field, which I solved by typing in www.google.com. Go figure.

Get your consumer key, consumer secret, access key and access secret, and fill them in below:

In [2]:
# Authentication and connection to Twitter API.

# INSERT YOUR OWN
# The four credentials below are deliberately left blank; fill them in
# locally and avoid committing real values.
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
# Fetching up to 1000 tweets per MP can run into Twitter's rate limits;
# wait_on_rate_limit makes tweepy sleep until the window resets instead of
# raising part-way through the run.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
# Insert People Data Here

# Load the MP spreadsheet into a DataFrame. The path is relative to the
# directory the notebook is run from.
mps = pd.read_excel("./tweet_data/Hackathon_WomenMP.xlsx")

In [4]:
# Get just the handles

# Blank spreadsheet cells load as missing values (NaN/None). dropna() removes
# those, and the isinstance guard also rejects any remaining numeric cell,
# including float subclasses that an exact `type(...) != float` test misses.
mp_twitter_names = [
    name
    for name in mps.twitter_username.dropna()
    if not isinstance(name, float)
]

In [32]:
# Usernames whose tweets we want to gather.
users = mp_twitter_names

# Column names of the output CSV, in the exact order each row is assembled:
# two user columns followed by the columns produced by _tweet_row().
csv_header = ["politician_name",
              "politician_username",
              "tweet_year",
              "tweet_month",
              "tweet_day",
              "tweet_hour",
              "tweet_text",
              "tweet_lat",
              "tweet_long",
              "tweet_source",
              "tweet_in_reply_to_screen_name",
              "tweet_direct_reply",
              "tweet_retweet_status",
              "tweet_retweet_count",
              "tweet_favorite_count",
              "tweet_hashtags",
              "tweet_hashtags_count",
              "tweet_urls",
              "tweet_urls_count",
              "tweet_user_mentions",
              "tweet_user_mentions_count",
              "tweet_media_type",
              "tweet_contributors"]


def _entity_values(entities, kind, field):
    """Return the ASCII-folded `field` of every `kind` entry in `entities`.

    `entities` is a tweet's entities dict. A missing, None or empty `kind`
    list yields an empty list.
    """
    return [unidecode(entry[field]) for entry in entities.get(kind) or []]


def _tweet_row(tweet):
    """Build the per-tweet CSV columns (everything after the user columns)."""
    # Coordinates are stored as [longitude, latitude] inside a dictionary.
    latitude = longitude = None
    if tweet.coordinates is not None:
        point = tweet.coordinates['coordinates']
        latitude, longitude = point[1], point[0]

    # The API reports "not a reply" as None rather than "", so comparing
    # against "" marked every tweet as a direct reply. Truthiness covers both.
    direct_reply = bool(tweet.in_reply_to_screen_name)
    # Retweets start with "RT ..."
    retweet_status = tweet.text.startswith("RT ")

    # Entities are variable-length lists of dictionaries, if present.
    hashtags = _entity_values(tweet.entities, 'hashtags', 'text')
    urls = _entity_values(tweet.entities, 'urls', 'url')
    user_mentions = _entity_values(tweet.entities, 'user_mentions', 'screen_name')
    media = _entity_values(tweet.entities, 'media', 'type')
    # NOTE(review): assumes each contributor is a dict with a 'screen_name'
    # key, as the original code did -- confirm against real data.
    contributors = [unidecode(contributor['screen_name'])
                    for contributor in tweet.contributors or []]

    return [tweet.created_at.year,
            tweet.created_at.month,
            tweet.created_at.day,
            tweet.created_at.hour,
            unidecode(tweet.text),
            latitude,
            longitude,
            tweet.source,
            tweet.in_reply_to_screen_name,
            direct_reply,
            retweet_status,
            tweet.retweet_count,
            tweet.favorite_count,
            ', '.join(hashtags),
            len(hashtags),
            ', '.join(urls),
            len(urls),
            ', '.join(user_mentions),
            len(user_mentions),
            ', '.join(media),
            ', '.join(contributors)]


# unicodecsv writes encoded bytes, hence the binary file mode.
with open('tweets.csv', 'wb') as csv_file:
    writer = unicodecsv.writer(csv_file, delimiter=',', quotechar='"')
    # Write header row with info we want.
    writer.writerow(csv_header)

    for user in users:
        # Pass the handle by keyword: newer tweepy releases reject a
        # positional argument here, and older ones accept the keyword too.
        user_obj = api.get_user(screen_name=user)

        # Gather info specific to the current user.
        user_info = [user_obj.name,
                     user_obj.screen_name]

        # Get 1000 most recent tweets for the current user.
        for tweet in Cursor(api.user_timeline, screen_name=user).items(1000):
            # Write data to CSV.
            writer.writerow(user_info + _tweet_row(tweet))

        # Show progress.
        print("Wrote tweets by %s to CSV." % user)

Wrote tweets by VictoriaPrentis to CSV.
Wrote tweets by morton_wendy to CSV.
Wrote tweets by YasminQureshiMP to CSV.
Wrote tweets by YvetteCooperMP to CSV.


In [36]:
path = '../notebooks/FemaleMPTweets/'

# Read every CSV in `path` and stack them into one DataFrame.
# DataFrame.append copied the accumulated frame on every iteration and was
# removed in pandas 2.0, so the files are collected first and concatenated
# once. The listing is sorted because os.listdir returns entries in arbitrary
# order, which would otherwise make the row order vary between runs.
frames = [
    pd.read_csv(os.path.join(path, filename), encoding='latin1')
    for filename in sorted(os.listdir(path))
    if filename.endswith('.csv')
]

# pd.concat raises on an empty list; keep the original result (an empty
# DataFrame) when the directory holds no CSV files.
Fem_MP_Tweets = pd.concat(frames) if frames else pd.DataFrame()

## Export

In [39]:
# Drop rows that are exact duplicates of an earlier row (all columns compared).
Fem_MP_Tweets= Fem_MP_Tweets.drop_duplicates()

In [40]:
# Save the de-duplicated tweets as a pickle file in the current working
# directory.
Fem_MP_Tweets.to_pickle('FemaleMPTweets.pkl')