In [1]:
import datetime as dt

from environs import Env
import numpy as np
import pandas as pd
import tweepy

## Sourcing Data

In [2]:
env = Env()
env.read_env()

TWITTER_KEY = env.str('TWITTER_KEY')
TWITTER_SECRET = env.str('TWITTER_SECRET')

auth = tweepy.AppAuthHandler(TWITTER_KEY, TWITTER_SECRET)

api = tweepy.API(auth)

In [253]:
user_pattern = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z0-9-_]+)'
hashtag_pattern = '(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9-_]+)'


def get_account_age(creation_date):
    return abs(
        (dt.datetime.today() - tweet.author.created_at)
    ).days

def parse_profile_description(pattern):
    return np.NaN if pattern == '' else pattern

def is_retweet(pattern):
    return 1 if pattern[:2] == 'RT' else 0

def get_original_author(tweet):
    if is_retweet(tweet.full_text):
        try:
            return re.findall(user_pattern, tweet.full_text)[0]
        except:
            return tweet.full_text.split('RT')[-1].split(':')[0].replace(' ', '')
    else:
        return tweet.author.screen_name
    
def get_hashtags(tweet):
    hashtags = tweet.entities.get('hashtags')
    if len(hashtags) == 0:
        return np.NaN
    else:
        return ", ".join([tag['text'] for tag in hashtags])

def get_user_location(tweet):
    
    loc = tweet.user.location
    
    if not loc == '':
        return loc
    return np.NaN
    
def get_full_text(tweet):
        if hasattr(tweet, "retweeted_status"):  # Check if Retweet
            try:
                return tweet._json['retweeted_status']['full_text']
            except AttributeError:
                return tweet.retweeted_status.text
        else:
            try:
                return tweet._json["full_text"]
            except AttributeError:
                return tweet.text

In [217]:
tweet.full_text.split('RT')[-1].split(':')[0].replace(' ', '')

'@rossi4va'

In [218]:
search_terms = ['#election2020',
                'trump',
                'biden',
                '#maga',
                '#vote',
                '#election',
                '#democrat',
                '#resist',
                '#voteblue',
                '#impotus',
                '#getoutthevote',
                '#gop',
                '#republican',
                '#politics',
                '#liberal',
                '#conservative',
                '#bluewave',
               ]

tweet_attributes = ['id',
                    'account',
                    'account_screenname',
                    'account_location',
                    'account_followers',
                    'account_following',
                    'account_age_days',
                    'account_description',
                    'is_retweet',
                    'original_author',
                    'tweeted_on',
                    'count_retweeted',
                    'count_favorited',
                    'hashtags',
                    'tweet_text'
                   ]

In [219]:
tweets = {attr: [] for attr in tweet_attributes}

In [220]:
for search_term in search_terms:
    for tweet in tweepy.Cursor(api.search, q=search_term, tweet_mode='extended').items(200):    
        if tweet.id not in set(tweets['id']):
            tweets['id'].append(tweet.id)
            tweets["account"].append(tweet.author.name)
            tweets["account_screenname"].append(tweet.user.screen_name)
            tweets["account_location"].append(get_user_location(tweet))
            tweets["account_followers"].append(tweet.author.followers_count)
            tweets["account_following"].append(tweet.author.friends_count)
            tweets["account_age_days"].append(get_account_age(
                       tweet.author.created_at
                       ))
            tweets["account_description"].append(parse_profile_description(
                           tweet.author.description
                       ))
            tweets["is_retweet"].append(is_retweet(tweet.full_text))
            tweets["original_author"].append(get_original_author(tweet))
            tweets["tweeted_on"].append(tweet.created_at)
            tweets["count_retweeted"].append(tweet.retweet_count)
            tweets["count_favorited"].append(tweet.favorite_count)
            tweets["hashtags"].append(get_hashtags(tweet))
            tweets["tweet_text"].append(get_full_text(tweet))

In [221]:
tweets_df.head()

Unnamed: 0,id,account,account_screenname,account_location,account_followers,account_following,account_age_days,account_description,is_retweet,original_author,tweeted_on,count_retweeted,count_favorited,hashtags,tweet_text
0,1297426537982156803,Phyllis Council,Preach_Council,The SUNSHINE STATE,598,474,2466,I've been REDEEMED bought with a PRICE; JESUS ...,1,RealDLHughley,2020-08-23 06:52:46,262,0,,"He’s dumb, but he’s not stupid... I don’t know..."
1,1297426361209049090,#CallAli,CallAli16,,88,425,18,#CallAli,0,CallAli16,2020-08-23 06:52:04,0,0,"JoeBiden, SlowJoe, KamalaHarris, PhonyKamala, ...",Democratic Hack OFFENDED That Policy Matters M...
2,1297426336923951104,Nessi,nessie1036,"Michigan, USA",449,463,582,"Independent lady. Love my freedom, love my cou...",1,VicToensing,2020-08-23 06:51:58,3687,0,,OMG! When DNC persons pledged allegiance to Am...
3,1297426296335732736,Extrmus Lftst Botus,LeftyBotty,,817,17,1244,,1,CallAli16,2020-08-23 06:51:48,1,0,JoeBiden,Dem Convention PACKED With Republican Speakers...
4,1297426138516590592,ELIZABETH GARZON,MNEG03,PANAMA,5261,5208,3766,ABOGADA,1,PoliticalHut,2020-08-23 06:51:11,7,0,"Elections2020, Election2020, Trump2020, Trump2...",#Elections2020 \n#Election2020\n#Trump2020\n#T...


In [222]:
tweets_df2 = pd.DataFrame(tweets)

In [224]:
merged = pd.concat((tweets_df, tweets_df2))
tweets_df = merged.drop_duplicates('id')

tweets_df.shape

(6351, 15)

In [225]:
tweets_df.to_csv('../data/raw/election-tweets-initial.csv', index=False)

## EDA

In [212]:
tweets_df.shape

(3269, 15)

In [211]:
tweets_df.describe()

Unnamed: 0,id,account_followers,account_following,account_age_days,is_retweet,count_retweeted,count_favorited
count,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0
mean,1.297388e+18,11788.52,3630.578158,2082.308657,0.654329,1254.493423,0.369532
std,60810060000000.0,249496.5,10095.299888,1443.636137,0.475659,4351.975328,6.328077
min,1.296995e+18,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.297381e+18,144.0,307.0,718.0,0.0,0.0,0.0
50%,1.29741e+18,603.0,971.0,1957.0,1.0,7.0,0.0
75%,1.297423e+18,2603.0,3413.0,3393.0,1.0,361.0,0.0
max,1.297427e+18,10973200.0,166314.0,4895.0,1.0,41354.0,351.0


In [190]:
tweets_df.head()

Unnamed: 0,id,account,account_screenname,account_location,account_followers,account_following,account_age_days,account_description,is_retweet,original_author,tweeted_on,count_retweeted,count_favorited,hashtags,tweet_text
0,1297426537982156803,Phyllis Council,Preach_Council,The SUNSHINE STATE,598,474,2466,I've been REDEEMED bought with a PRICE; JESUS ...,1,RealDLHughley,2020-08-23 06:52:46,262,0,,"He’s dumb, but he’s not stupid... I don’t know..."
1,1297426361209049090,#CallAli,CallAli16,,88,425,18,#CallAli,0,CallAli16,2020-08-23 06:52:04,0,0,"JoeBiden, SlowJoe, KamalaHarris, PhonyKamala, ...",Democratic Hack OFFENDED That Policy Matters M...
2,1297426336923951104,Nessi,nessie1036,"Michigan, USA",449,463,582,"Independent lady. Love my freedom, love my cou...",1,VicToensing,2020-08-23 06:51:58,3687,0,,OMG! When DNC persons pledged allegiance to Am...
3,1297426296335732736,Extrmus Lftst Botus,LeftyBotty,,817,17,1244,,1,CallAli16,2020-08-23 06:51:48,1,0,JoeBiden,Dem Convention PACKED With Republican Speakers...
4,1297426138516590592,ELIZABETH GARZON,MNEG03,PANAMA,5261,5208,3766,ABOGADA,1,PoliticalHut,2020-08-23 06:51:11,7,0,"Elections2020, Election2020, Trump2020, Trump2...",#Elections2020 \n#Election2020\n#Trump2020\n#T...


In [189]:
tweets_df['tweet_text'][302]

'This video only solidifies Donald Trump’s place in history as the most hateful, petty, fear-mongering, impeached forever President in history. He makes George Bush look like Mother Theresa. https://t.co/bY8CghNc6i'

In [157]:
tweets_df['tweet_text'][:10
                       ]

0    Bernie Sanders admits:\n"Many of the ideas we ...
1                                                 None
2                                                 None
3                                                 None
4    Are you voting in the Presidential #Election20...
5    Millennium Falcon revisited. My latest from ⁦@...
6    @JoeBiden This just great ... YOUR former chie...
7    @JoeBiden This just great ... YOUR former chie...
8                                                 None
9                                                 None
Name: tweet_text, dtype: object

In [196]:
def wrap_tweet(tweet, wrap=79):
    lines = round(len(tweet)/wrap) - 1
    
    for line in range(lines):
        if not line == range(lines)[-1]:
            print(tweet[line*wrap:(line*wrap)+wrap])
        else:
            print(tweet[line*wrap:] + '\n')

for tweet in tweets_df['tweet_text'][:10]:
    wrap_tweet(tweet)

He’s dumb, but he’s not stupid... I don’t know who needs to hear this, but 🗣 IF
 YOUR VOTE WAS NOT IMPORTANT, THEY WOULDN’T TRY SO HARD TO STOP IT!!! #TeamDL #vote #trump #maga #election2020 #FuckTrump #Repost… https://t.co/YG7HUYntuk

Democratic Hack OFFENDED That Policy Matters More Than Hating Trump https://t.c
o/ai6yA1CMNV via @YouTube #JoeBiden #SlowJoe #KamalaHarris #PhonyKamala #KamalasACop #CrookedCopKamala #BidenHarris2020 #Election2020 #Democrat #Offended #Policy #Trump #KyleKulinski #Progressive

OMG! When DNC persons pledged allegiance to American flag 🇺🇸 they purposely omi
tted “under God.” Bet that won’t happen next week at RNC convention.  @realDonaldTrump would never allow. #maga2020 #Election2020

Dem Convention PACKED With Republican Speakers &amp; Right-Wing Arguments https
://t.co/mBWn8DLtik via @YouTube #JoeBiden #SlowJoe #KamalaHarris #KamalasACop #
CrookedCopKamala #BidenHarris2020 #Election2020 #DemConvention #DemocraticConvention #DNC2020 #KyleKulinski #Progres

## Definining the problem

Essentially the task at hand calls for classification. We can approach this in the simplest form by trying to determine if a given tweet is "pro-trump", "pro-biden", or neutral.

In [268]:
# We need to generate a corpus of text from known pro-trump and pro-biden accounts to train our model on.

pro_trump_accounts = ['realDonaldTrump',
                      'Mike_Pence',
                      'Team_Trump45',
                      'DiamondandSilk',
                      'DonnaWR8',
                      'The_Trump_Train',
                      'joegooding',
                      'paultdove',
                      'Filibuster',
                      'Bet22325450ste',
                      'JerryTravone',
                      'Fuctupmind'
                     ]

pro_biden_accounts = ['JoeBiden',
                      'KamalaHarris',
                      'benbrown',
                      'biden4pres',
                      'RepsForBiden',
                      'joncoopertweets',
                      'AndrewBatesNC',
                      'TeamJoe',
                      'RealKHiveQueenB',
                      'YAFBiden',
                      'ProfSybill'
                      'JoeKamalaTicket',
                     ]

In [270]:
pro_trump_tweets = {attr:[] for attr in tweet_attributes}

for account in pro_trump_accounts:
    try:
        user = api.get_user(account,
                            tweet_mode='extended'
                           )
    except:
        continue
    tweets = user.timeline(count=200,
                           tweet_mode='extended'
                          )
    for tweet in tweets:
        if tweet.id not in set(pro_trump_tweets['id']):
            pro_trump_tweets['id'].append(tweet.id)
            pro_trump_tweets["account"].append(tweet.author.name)
            pro_trump_tweets["account_screenname"].append(tweet.user.screen_name)
            pro_trump_tweets["account_location"].append(get_user_location(tweet))
            pro_trump_tweets["account_followers"].append(tweet.author.followers_count)
            pro_trump_tweets["account_following"].append(tweet.author.friends_count)
            pro_trump_tweets["account_age_days"].append(get_account_age(
                       tweet.author.created_at
                       ))
            pro_trump_tweets["account_description"].append(parse_profile_description(
                           tweet.author.description
                       ))
            pro_trump_tweets["is_retweet"].append(is_retweet(get_full_text(tweet)))
            pro_trump_tweets["original_author"].append(get_original_author(tweet))
            pro_trump_tweets["tweeted_on"].append(tweet.created_at)
            pro_trump_tweets["count_retweeted"].append(tweet.retweet_count)
            pro_trump_tweets["count_favorited"].append(tweet.favorite_count)
            pro_trump_tweets["hashtags"].append(get_hashtags(tweet))
            pro_trump_tweets["tweet_text"].append(get_full_text(tweet))
        
len(pro_trump_tweets['id'])

1579

In [271]:
pro_trump_df = pd.DataFrame(pro_trump_tweets)
pro_trump_df.to_csv('../data/raw/pro-trump-tweets.csv')

In [274]:
pro_biden_tweets = {attr:[] for attr in tweet_attributes}

for account in pro_biden_accounts:
    try:
        user = api.get_user(account,
                            tweet_mode='extended'
                           )
    except:
        continue
        
    tweets = user.timeline(count=200,
                           tweet_mode='extended'
                          )
    
    for tweet in tweets:
        if tweet.id not in set(pro_biden_tweets['id']):
            pro_biden_tweets['id'].append(tweet.id)
            pro_biden_tweets["account"].append(tweet.author.name)
            pro_biden_tweets["account_screenname"].append(tweet.user.screen_name)
            pro_biden_tweets["account_location"].append(get_user_location(tweet))
            pro_biden_tweets["account_followers"].append(tweet.author.followers_count)
            pro_biden_tweets["account_following"].append(tweet.author.friends_count)
            pro_biden_tweets["account_age_days"].append(get_account_age(
                       tweet.author.created_at
                       ))
            pro_biden_tweets["account_description"].append(parse_profile_description(
                           tweet.author.description
                       ))
            pro_biden_tweets["is_retweet"].append(is_retweet(get_full_text(tweet)))
            pro_biden_tweets["original_author"].append(get_original_author(tweet))
            pro_biden_tweets["tweeted_on"].append(tweet.created_at)
            pro_biden_tweets["count_retweeted"].append(tweet.retweet_count)
            pro_biden_tweets["count_favorited"].append(tweet.favorite_count)
            pro_biden_tweets["hashtags"].append(get_hashtags(tweet))
            pro_biden_tweets["tweet_text"].append(get_full_text(tweet))

len(pro_biden_tweets['id'])

2000

In [275]:
pro_biden_df = pd.DataFrame(pro_biden_tweets)
pro_biden_df.to_csv('../data/raw/pro-biden-tweets.csv')