In [7]:
from pymongo import MongoClient
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
from collections import defaultdict

#### Load tweets from mongoDB

In [8]:
client = MongoClient()
db = client.tweet_db
col = db.tweet_collection_full

In [9]:
col.find_one().keys()

dict_keys(['_id', 'created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'metadata', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'retweeted_status', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])

In [10]:
def load_tweets_from_mongo_db(db, col):
    '''
    load tweets from mongo_db
    removes all retweets
    
    input: "db" (str) database name
    input: "col" (str) collection name
    
    returns a dictionary of tweets and twitter data
    '''
    
    all_tweets = defaultdict(list)
    for tweet in col.find({"full_text": {"$exists": True}}):
        if not tweet['retweeted'] and 'RT @' not in tweet['full_text'] and 'Retweeted' not in tweet['full_text']:
            all_tweets['tweet'].append(tweet['full_text'])
            all_tweets['date'].append(tweet['created_at'])
            all_tweets['handle'].append(tweet['user']['screen_name'])
            all_tweets['language'].append(tweet['lang'])
            all_tweets['id'].append(tweet['_id'])
    return all_tweets

In [11]:
tweets_dict = load_tweets_from_mongo_db(db = db, col = col)

In [12]:
df = pd.DataFrame(tweets_dict)
df.shape

(198748, 5)

In [13]:
df['date'] = pd.to_datetime(df['date'])

#### Drop non-English tweets

In [14]:
df1 = df.copy()

In [15]:
mask = df1['language'] != 'en'
df1 = df1.drop(df[mask].index, axis = 0)

In [16]:
df1.shape

(186334, 5)

#### Remove hypertext links and newlines from tweets

In [17]:
tweets = df1['tweet'].tolist()
remove = re.compile(r"http\S+|\n|")
cleaned_tweets = []
for tweet in tweets:
    tweet = remove.sub('', tweet).strip()
    cleaned_tweets.append(tweet)

In [18]:
df1['tweet'] = cleaned_tweets

#### Clean up the data by removing emojis, punctuation, twitter handles. This will help us peform sentiment analysis. Also I will see if there are any duplicate tweets in the data.

In [24]:
%run ../python_files/cleaning_helper.py # python module for cleaning tweets!

In [25]:
tweets_clean = []
tuples = []
tweets = df1['tweet'].tolist()
translator = str.maketrans('', '', remove_punctuaton) # punctuation remover
for tweet in tweets:
    for char in tweet:
    
        tweet = emojis.sub('', tweet)
    
    list_form = tweet.split() # turns the tweet into a list
    
    to_process = [x for x in list_form if not x.startswith("@")] # removes twitter handles
    
    string_form = " ".join(to_process) # back into a string
    
    set_form = set(string_form.translate(translator).strip().lower().split())
    
    tweets_clean.append(string_form.translate(translator).strip().lower())
    
    tuples.append(tuple(set_form)) # need to make it a tuple so it's hashable!

df1['tuples'] = tuples
df1['tweets_clean'] = tweets_clean

In [26]:
df1.shape

(186334, 7)

#### Drop duplicates

In [27]:
df2 = df1.copy()

In [28]:
df2 = df2.drop_duplicates(subset='tuples', keep="first") 
df2.shape

(62689, 7)

#### There were many duplicate tweets. This was because the same tweets were pulled multiple times or because some people were tweeting the exact same tweet.

#### Perform sentiment analysis
#### VADER is optimized for sentiment analysis in social media.
#### It understands slang, emoticons, and capitilization.

In [54]:
positive = []
negative = []
neutral = []
compound = []
sid = SentimentIntensityAnalyzer()
tweets = df2['tweets_clean'].tolist()
for tweet in tweets:
    ss = sid.polarity_scores(tweet)
    positive.append(ss['pos'])
    negative.append(ss['neg'])
    neutral.append(ss['neu'])
    compound.append(ss['compound'])

df2['positive'] = positive
df2['negative'] = negative
df2['neutral'] = neutral
df2['compound'] = compound
df2.shape

(62689, 15)

#### Create columns for each airline

In [57]:
handles = ['@united', '@Delta', '@SouthwestAir', '@AmericanAir']
airlines = ['United', 'Delta', 'Southwest', 'American']

for handle, airline in zip(handles, airlines):
    mask = df2['tweet'].str.lower().str.contains(handle.lower())
    df2.loc[mask, airline] = 1
    df2.loc[~mask, airline] = 0

In [59]:
df2.describe()

Unnamed: 0,positive,negative,neutral,compound,United,Delta,Southwest,American
count,62689.0,62689.0,62689.0,62689.0,62689.0,62689.0,62689.0,62689.0
mean,0.139925,0.088818,0.771207,0.084594,0.195616,0.287834,0.230902,0.310852
std,0.176979,0.134389,0.19362,0.445081,0.396678,0.452757,0.421413,0.462846
min,0.0,0.0,0.0,-0.9702,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.653,-0.2263,0.0,0.0,0.0,0.0
50%,0.088,0.0,0.788,0.0,0.0,0.0,0.0,0.0
75%,0.231,0.152,0.94,0.4404,0.0,1.0,0.0,1.0
max,1.0,1.0,1.0,0.9874,1.0,1.0,1.0,1.0


#### We are interested in the "compound" column. It is a measure of overall sentiment for a given tweet. Its range is from -1 for negative tweets, to +1 for positive tweets.

In [60]:
df2.loc[df2['compound'] == df2['compound'].max()].index

Int64Index([22502], dtype='int64')

In [61]:
df2.loc[df2['compound'] == df2['compound'].min()].index

Int64Index([53809], dtype='int64')

#### Highest and lowest sentiment scoring tweets

In [62]:
df2['tweet'][22502]

'I love love love, love love love @Delta! Thanks Stephanie for awesome customer service as usual!! #RonR'

In [63]:
df2['tweet'][52782]

'@ChristiChat @anntensity @NAACP @AmericanAir Racist! Saying that Racist NAACP is Racist is AA is Racist! Yes that line makes as much sense as any words that come from NAACP. They are irrelevant now. Because everyone who disagrees with them is racist, nothing for them to prove. #maga #covfefe'

In [64]:
df2.to_pickle('../pickle_files/df1.p')