In [13]:
import tweepy
import json
import re
import arrow
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pytz import timezone
from dateutil.parser import parse
from textblob import TextBlob
from xml.sax.saxutils import unescape
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

spc = spacy.load('en_core_web_sm')

In [14]:
trump = pd.read_csv('trump_new.csv')
trump.head()

Unnamed: 0,id,created_at,source,year,month,day,hour,day_of_week,week,is_reply,is_retweet,original_text,original_author,text,favorite_count,retweet_count
0,1256037319963967488,Fri May 01 01:46:47 +0000 2020,Twitter for iPhone,2020,4,30,21,4,17,False,True,The CCP’s Military-Civil Fusion strategy blurs...,SenTomCotton,,,
1,1256037103173017603,Fri May 01 01:45:56 +0000 2020,Twitter for iPhone,2020,4,30,21,4,17,False,True,Tremendous work by @FreeBeacon \n\nWhile the r...,RichLowry,,,
2,1256025982152200194,Fri May 01 01:01:44 +0000 2020,Twitter for iPhone,2020,4,30,21,4,17,False,True,Something seems rotten in Flynn's case — my co...,AndrewCMcCarthy,,,
3,1256024436513411072,Fri May 01 00:55:36 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,True,.@BarackObama is Ultimately Responsible for @F...,BuckSexton,,,
4,1256024228777857025,Fri May 01 00:54:46 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,False,,,“The only thing we got wrong is that it was ev...,156126.0,38037.0


In [15]:
def clean_text(df)  :
    # convert to lower case
    # remove hyperlinks / digits / special characters / punctuations / hashtags / accounts
    regex = r'(@[A-Za-z0-9]+)|(&amp;)|([^0-9A-Za-z\'’ \t])|(\w+:\/\/\S+)|([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*)|(\d+\s)|(\s\d+\s)$'
    df['text_cleaned'] = df['text'].str.lower().apply(lambda t: ' '.join(re.sub(regex, ' ', t).split()))
    # lemmatization
    df['text_cleaned'] = df['text_cleaned'].apply(lambda t: ' '.join([w.lemma_ for w in spc(t) if w.lemma_ != '-PRON-' and '\'' not in w.lemma_ and '’' not in w.lemma_]))
    return df

In [16]:
# remove retweets without comment
trump_modified = trump[trump['text'].notnull()]
# clean text
trump_modified = clean_text(trump_modified)
# some tweets becomes empty after cleaning because they contains only hyperlinks
trump_modified = trump_modified[trump_modified['text_cleaned'] != '']

trump_modified.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,id,created_at,source,year,month,day,hour,day_of_week,week,is_reply,is_retweet,original_text,original_author,text,favorite_count,retweet_count,text_cleaned
4,1256024228777857025,Fri May 01 00:54:46 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,False,,,“The only thing we got wrong is that it was ev...,156126.0,38037.0,the only thing get wrong be that be even bad t...
6,1256016249169084417,Fri May 01 00:23:04 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,False,,,Vote @MikeGarcia2020 by May 12th! His opponent...,34845.0,11170.0,vote by may opponent vote to fire teacher and ...
7,1256013240896835585,Fri May 01 00:11:06 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,True,False,Tom Tiffany (@TomTiffanyWI) is a Great Advocat...,realDonaldTrump,"....He is Strong on Crime, the Border, and Sec...",33328.0,8521.0,be strong on crime the border and second amend...
8,1256013239487549441,Fri May 01 00:11:06 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,False,,,Tom Tiffany (@TomTiffanyWI) is a Great Advocat...,56133.0,13626.0,tom tiffany be a great advocate for the incred...
10,1255926261068763143,Thu Apr 30 18:25:29 +0000 2020,Twitter for iPhone,2020,4,30,14,4,17,False,False,,,Over 120 MILLION Economic Impact Payments have...,65073.0,16955.0,over million economic impact payment have alre...


In [17]:
def sentiment_textblob(df):
    df['tb_polarity'], df['tb_subjectivity'] = zip(*df['text'].apply(lambda t: TextBlob(t).sentiment))
    df['tb_orientation'] = np.where(df['tb_polarity'] > 0, 'positive', np.where(df['tb_polarity'] < 0, 'negative', 'neutral'))
    df['tb_explanatory'] = np.where(df['tb_subjectivity'] <= 0.5, 'objective', 'subjective')
    return df

def sentiment_vader(df):
    analyser = SentimentIntensityAnalyzer()
    keys = ['neg', 'neu', 'pos', 'compound']
    df['vd_neg'], df['vd_neu'], df['vd_pos'], df['vd_compound'] = zip(*df['text'].apply(lambda t: map(analyser.polarity_scores(t).get, keys)))
    df['vd_orientation'] = np.where(df['vd_compound'] >= 0.05, 'positive', np.where(df['vd_compound'] <= -0.05, 'negative', 'neutral'))
    return df

In [18]:
# sentiment analysis usting TextBlob
trump_modified = sentiment_textblob(trump_modified)
# sentiment analysis usting Vader
trump_modified = sentiment_vader(trump_modified)

trump_modified.head()

Unnamed: 0,id,created_at,source,year,month,day,hour,day_of_week,week,is_reply,...,text_cleaned,tb_polarity,tb_subjectivity,tb_orientation,tb_explanatory,vd_neg,vd_neu,vd_pos,vd_compound,vd_orientation
4,1256024228777857025,Fri May 01 00:54:46 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,...,the only thing get wrong be that be even bad t...,-0.4125,0.825,negative,subjective,0.373,0.627,0.0,-0.8786,negative
6,1256016249169084417,Fri May 01 00:23:04 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,...,vote by may opponent vote to fire teacher and ...,1.0,0.75,positive,subjective,0.205,0.671,0.123,-0.4168,negative
7,1256013240896835585,Fri May 01 00:11:06 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,True,...,be strong on crime the border and second amend...,0.088889,0.397222,positive,objective,0.072,0.74,0.189,0.6885,positive
8,1256013239487549441,Fri May 01 00:11:06 +0000 2020,Twitter for iPhone,2020,4,30,20,4,17,False,...,tom tiffany be a great advocate for the incred...,0.541667,0.633333,positive,subjective,0.046,0.64,0.314,0.9439,positive
10,1255926261068763143,Thu Apr 30 18:25:29 +0000 2020,Twitter for iPhone,2020,4,30,14,4,17,False,...,over million economic impact payment have alre...,0.25,0.4,positive,objective,0.0,1.0,0.0,0.0,neutral


In [19]:
trump_modified.to_csv('trump_basic_sentiment.csv', sep=',', encoding='utf-8', index=False)

In [30]:
t = 'bad!'
TextBlob(t).sentiment

Sentiment(polarity=-0.8749999999999998, subjectivity=0.6666666666666666)