In [1]:
import pandas as pd
import os
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Pc
[nltk_data]     Principale\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

Information of a tweet
- id
- created_at
- text
- user -> location

Location is a bit useless, we can use:
- timezone
- geo
- coordinates

In [2]:
# analisi dei tweet che contengono trump
hashtag = 'trump'

tweets = []
with open(hashtag+'.json', 'r') as f:
    for line in f:
        tweet = {}
        dict_tweet = json.loads(line)
        tweet['id'] = dict_tweet['id']
        tweet['created_at'] = dict_tweet['created_at']
        tweet['text'] = dict_tweet['text']
        tweet['location'] = dict_tweet['user']['location']
        tweet['timezone'] = dict_tweet['user']['time_zone']
        tweet['coord'] = dict_tweet['coordinates']
        tweet['place'] = dict_tweet['place']
        tweets.append(tweet)
tweets[0]

{'id': 863857433160163329,
 'created_at': 'Sun May 14 20:43:56 +0000 2017',
 'text': 'Except when Trump mentioned that Hillary should have won... https://t.co/JwySbKjHy1',
 'location': None,
 'timezone': None,
 'coord': None,
 'place': {'id': 'dd3b100831dd1763',
  'url': 'https://api.twitter.com/1.1/geo/id/dd3b100831dd1763.json',
  'place_type': 'city',
  'name': 'New Orleans',
  'full_name': 'New Orleans, LA',
  'country_code': 'US',
  'country': 'United States',
  'bounding_box': {'type': 'Polygon',
   'coordinates': [[[-90.137908, 29.889574],
     [-90.137908, 30.075628],
     [-89.884108, 30.075628],
     [-89.884108, 29.889574]]]},
  'attributes': {}}}

In [3]:
df_tweets = pd.DataFrame.from_dict(tweets)

In [4]:
df_tweets.count()

coord           0
created_at    117
id            117
location       81
place           1
text          117
timezone       76
dtype: int64

In [5]:
sid = SentimentIntensityAnalyzer()

Compound Variable
- positive sentiment: compound score >= 0.5
- neutral sentiment: (compound score > -0.5) and (compound score < 0.5)
- negative sentiment: compound score <= -0.5

In [6]:
def sentiment(x):
    sentence = x['text']
    sentiment = 'neutral'
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        if(k=='compound'):
            if(ss[k]>=0.5):
                sentiment = 'positive'
            elif(ss[k]<=-0.5):
                sentiment = 'negative'
            else:
                sentiment = 'neutral'
    return sentiment

In [7]:
def sentiment_compound(x):
    sentence = x['text']
    sentiment_compound = 0
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        sentiment_compound = ss[k]
    return sentiment_compound

In [8]:
df_tweets['sentiment'] = df_tweets.apply(lambda x: sentiment(x), axis=1)
df_tweets['sentiment_compound'] = df_tweets.apply(lambda x: sentiment_compound(x), axis=1)

In [9]:
df_tweets.head(2)

Unnamed: 0,coord,created_at,id,location,place,text,timezone,sentiment,sentiment_compound
0,,Sun May 14 20:43:56 +0000 2017,863857433160163329,,"{'id': 'dd3b100831dd1763', 'url': 'https://api...",Except when Trump mentioned that Hillary shoul...,,neutral,0.0
1,,Sun May 14 20:43:59 +0000 2017,863857445382348801,"noneya, business",,RT @amjoyshow: .@TRIBELAW strongly proclaims w...,,positive,0.25


In [10]:
df_tweets.count()

coord                   0
created_at            117
id                    117
location               81
place                   1
text                  117
timezone               76
sentiment             117
sentiment_compound    117
dtype: int64

In [11]:
df_tweets.groupby(['sentiment']).count()['id']

sentiment
negative    10
neutral     72
positive    35
Name: id, dtype: int64

In [12]:
pd.options.display.max_colwidth = 266

In [13]:
df_tweets[(df_tweets['sentiment']=='positive')].head(5) 

Unnamed: 0,coord,created_at,id,location,place,text,timezone,sentiment,sentiment_compound
1,,Sun May 14 20:43:59 +0000 2017,863857445382348801,"noneya, business",,RT @amjoyshow: .@TRIBELAW strongly proclaims why #Trump's impeachment process needs to start *NOW*. RETWEET TO AGREE #AMJoy https://t.co/8C…,,positive,0.25
3,,Sun May 14 20:44:01 +0000 2017,863857456132145152,,,RT @amjoyshow: .@TRIBELAW strongly proclaims why #Trump's impeachment process needs to start *NOW*. RETWEET TO AGREE #AMJoy https://t.co/8C…,Pacific Time (US & Canada),positive,0.25
14,,Sun May 14 20:44:08 +0000 2017,863857483533754368,Texas 78750,,RT @amjoyshow: .@TRIBELAW on #Trump: He feels empowered to act like a king or dictator demanding loyalty from people hired to inve…,Central Time (US & Canada),positive,0.232
15,,Sun May 14 20:44:10 +0000 2017,863857493335830530,ohio,,RT @Md_Renegade: Sound advice from the Dr. He has a the perfect treatment plan for TDS ( Trump Derangement Syndrome) https://t.co/2OGqeOcwcd,Eastern Time (US & Canada),positive,0.171
19,,Sun May 14 20:44:12 +0000 2017,863857500940115969,"Ottawa, ON Canada",,RT @mcwalker64: Dr Seuss popular today. Saw a good one on a picket line today too. https://t.co/UKrcn8jw4i,Eastern Time (US & Canada),positive,0.305


In [14]:
df_tweets[(df_tweets['sentiment']=='negative')].head(5)

Unnamed: 0,coord,created_at,id,location,place,text,timezone,sentiment,sentiment_compound
10,,Sun May 14 20:44:05 +0000 2017,863857469948256256,,,RT @PhilipdClarke: #JamesClapper: #Trump is assaulting #US institutions https://t.co/55aukzkVoT @RamirezShauna33 @wonderfullone @retiredfir…,Pacific Time (US & Canada),negative,0.0
23,,Sun May 14 20:44:19 +0000 2017,863857530618785792,,,@KurtSchlichter I am guessing #Trump would disagree on this. Stupid and Sketchy got him elected. #Resist,Pacific Time (US & Canada),negative,0.0
34,,Sun May 14 20:44:23 +0000 2017,863857546997751808,#followback,,@KellyannePolls plz keep out of wicked #Media! #Sessions #ABC #NBC #CNN #NYTimes #US #USA #Jared #EU #UK #WSJ #NY… https://t.co/lOMVGWlJpk,Eastern Time (US & Canada),negative,0.057
47,,Sun May 14 20:44:36 +0000 2017,863857601112657925,,,RT @starknightz: Fed Judge Drops Prosecution Bombshell abt Obama; asked 4 Benghazi doc's/ possible prosecution\nhttps://t.co/h3dGMhQqj1 #Tru…,,negative,0.0
53,,Sun May 14 20:44:47 +0000 2017,863857647912710144,,,"RT @MediaShrink: So @TeamTrump, Latest #DNC Released #FakeNews Smear! #GodSoeed @Flotus #Dems #Trump Fest hides Internal Fatal Flaws…",,negative,0.0


In [15]:
# Save
dir_df = os.path.join(os.path.abspath(''),'stg')
result_filename = r'df_tweets.pkl'
result_fullpath = os.path.join(dir_df, result_filename)
df_tweets.to_pickle(result_fullpath)