In [1]:
# Librerie
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
get_ipython().magic('pylab inline')

Populating the interactive namespace from numpy and matplotlib


In [2]:
# definizione path
dir_df = os.path.join(os.path.abspath(''),'stg')
dir_out = os.path.join(os.path.abspath(''),'output')

In [3]:
# definizione del file contentente i tweet
df_filename = r'df_tweets.pkl'
df_fullpath = os.path.join(dir_df, df_filename)
df_tweets = pd.read_pickle(df_fullpath)

### G1 - classificazione dei sentimenti

In [4]:
# creazione del conteggio per i tweet negativi, positivi e neutrali
df_g1 = df_tweets[['sentiment','id']].groupby(['sentiment'],as_index=False).count()
df_g1

Unnamed: 0,sentiment,id
0,negative,10
1,neutral,72
2,positive,35


In [5]:
# creazione dell'istogramma
tp = df_g1.plot(
        x=df_g1['sentiment'],
        kind='bar',
        legend = False)
for p in tp.patches:
    tp.annotate(str(p.get_height()), 
                (p.get_x() * 1.005, p.get_height() * 1.005), 
                ha='center', va='center', xytext=(30, 5), textcoords='offset points')
    tp.plot()

fig_prj = tp.get_figure()
fig_prj.tight_layout()
# vado ad effettuare un salvataggio di un immagine contente il grafico creato
fig_prj.savefig(os.path.join(dir_out,'sentiment_distr.png'), format='png', dpi=300)

KeyError: "None of [Index(['negative', 'neutral', 'positive'], dtype='object')] are in the [columns]"

### G2 - analisi sentimenti tramite coordinate

In [6]:
# rilevazione della posizione geografica tramite coordinate
df_tweets.head(2)

Unnamed: 0,coord,created_at,id,location,place,text,timezone,sentiment,sentiment_compound
0,,Sun May 14 20:43:56 +0000 2017,863857433160163329,,"{'id': 'dd3b100831dd1763', 'url': 'https://api...",Except when Trump mentioned that Hillary shoul...,,neutral,0.0
1,,Sun May 14 20:43:59 +0000 2017,863857445382348801,"noneya, business",,RT @amjoyshow: .@TRIBELAW strongly proclaims w...,,positive,0.25


In [7]:
# associazione di date ai tweet analizzati
plt.rcParams['figure.figsize'] = (6.0, 6.0)
df_tweets['hour'] = pd.DatetimeIndex(df_tweets['created_at']).hour
df_tweets['date'] = pd.DatetimeIndex(df_tweets['created_at']).date
df_tweets['minute'] = pd.DatetimeIndex(df_tweets['created_at']).minute
df=(df_tweets.groupby('hour',as_index=False).sentiment_compound.mean())


In [8]:
df.count()

hour                  1
sentiment_compound    1
dtype: int64

In [9]:
# grafico per mostrare l'andamento de gradimento temporale rispetto alle ore del giorno
df.plot(x=df['hour'],y='sentiment_compound')

KeyError: "None of [Int64Index([20], dtype='int64')] are in the [columns]"

### G3 - analisi sentimenti in base alla località

In [10]:
# creazione di un array di località
df_tweets.location.unique()[0:20]

array([None, 'noneya, business', 'Houston ☆Texas  U S A🇺🇸',
       'Albuquerque, NM', 'USA', 'DMV, USA', 'Ireland', 'Sacramento, CA',
       'London, New York, Washington ', 'Texas 78750', 'ohio',
       'Gulfport, MS', 'Chico, CA', 'Inside the beltway!',
       'Ottawa, ON Canada', 'Edmonton, Alberta, Canada',
       'Chickahominy Swamp of Virginia', 'New York, NY',
       'Buchanan, Michigan', 'State of Befuddlement'], dtype=object)

In [25]:
# controlli da effettuare sui sentimenti
def col_neutral(x):
    if(x['sentiment']=='neutral'):
        return 1
    else:
        return 0

def col_positive(x):
    if(x['sentiment']=='positive'):
        return 1
    else:
        return 0
    
def col_negative(x):
    if(x['sentiment']=='negative'):
        return 1
    else:
        return 0

In [26]:
df_tweets['neutral'] = df_tweets.apply(lambda x: col_neutral(x), axis=1)
df_tweets['positive'] = df_tweets.apply(lambda x: col_positive(x), axis=1)
df_tweets['negative'] = df_tweets.apply(lambda x: col_negative(x), axis=1)

In [27]:
# analisi per fusi orari
df_t = df_tweets.groupby(['timezone'],as_index=False)
df_t = df_t.agg({
    'negative' : np.sum,
    'positive' : np.sum,
    'neutral' : np.sum
    })

In [28]:
# creazione di una top 10 delle zone con i commenti più negativi
df_t[['timezone','negative']].sort_values(by='negative', ascending=False).head(10)

Unnamed: 0,timezone,negative
26,Eastern Time (US & Canada),37
48,Pacific Time (US & Canada),36
23,Central Time (US & Canada),11
41,London,8
50,Quito,4
10,Arizona,4
45,Mountain Time (US & Canada),3
9,Amsterdam,3
55,Tehran,2
37,Kiev,2


In [29]:
# creazione di una top 10 delle zone con i commenti più positivi
df_t[['timezone','positive']].sort_values(by='positive', ascending=False).head(10)

Unnamed: 0,timezone,positive
48,Pacific Time (US & Canada),32
26,Eastern Time (US & Canada),23
23,Central Time (US & Canada),14
9,Amsterdam,3
50,Quito,2
45,Mountain Time (US & Canada),2
11,Athens,1
17,Bern,1
25,Dublin,1
52,Rome,1


In [30]:
# analisi sulle coordinate
df_tweets.coord.count()

0

In [31]:
import json 
geo_data = {
        "type": "FeatureCollection",
        "features": []
    }

In [32]:
for i, row in df_tweets['coord'].iteritems():
    if row:
        geo_json_feature = {
                "type": "Feature",
                "geometry": tweet['coordinates'],
                "properties": {
                    "text": tweet['text'],
                    "created_at": tweet['created_at']
                }
        }
        geo_data['features'].append(geo_json_feature)

# salvataggio della data geografica per l'export
with open('geo_data.json', 'w') as fout:
    fout.write(json.dumps(geo_data, indent=4))

In [78]:
#creazione della mappa grafica con i paesi e la rispettiva perdominanza di commenti positivi o negativi
import folium

df_tweets_neg = df_tweets[df_tweets['sentiment']=='negative']

map = folium.Map(location=[48, 10], zoom_start=4)
for i, row in df_tweets_neg['coord'].iteritems():
    map.simple_marker(df_tweets['coord'], popup=str(i))
    map.circle_marker(location=[10, 20], radius=10000,
                popup='My Popup Info', line_color='#3186cc',
                fill_color='#3186cc', fill_opacity=2.0)
map



### G4 - Analisi parole più utilizzate

In [6]:
#ho bisogno del pacchetto di nlp
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from nltk.corpus import stopwords
import string
 
#creazione di stringhe contenti le varie possbili emoticon
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

Negative

In [34]:
df_tweets_neg = df_tweets[df_tweets['sentiment']=='negative']

In [7]:
# contatore per gli hashtag
count_hash = Counter()
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
for i, row in df_tweets_neg['text'].iteritems():
    terms_hash = [term for term in preprocess(row) if term.startswith('#')]
    count_hash.update(terms_hash)
print(count_hash.most_common(5))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\Pc Principale/nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Pc Principale\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [8]:
# contatore di termini specifici, senza considerare hashtag e emoticon
count_only = Counter()
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
for i, row in df_tweets_neg['text'].iteritems():
    terms_only = [term for term in preprocess(row) 
              if term not in stop and
              not term.startswith(('#', '@','RT','…','amp','The','I',
                                   'https','Just','To','use','–'))] 
    count_only.update(terms_only)
print(count_only.most_common(5))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\Pc Principale/nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Pc Principale\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


Positive

In [9]:
df_tweets_pos = df_tweets[df_tweets['sentiment']=='positive']

In [11]:
# contatore di hashtag
count_hash = Counter()
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
for i, row in df_tweets_pos['text'].iteritems():
    terms_hash = [term for term in preprocess(row) if term.startswith('#')]
    count_hash.update(terms_hash)
print(count_hash.most_common(5))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\Pc Principale/nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Pc Principale\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [38]:
# Count terms only (no hashtags, no mentions)
count_only = Counter()
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']
for i, row in df_tweets_pos['text'].iteritems():
    terms_only = [term for term in preprocess(row) 
              if term not in stop and
              not term.startswith(('#', '@','RT','…','amp','The','I',
                                   'https','Just','To','use','–'))] 
    count_only.update(terms_only)
print(count_only.most_common(5))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\Pc Principale/nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Pc Principale\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Pc Principale\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
