In [1]:
!pip install emoji
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/carlospenaloza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from collections import Counter
from textblob import TextBlob
from bs4 import BeautifulSoup
import matplotlib.cm as cm
from string import punctuation
from string import digits
import urllib.request
import networkx as nx
import pandas as pd
import itertools
import emoji
import math
import time
import json
import sys
import re
import os

# TODO: remove this when NetworkX is fixed
from warnings import simplefilter
import matplotlib.cbook
simplefilter("ignore", matplotlib.cbook.mplDeprecation)

In [3]:
# Import tweets
tweets = pd.read_csv('https://raw.githubusercontent.com/cpenalozag/twitter_network/master/tweets/statuses.csv')

tweets.head()

Unnamed: 0,id,tweet_id,created_at,text,favorite_count,retweet_count,phone,sensitive,hashtags,no_hashtags,mentions,no_mentions,no_urls,no_media
0,113127283.0,1.17252e+18,9/13/19 14:56,@enriqueserrano3 @Restrebooks @ELTIEMPO @enriq...,1,0,Twitter Web App,False,-,0,"[238261607, 3252651209, 9633802, 1073577478566...",6,0,1
1,113127283.0,1.17252e+18,9/13/19 14:54,@enriqueserrano3 @Restrebooks @ELTIEMPO @enriq...,2,0,Twitter Web App,False,-,0,"[238261607, 3252651209, 9633802, 1073577478566...",5,0,1
2,113127283.0,1.17252e+18,9/13/19 14:52,@enriqueserrano3 @Restrebooks @ELTIEMPO @enriq...,1,0,Twitter Web App,False,-,0,"[238261607, 3252651209, 9633802, 1073577478566...",6,0,1
3,113127283.0,1.17252e+18,9/13/19 14:50,@enriqueserrano3 @Restrebooks @ELTIEMPO @enriq...,0,0,Twitter Web App,False,-,0,"[238261607, 3252651209, 9633802, 1073577478566...",6,0,1
4,113127283.0,1.17252e+18,9/13/19 14:48,@enriqueserrano3 @Restrebooks @ELTIEMPO @enriq...,0,0,Twitter Web App,False,-,0,"[238261607, 3252651209, 9633802, 1073577478566...",4,0,1


In [4]:
# Import spanish stop word dictionary
url_sw = 'https://raw.githubusercontent.com/cpenalozag/twitter_network/master/utils/stopwords-es.json'
response_sw = urllib.request.urlopen(url_sw)
data_sw = response_sw.read()

stop_words = set(json.loads(data_sw))

# Import emoji meanings
emoji_translations = pd.read_csv('https://raw.githubusercontent.com/cpenalozag/twitter_network/master/utils/emojis_translated.csv')

# Get spanish meaning of an emoji
def emoji_meaning(emoji):
    meaning = emoji_translations.loc[emoji_translations['emoji'] == emoji]['translation']
    return meaning.values[0] if not meaning.empty else ''

# Transformations to remove digits and punctuation
remove_digits = str.maketrans('', '', digits)
remove_punctuation = str.maketrans('', '', punctuation)

In [16]:
from ipython_exit import exit

""" Processes text data in tweets
    
    text: text of tweet
    word_counts: word count dictionary
    
    returns processed text, length of content, polarity, subjectivity
"""
def process_text(text, word_counts, user_id):
    # Remove links
    text = ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())
    
    # Remove file names
    
        
    # Remove mentions
    text = ' '.join(re.sub("(@[A-Za-z0-9^\w]+)", " ", text.replace('@ ','@').replace('# ','#')).split())
        
    # Replace hashtags with words
    if text.count('#')>0:
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text.replace('#', ' ')))
            
    #Remove HTML tags
    text = BeautifulSoup(text).get_text()
    
    # Save content length (exluding links and mentions)
    length = len(text)
        
    # Remove punctuation symbols
    text = ' '.join(re.sub("[\.\,\¡\¿\!\?\:\;\-\=\*\(\)\[\]\"\'\“\_\+\”\%\/\‘\’]", " ", text).split())
    text = text.translate(remove_digits).translate(remove_punctuation)
        
    # Lower case to avoid case sensitive problems
    text = text.lower()
        
    # Replace emojis with names 
    text = emoji.demojize(text)
        
    # Add space between emojis and other characters
    ind = -2
    for c in range(text.count(':')):
        ind = text.find(':',ind+2)
        if c%2==0:
            newLetter = ' :'
        else:
            newLetter = ': '
        text ="".join((text[:ind],newLetter,text[ind+1:]))
            
    # Replace emoji names with spanish meaning
    result = []
    parts = text.split(' ')
    for part in parts:
        if part:
            if part[0]==':':
                em = handle_emoji_tone(part)
                em = emoji_meaning(em)
                if em:
                    result.append(em)
            else:
                if part not in stop_words:
                    if part not in word_counts:
                        word_counts[part] = 1
                    else:
                        word_counts[part] += 1
                result.append(part)
        
    text = ' '.join(result)
        
    # Filter using NLTK library append it to a string
    word_tokens = word_tokenize(text)
        
    result = [w for w in word_tokens if not w in stop_words]
        
    text = ' '.join(result)
    
    if len(text)>12:
        analysis = TextBlob(text)
        try:
            eng = analysis.translate(to='en')
            sentiment = eng.sentiment     
            polarity = sentiment.polarity
            subjectivity = sentiment.subjectivity
            time.sleep(.45)
        except Exception as e:
            if str(e)=='HTTP Error 429: Too Many Requests':
                print('Error')
                print('Last user:', user_id)
                exit()
            polarity = 0.0
            subjectivity = 0.0
    
    else:
        polarity = 0.0
        subjectivity = 0.0
    

    return text, length, polarity, subjectivity

tones = ['_light_skin_tone','_medium-light_skin_tone','_medium_skin_tone', 
             '_medium-dark_skin_tone','_dark_skin_tone']

# Method that removes the tone from emojis
def handle_emoji_tone(emoji):  
    for t in tones:
        if t in emoji:
            tone = t
            return emoji.replace(tone,'')
    return emoji


In [6]:
# Import user data
user_info=pd.read_csv('https://raw.githubusercontent.com/cpenalozag/twitter_network/master/network-data/user_info.csv')
user_ids = list(user_info['id'])

In [None]:
# List to hold part of the dataset
text_analysis = []
text_analysis.append(['id', 'tweet_id','engagement',
            'effective_length', 'polarity', 'subjectivity'])

# List to store average engagement
avg_eng = []
avg_eng.append(['id', 'average_engagement', 'common_words', 'common_ht_words',
               'common_hts'])

''' Text analysis '''

last_uid = 244592074
found_start = False

#for n in G.nodes():
for user_id in user_ids:
    if not found_start and user_id != last_uid:
        continue
    else:
        found_start = True
    
    # Total engagement
    eng = 0
    tweet_count = 0
    
    # Frequent word dictionary
    freq_words = {}

    # Frequent hashtag dictionary
    freq_hashtags = {}

    # Frequent words in hashtags dictionary
    freq_ht_words = {}
    
    user_tweets = tweets.loc[tweets['id'] == user_id]
    for index, row in user_tweets.iterrows():
      
        # Add current engagement
        currEng = int(row['favorite_count'])+int(row['retweet_count'])
        eng += currEng
        tweet_count +=1
        
        # Get hashtags
        ht = row['hashtags'].split(';') if row['hashtags'] != '-' else []
        
        # Update hashtag count
        for hashtag in ht:
            if hashtag not in freq_hashtags:
                freq_hashtags[hashtag] = 1
            else:
                freq_hashtags[hashtag] += 1
        
        # Separate hashtags by capitalization
        ht_words = [re.findall('[a-zA-Z][^A-Z]*', w) for w in ht]
        
        # Create a list with all the words in the hashtags
        hts = []
        for h in ht_words:
            hts = hts + h
        
        # Remove digits and lower caps for every hashtag word
        hts = [item.translate(remove_digits).lower() for item in hts]
        
        # Update hashtag word frequencies
        for word in hts:
            if word not in stop_words:
                if word not in freq_ht_words:
                    freq_ht_words[word] = 1
                else:
                    freq_ht_words[word] += 1
        
        text, length, polarity, subjectivity = process_text(row['text'], freq_words, user_id)
        
        # Update text in  data frame
        text_analysis.append([user_id, row['tweet_id'], currEng, length, polarity, subjectivity])
    
    print(tweet_count, 'tweets analysed for user', user_id)
    if tweet_count > 0:
        average_engagement = eng / tweet_count
    else:
        average_engagement = 0
    c_words = Counter(freq_words)
    c_ht_words = Counter(freq_ht_words)
    c_hashtags = Counter(freq_hashtags)
    avg_eng.append([user_id, average_engagement, c_words.most_common(3),
                   c_ht_words.most_common(3), c_hashtags.most_common(3)])

20 tweets analysed for user 244592074.0
190 tweets analysed for user 251983977.0
137 tweets analysed for user 15232111.0
151 tweets analysed for user 371797057.0
78 tweets analysed for user 546871427.0
200 tweets analysed for user 1439221506.0
56 tweets analysed for user 205347208.0
83 tweets analysed for user 38373082.0
66 tweets analysed for user 13438282.0
107 tweets analysed for user 58531272.0
0 tweets analysed for user 9.89e+17
62 tweets analysed for user 187579797.0
53 tweets analysed for user 350462935.0
12 tweets analysed for user 55510331.0
208 tweets analysed for user 197194717.0
111 tweets analysed for user 2327688362.0
0 tweets analysed for user 8.58e+17
63 tweets analysed for user 138922361.0
0 tweets analysed for user 1.09e+18
101 tweets analysed for user 522398369.0
93 tweets analysed for user 76666825.0
30 tweets analysed for user 247379224.0
107 tweets analysed for user 101486124.0
263 tweets analysed for user 1678649304.0
180 tweets analysed for user 89485410.0
169 t

In [11]:
print(len(text_analysis), len(avg_eng))
print(avg_eng[-1])
last_id = 244592074.0
text_analysis = [l for l in text_analysis if not last_id in l]
avg_eng = [l for l in avg_eng if not last_id in l]
print(len(text_analysis), len(avg_eng))

17822 170
[268322810.0, 547.6301369863014, [('ministra', 26), ('transporte', 22), ('moción', 18)], [('ministra', 5), ('transporte', 5), ('salir', 5)], [('MinistraTransporteDebeSalir', 5), ('SenadoDebeVotarSÍ', 2), ('SenadoDebeVotarSí', 2)]]
17822 170


In [12]:
# Save tweet analysis in a dataframe
analysis_results = pd.DataFrame(text_analysis[1:])
analysis_results.columns = text_analysis[0]
analysis_results["id"] = pd.to_numeric(analysis_results["id"])
analysis_results.head()

Unnamed: 0,id,tweet_id,engagement,effective_length,polarity,subjectivity
0,2440936000.0,1.16782e+18,0,6,0.0,0.0
1,2440936000.0,1.16742e+18,0,65,0.1,0.15
2,2440936000.0,1.16726e+18,0,73,0.1,0.333333
3,2440936000.0,1.16671e+18,0,81,0.0,0.0
4,2440936000.0,1.16653e+18,1,74,-0.666667,1.0


In [13]:
# Save user engagement in a dataframe
user_engagement = pd.DataFrame(avg_eng[1:])
user_engagement.columns = avg_eng[0]
user_engagement["id"] = pd.to_numeric(user_engagement["id"])
user_engagement.head()

Unnamed: 0,id,average_engagement,common_words,common_ht_words,common_hts
0,2440936000.0,0.531414,"[(rights, 109), (human, 108), (the, 48)]",[],"[(MNPSMVP, 1)]"
1,69051400.0,2.641176,"[(colombia, 91), (digital, 69), (educa, 44)]","[(colombia, 68), (digital, 48), (educa, 44)]","[(Colombia40, 45), (EducaDigital, 30), (Lidera..."
2,138099100.0,2.5,"[(gracias, 8), (hno, 7), (bendiciones, 6)]","[(crack, 2), (alex, 1), (martinez, 1)]","[(TIGRE, 2), (Crack, 2), (AlexMartinez, 1)]"
3,73567050.0,1.753425,"[(noticias, 74), (comisión, 16), (opinión, 16)]","[(noticias, 74), (opinión, 16), (comisión, 13)]","[(AIL, 76), (Noticias, 53), (Opinión, 16)]"
4,179246200.0,55.469388,"[(gracias, 21), (aire, 7), (semana, 6)]","[(hazañas, 4), (maestras, 4), (santrich, 3)]","[(Santrich, 3), (Colombia, 2), (HazañasMaestra..."


In [14]:
analysis_results = pd.merge(analysis_results, user_engagement, on='id')
len(analysis_results)

17727

In [15]:
file_name = 'analysis_results.csv'
if not os.path.exists(file_name):
    analysis_results.to_csv(file_name, index=False)
else:
    analysis_results.to_csv(file_name, mode='a', header=False, index=False)

In [None]:
# Check last merge with lenghts to avoid duplicates

In [46]:
# Merge user and tweet data
processed_data = pd.merge(user_info, analysis_results, on='id')
processed_data.head()

Unnamed: 0,id,katz,betweenness,authority,screen_name,followers,friends,verified,created_at,listed,tipo,average_engagement,common_words,common_ht_words,common_hts,tweet_id,engagement,effective_length,polarity,subjectivity
0,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.14285e+18,42,125,0.6,0.8
1,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.14257e+18,1441,62,0.0,0.0
2,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.14149e+18,611,20,0.2,0.4
3,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.14142e+18,16,184,0.366667,0.5
4,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.14113e+18,180,261,-0.6,0.9
5,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.14e+18,39,45,0.3,0.2
6,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.13898e+18,16,126,0.416667,0.616667
7,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.13811e+18,61,89,0.0,0.0
8,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.13569e+18,20,183,-0.145833,0.875
9,426146744.0,1.580982,0.000668,2.9e-05,SteevenOrozco,655637,476,False,2011,842,opinion,457.25,"[(creyentes, 4), (bavaria, 4), (rt, 3)]","[(creyentes, 4), (bavaria, 3), (efecty, 1)]","[(Creyentes, 4), (MeUnoBavaria, 3), (EfectyElG...",1.13091e+18,14,148,0.383333,0.6


In [None]:
tweets = tweets.drop(['text','phone','hashtags','mentions'], axis=1)
processed_data = pd.merge(processed_data, tweets, on='id')
processed_data.head()

In [47]:
# Write processed data to file
processed_data.to_csv('processed_data.csv', index=False)