In [0]:
!pip install emoji
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
from nltk.tokenize import word_tokenize
from sklearn.externals import joblib
from sklearn import preprocessing
import matplotlib.pyplot as plt
from string import punctuation
from textblob import TextBlob
from bs4 import BeautifulSoup
from string import digits
import urllib.request
import pandas as pd
import numpy as np
import matplotlib
import emoji
import numpy
import json
import re

%matplotlib inline



ModuleNotFoundError: ignored

In [0]:
# Load model
model = joblib.load('model.pkl')

# Load scaler
scaler = joblib.load('scaler.pkl')

# Load columns
columns = joblib.load('model_columns.pkl')

In [0]:
# Import spanish stop word dictionary
url_sw = 'https://raw.githubusercontent.com/cpenalozag/twitter_network/master/utils/stopwords-es.json'
response_sw = urllib.request.urlopen(url_sw)
data_sw = response_sw.read()

stop_words = set(json.loads(data_sw))

# Import emoji meanings
emoji_translations = pd.read_csv('https://raw.githubusercontent.com/cpenalozag/twitter_network/master/utils/emojis_translated.csv')

# Get spanish meaning of an emoji
def emoji_meaning(emoji):
    meaning = emoji_translations.loc[emoji_translations['emoji'] == emoji]['translation']
    return meaning.values[0] if not meaning.empty else ''

# Transformations to remove digits and punctuation
remove_digits = str.maketrans('', '', digits)
remove_punctuation = str.maketrans('', '', punctuation)

In [0]:
def find_urls(text):
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    return urls 

""" Processes text data in tweets
    
    text: text of tweet
    
    returns processed text, length of content, polarity, subjectivity
"""
def process_text(text):
    mentions = text.count('@')
    hashtags = text.count('#')
    urls = len(find_urls(text))

    # Remove links
    text = ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())    
        
    # Remove mentions
    text = ' '.join(re.sub("(@[A-Za-z0-9^\w]+)", " ", text.replace('@ ','@').replace('# ','#')).split())
        
    # Replace hashtags with words
    if text.count('#')>0:
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text.replace('#', ' ')))
            
    #Remove HTML tags
    text = BeautifulSoup(text).get_text()
    
    # Save content length (exluding links and mentions)
    length = len(text)
        
    # Remove punctuation symbols
    text = ' '.join(re.sub("[\.\,\¡\¿\!\?\:\;\-\=\*\(\)\[\]\"\'\“\_\+\”\%\/\‘\’]", " ", text).split())
    text = text.translate(remove_digits).translate(remove_punctuation)
        
    # Lower case to avoid case sensitive problems
    text = text.lower()
        
    # Replace emojis with names 
    text = emoji.demojize(text)
        
    # Add space between emojis and other characters
    ind = -2
    for c in range(text.count(':')):
        ind = text.find(':',ind+2)
        if c%2==0:
            newLetter = ' :'
        else:
            newLetter = ': '
        text ="".join((text[:ind],newLetter,text[ind+1:]))
            
    # Replace emoji names with spanish meaning
    result = []
    parts = text.split(' ')
    for part in parts:
        if part:
            if part[0]==':':
                em = handle_emoji_tone(part)
                em = emoji_meaning(em)
                if em:
                    result.append(em)
            else:
                result.append(part)
        
    text = ' '.join(result)
        
    # Filter using NLTK library append it to a string
    word_tokens = word_tokenize(text)
    result = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(result)
    
    # Check if text contains at least a word
    analysis = TextBlob(text)
    try:
        # Sentiment analysis
        eng = analysis.translate(to='en')
        sentiment = eng.sentiment     
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity

    except Exception as e:
      polarity = 0.0
      subjectivity = 0.0

    result = {
        'no_hashtags': [hashtags],
        'no_mentions': [mentions],
        'no_urls': [urls],
        'effective_length': [length],
        'polarity': [polarity],
        'subjectivity': [subjectivity]
    }

    return result, text

tones = ['_light_skin_tone','_medium-light_skin_tone','_medium_skin_tone', 
             '_medium-dark_skin_tone','_dark_skin_tone']

# Method that removes the tone from emojis
def handle_emoji_tone(emoji):  
    for t in tones:
        if t in emoji:
            tone = t
            return emoji.replace(tone,'')
    return emoji

In [0]:
test = """#Cacerolazo23N Las personas se concentran en el Parque Nacional 
de Bogotá, para el tercer cacerolazo del #ParoNacional. 
Una manifestación en calma. ✌🏻 https://elespectador.com/noticias/ @elespectador"""
result, proccesed_text = process_text(test)
for i in result:
    print (i, result[i])

proccesed_text

no_hashtags [2]
no_mentions [1]
no_urls [1]
effective_length [157]
polarity [0.15000000000000002]
subjectivity [0.375]


'cacerolazo personas concentran parque nacional bogotá tercer cacerolazo paro nacional manifestación calma mano victoria'

In [0]:
time = 'afternoon'
times = ['afternoon','early morning','late night','morning','night','noon']

for t in times:
  if t==time:
    result[t] = [1]
  else:
    result[t] = [0]

result['sensitive'] = [0]
result['no_media'] = [0]

for i in result:
    print (i, result[i])

In [0]:
tweet_data = pd.DataFrame(result)
tweet_data['join'] = 0
tweet_data

Unnamed: 0,no_hashtags,no_mentions,no_urls,effective_length,polarity,subjectivity,afternoon,early morning,late night,morning,night,noon,sensitive,no_media,join
0,2,1,1,157,0.15,0.375,1,0,0,0,0,0,0,0,0


In [3]:
user_info = pd.read_csv('https://raw.githubusercontent.com/cpenalozag/twitter_network/master/network-data/user_info.csv')
user_info['join'] = 0
user_info.head()

Unnamed: 0,id,average_engagement,common_hts,partition,core,in_degree,clustering,closeness,betweenness,vote_rank,authority,hubs,pagerank,ambassador,community_hub,screen_name,followers,friends,verified,description,listed,tipo,join
0,113127283,4.540741,"[('LosArchivosInnovan', 16), ('SITA2019', 14),...",2,31,28,0.030954,0.366627,0.000902,0.002188,0.00112,0.000628,0.000616,0.062147,0.117582,ArchivoGeneral,26830,1062,True,Custodiamos protegemos y ponemos al servicio ...,328,informative,0
1,46389700,7.344186,"[('342Artes', 1), ('DíaInternacionaldelGato', ...",3,28,12,0.029095,0.344495,0.000293,0.0,0.000287,0.000633,0.00023,0.04581,0.083333,museodantioquia,122715,1735,False,Primer museo fundado en Antioquia hace 135 año...,540,government entity,0
2,53514952,326.972973,"[('NewProfilePic', 1), ('HackedByPaulettee', 1)]",3,21,12,0.214106,0.322901,0.000478,0.0,2.6e-05,9e-05,0.00036,0.017179,0.1875,PauletteeOfcl,78807,311,False,Youtuber | Influencer | Filmmaker • Misterio🕵🏼...,592,personal,0
3,1020885913,88.230769,"[('porristasmillos', 10), ('MasAzulQueNunca', ...",5,14,6,0.072115,0.322602,0.000291,0.0,4.1e-05,0.000457,0.000182,0.029622,0.029018,PorristasMILLOS,20965,109,False,Cuenta Oficial gracias al amor a MFC somos la...,38,sports,0
4,322623953,72.769231,"[('ElCasoWatson', 8), ('durosdeamar', 6), ('mu...",3,29,20,0.033784,0.356014,0.000448,0.0,0.000479,0.000575,0.000374,0.04581,0.25,Veronica_Orozco,470798,226,True,Actriz Colombiana. manager@mariaclaralopez.com,446,entertainment,0


In [0]:
data = user_info.merge(tweet_data, left_index=False, right_index=False)
data.head()

Unnamed: 0,id,average_engagement,common_hts,partition,core,in_degree,clustering,closeness,betweenness,vote_rank,authority,hubs,pagerank,ambassador,community_hub,screen_name,followers,friends,verified,description,listed,tipo,join,no_hashtags,no_mentions,no_urls,effective_length,polarity,subjectivity,afternoon,early morning,late night,morning,night,noon,sensitive,no_media
0,113127283,4.540741,"[('LosArchivosInnovan', 16), ('SITA2019', 14),...",2,31,28,0.030954,0.366627,0.000902,0.002188,0.00112,0.000628,0.000616,0.062147,0.117582,ArchivoGeneral,26830,1062,True,Custodiamos protegemos y ponemos al servicio ...,328,informative,0,2,1,1,157,0.15,0.375,1,0,0,0,0,0,0,0
1,46389700,7.344186,"[('342Artes', 1), ('DíaInternacionaldelGato', ...",3,28,12,0.029095,0.344495,0.000293,0.0,0.000287,0.000633,0.00023,0.04581,0.083333,museodantioquia,122715,1735,False,Primer museo fundado en Antioquia hace 135 año...,540,government entity,0,2,1,1,157,0.15,0.375,1,0,0,0,0,0,0,0
2,53514952,326.972973,"[('NewProfilePic', 1), ('HackedByPaulettee', 1)]",3,21,12,0.214106,0.322901,0.000478,0.0,2.6e-05,9e-05,0.00036,0.017179,0.1875,PauletteeOfcl,78807,311,False,Youtuber | Influencer | Filmmaker • Misterio🕵🏼...,592,personal,0,2,1,1,157,0.15,0.375,1,0,0,0,0,0,0,0
3,1020885913,88.230769,"[('porristasmillos', 10), ('MasAzulQueNunca', ...",5,14,6,0.072115,0.322602,0.000291,0.0,4.1e-05,0.000457,0.000182,0.029622,0.029018,PorristasMILLOS,20965,109,False,Cuenta Oficial gracias al amor a MFC somos la...,38,sports,0,2,1,1,157,0.15,0.375,1,0,0,0,0,0,0,0
4,322623953,72.769231,"[('ElCasoWatson', 8), ('durosdeamar', 6), ('mu...",3,29,20,0.033784,0.356014,0.000448,0.0,0.000479,0.000575,0.000374,0.04581,0.25,Veronica_Orozco,470798,226,True,Actriz Colombiana. manager@mariaclaralopez.com,446,entertainment,0,2,1,1,157,0.15,0.375,1,0,0,0,0,0,0,0


In [0]:
# Remove useless columns
dataset = data.drop(['common_hts','screen_name', 'description'], axis=1)

# Convert boolean data to numbers
dataset[["sensitive", "verified"]] *= 1

# One hot encoding for user type
one_hot2 = pd.get_dummies(dataset['tipo'])
dataset = dataset.drop('tipo',axis = 1)
dataset = dataset.join(one_hot2)

# One hot encoding for core
one_hot3 = pd.get_dummies(dataset['partition'])
dataset = dataset.drop('partition',axis = 1)
dataset = dataset.join(one_hot3)

# Set type for categorical variables
bool_vars = ['sensitive', 'verified']
time_vars = ['afternoon', 'early morning', 'late night', 'morning', 'night', 'noon']
type_vars = ['company', 'competition', 'education', 'entertainment', 'event', 'fans', 'gossip', 'government entity', 'informative', 'journalism', 'news', 'ngo', 'personal', 'politics', 'radio', 'religion', 'sports']
cat_vars = bool_vars
cat_vars.extend(time_vars)
cat_vars.extend(type_vars)

dataset[cat_vars] = dataset[cat_vars].astype('category')

# Transform attributes to [0,1]

attributes = ['core', 'no_hashtags', 'no_mentions', 'average_engagement', 'listed', 'no_urls', 'effective_length', 'no_media', 'polarity', 'in_degree', 'clustering', 'closeness', 'betweenness', 'vote_rank', 'authority', 'hubs', 'pagerank']
dataset[attributes] = scaler.fit_transform(dataset[attributes])

# Transformation for special variables: followers and friends (keep meaning)
dataset[['followers', 'friends']] = dataset[['followers', 'friends']].astype(np.int32)
dataset['followers'] = data['followers'] / 19000000
dataset['friends'] = data['friends'] / 180000
dataset = dataset[columns]
dataset.head()

Unnamed: 0,id,sensitive,no_hashtags,no_mentions,no_urls,no_media,effective_length,polarity,subjectivity,average_engagement,core,in_degree,clustering,closeness,betweenness,vote_rank,authority,hubs,pagerank,ambassador,community_hub,followers,friends,verified,listed,afternoon,early morning,late night,morning,night,noon,company,competition,education,entertainment,event,fans,gossip,government entity,informative,journalism,news,ngo,personal,politics,radio,religion,sports,0,1,2,3,4,5,6,7,8
0,113127283,0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.000182,1.0,0.088328,0.030954,0.700399,0.055236,0.002188,0.00112,0.000628,0.042614,0.062147,0.117582,0.001412,0.0059,1,0.017691,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,46389700,0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.000294,0.9,0.037855,0.029095,0.658118,0.017963,0.0,0.000287,0.000633,0.011514,0.04581,0.083333,0.006459,0.009639,0,0.029125,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,53514952,0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.013121,0.666667,0.037855,0.214106,0.616864,0.029262,0.0,2.6e-05,9e-05,0.021968,0.017179,0.1875,0.004148,0.001728,0,0.031929,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,1020885913,0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.00354,0.433333,0.018927,0.072115,0.616294,0.017836,0.0,4.1e-05,0.000457,0.007611,0.029622,0.029018,0.001103,0.000606,0,0.00205,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,322623953,0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.00292,0.933333,0.063091,0.033784,0.680122,0.027441,0.0,0.000479,0.000575,0.023076,0.04581,0.25,0.024779,0.001256,1,0.024055,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [0]:
result_list = []

for index, row in dataset.iterrows():
  entry = list(row)[1:]
  entry = numpy.reshape(entry, (1,-1))
  result_list.append((row['id'], model.predict(entry)[0]))

result_list.sort(key=lambda tup: tup[1], reverse=True)
print(result_list[:20])

[(58531272, 808.2150305995674), (280701704, 716.0107635455779), (1626545264, 676.9656763857749), (265714930, 640.5889779278774), (276675653, 603.1949713710629), (49849732, 537.9766662141787), (876948470, 517.9899542072259), (150327476, 504.8541103376574), (4899563849, 493.85485990875304), (1238774719, 491.1566405576529), (336145436, 479.3828163206359), (365005024, 458.9217860617214), (22488241, 455.11220799894477), (246967511, 449.68431118737254), (1108499743, 436.159352318925), (209915861, 434.96722206378104), (136112883, 433.91521289081436), (1117317140, 432.79059852413235), (77653794, 428.58296692585446), (1609987202, 424.6095634902314)]
