# **Global Sentiment**

En este cuaderno mostraremos la forma de extraer el *Sentimiento Global*, métrica que hemos desarrollado para este proyecto.

## Imports y Parametros

In [1]:
import datetime
from datetime import timedelta

import os
import re

import snscrape.modules.twitter as snstwitter

import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

Podemos escoger las fechas que queremos analizar. Para poder analizarlas, debemos tener los archivos de las fechas correspondientes descargados, sino no podremos ejecutar el analisis.

In [2]:
date_init = "2014-01-01"
date_limit = "2016-08-01"

## **Read Databases**

Definimos la carpeta donde se encuentran los datos así como los nombres de los archivos.

In [3]:
t_path = "../JABA/data/tweets"
t_file = "tweet_list.csv"
s_file = "tweet_sentiment_nltk.csv"

La base de datos esta formada por millones de filas y no usaremos todas las columnas, por lo que, para acelerar el proceso, eliminaremos las columnas no usadas.

In [4]:
def get_all_data():
    frames = []
    date_from = datetime.datetime.strptime(date_init, '%Y-%m-%d').date()
    date_until = datetime.datetime.strptime(date_limit, '%Y-%m-%d').date()
    
    if date_from >= date_until:
        return pd.DataFrame()
    
    while date_from < date_until:
        
        folder = os.path.join(t_path, str(date_from))
        # TODO Check if file exists
        if date_from.day == 1 and date_from.month == 1:
            print(f"Current Date {str(date_from)}")

        tweet_file = os.path.join(folder, t_file)
        sentiment_file = os.path.join(folder, s_file)

        tweet_df = pd.read_csv(tweet_file, sep=";")

        sent_df = pd.read_csv(sentiment_file, sep=";")
        
        frames += [tweet_df]
        
        date_from = date_from + timedelta(days=1)
    
    return pd.concat(frames, ignore_index=False)

In [5]:
df = get_all_data()
print("Extraction Completed!")

Current Date 2014-01-01
Current Date 2015-01-01
Current Date 2016-01-01
Extraction Completed!


In [6]:
len(df) 

6923270

In [7]:
positive_text = [':\)', ':D', '=D','=\)','😊', '🚀', '🔥','😋', '💰', '📈','💯']
negative_text = [':\(','=\(', ':c', ':C', '☹️', '😢', '😭', '🙁', '😟', '😒', '😔','📉','💀']

positive_text_f = [':)', ':D', '=D','=)','😊', '🚀', '🔥','😋', '💰', '📈','💯']
negative_text_f = [':(','=(', ':c', ':C', '☹️', '😢', '😭', '🙁', '😟', '😒', '😔', '📉','💀']

In [8]:
positive = df[df['Text'].str.contains('|'.join(positive_text))]['Text'].tolist()
negative = df[df['Text'].str.contains('|'.join(negative_text))]['Text'].tolist()
all_sent = df[df['Text'].str.contains('|'.join(positive_text+negative_text))]['Text'].tolist()

In [9]:
print(len(positive))
print(len(negative))
print("---")
print(len(all_sent))
print(len(positive) + len(negative))

64886
28127
---
91797
93013


In [10]:
all_sentence = positive + negative

for i, sentence in enumerate(all_sentence):
    for element in positive_text_f:
        sentence =  sentence.replace(element, "")
    for element in negative_text_f:
        sentence =  sentence.replace(element, "")
        
    all_sentence[i] = sentence

In [11]:

all_sentiment = [1] * len(positive) + [0] * len(negative)
all_map = {'text':all_sentence, 'sentiment':all_sentiment} 
final_df = pd.DataFrame(all_map)

In [37]:
final_df.to_csv('sentiment.csv')

## **Metric and Distance**

In [12]:
from sklearn.cluster import DBSCAN
import numpy as np
from math import ceil

In [13]:
def jacard_t(txt1, txt2):
    words1 = set(txt1.split(' '))
    words2 = set(txt2.split(' '))
    return 1 - len(words1.intersection(words2)) / len(words1.union(words2))

def jacard(index1, index2):
    words1 = set(positive[int(index1)].split(' '))
    words2 = set(positive[int(index2)].split(' '))
    return 1 - len(words1.intersection(words2)) / len(words1.union(words2))

def soronsen(index1, index2):
    words1 = set(positive[int(index1)].split(' '))
    words2 = set(positive[int(index2)].split(' '))
    return 1 - 2 * len(words1.intersection(words2)) / ( len(words1) + len(words2))

def overlap(index1, index2):
    words1 = set(positive[int(index1)].split(' '))
    words2 = set(positive[int(index2)].split(' '))
    return 1 - len(words1.intersection(words2)) / ( min( len(words1), len(words2) ) )

In [14]:
def filter_spam(data, batch_size = 5000, verbose = False, metric = jacard, eps = 0.3):
    ''' Filters spam based on text similarity '''
    batches = ceil(len(data)/batch_size)

    filtered_data = []

    if verbose:
        import time
        start_time = time.time()
        
    for n_batch in range(batches):
        if verbose:
            last_batch_time =  time.time() - start_time
            start_time = time.time()
            
            eta_time = (batches + 1 - n_batch ) * last_batch_time 
            
            print(f"Current batch {n_batch} of {batches}")
            
            print("ETA: %i:%i" % ( eta_time//60, int(eta_time)%60 ) )
            
        if n_batch == batches - 1:
            batch = data[batch_size * n_batch:]
        else:
            batch = data[batch_size * n_batch:batch_size * (n_batch+1)]

        X = np.arange(batch_size * n_batch,  batch_size * n_batch + len(batch)).reshape(-1, 1)
        db = DBSCAN(eps=eps,  metric=metric).fit(X)
        
        if verbose:
            print(f"Number of batch labels found {max(db.labels_)}")
            
        for i,v in enumerate(db.labels_):
            if v == -1:
                filtered_data += [batch[i]]
                
    return filtered_data

In [15]:
end_positive = filter_spam(positive, verbose = True)
end_negative = filter_spam(negative, verbose = True)

Current batch 0 of 13
ETA: 0:0
Number of batch labels found 4
Current batch 1 of 13
ETA: 26:4
Number of batch labels found 5
Current batch 2 of 13
ETA: 24:41
Number of batch labels found 17
Current batch 3 of 13
ETA: 23:24
Number of batch labels found 14
Current batch 4 of 13
ETA: 21:6
Number of batch labels found 19
Current batch 5 of 13
ETA: 18:38
Number of batch labels found 34
Current batch 6 of 13
ETA: 16:35
Number of batch labels found 27
Current batch 7 of 13
ETA: 14:32
Number of batch labels found 25
Current batch 8 of 13
ETA: 12:23
Number of batch labels found 33
Current batch 9 of 13
ETA: 10:11
Number of batch labels found 42
Current batch 10 of 13
ETA: 8:0
Number of batch labels found 46
Current batch 11 of 13
ETA: 6:5
Number of batch labels found 55
Current batch 12 of 13
ETA: 4:0
Number of batch labels found 47
Current batch 0 of 6
ETA: 0:0
Number of batch labels found 4
Current batch 1 of 6
ETA: 11:40
Number of batch labels found 5
Current batch 2 of 6
ETA: 10:2
Number of

In [60]:
n_b = 12
print(max(tot[n_b][0]))
for i in range(0, len(tot[n_b][0])):
    if tot[n_b][0][i] == 45:
        print(positive[batch_size*n_b+i])

47
r/btc mod logs public, but posting/discussing the history censored/not allowed :) bitcoin today!
r/btc mod logs public, but posting/discussing the history censored/not allowed :): bitcoin btc
r/btc mod logs public, but posting/discussing the history censored/not allowed :) bitcoin blockchain
r/btc mod logs public, but posting/discussing the history censored/not allowed :) bitcoin blockchain cryptos r…
r/btc mod logs public, but posting/discussing the history censored/not allowed :) (via /r/bitcoin)


In [18]:
print(len(positive))
print(len(end_positive))
print(len(end_positive)/len(positive))
print("-"*10)
print(len(negative))
print(len(end_negative)/len(negative))

64886
51737
0.7973522793823012
----------
28127
0.8041383723824084


In [19]:
all_sentence = end_positive + end_negative
all_sentiment = [1] * len(end_positive) + [0] * len(end_negative)
all_map = {'text':all_sentence, 'sentiment':all_sentiment} 
final_df = pd.DataFrame(all_map)
final_df.to_csv('filter_sentiment.csv')

## **Tensorflow Fine Tuning**


In [1]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [9]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()
print(device_name)
# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')




SystemError: GPU device not found

In [1]:
import tensorflow_datasets as tfds

In [None]:
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews', 
          split = (tfds.Split.TRAIN, tfds.Split.TEST),
          as_supervised=True,
          with_info=True)
print('info', ds_info)

In [21]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [22]:
vocabulary = tokenizer.get_vocab()

print(list(vocabulary.keys())[5000:5020])

['knight', 'lap', 'survey', 'ma', '##ow', 'noise', 'billy', '##ium', 'shooting', 'guide', 'bedroom', 'priest', 'resistance', 'motor', 'homes', 'sounded', 'giant', '##mer', '150', 'scenes']


In [23]:
max_length_test = 20
test_sentence = 'Test tokenization sentence. Followed by another sentence'

# add special tokens

test_sentence_with_special_tokens = '[CLS]' + test_sentence + '[SEP]'

tokenized = tokenizer.tokenize(test_sentence_with_special_tokens)

print('tokenized', tokenized)

# convert tokens to ids in WordPiece
input_ids = tokenizer.convert_tokens_to_ids(tokenized)
  
# precalculation of pad length, so that we can reuse it later on
padding_length = max_length_test - len(input_ids)

# map tokens to WordPiece dictionary and add pad token for those text shorter than our max length
input_ids = input_ids + ([0] * padding_length)

# attention should focus just on sequence with non padded tokens
attention_mask = [1] * len(input_ids)

# do not focus attention on padded tokens
attention_mask = attention_mask + ([0] * padding_length)

# token types, needed for example for question answering, for our purpose we will just set 0 as we have just one sequence
token_type_ids = [0] * max_length_test

bert_input = {
    "token_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask
}
print(bert_input)

tokenized ['[CLS]', 'test', 'token', '##ization', 'sentence', '.', 'followed', 'by', 'another', 'sentence', '[SEP]']
{'token_ids': [101, 3231, 19204, 3989, 6251, 1012, 2628, 2011, 2178, 6251, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [24]:
bert_input = tokenizer.encode_plus(
                        test_sentence,                      
                        add_special_tokens = True, # add [CLS], [SEP]
                        max_length = max_length_test, # max length of the text that can go to BERT
                        pad_to_max_length = True, # add [PAD] tokens
                        return_attention_mask = True, # add attention mask to not focus on pad tokens
              )

print('encoded', bert_input)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


encoded {'input_ids': [101, 3231, 19204, 3989, 6251, 1012, 2628, 2011, 2178, 6251, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}




In [25]:

# can be up to 512 for BERT
max_length = 512
batch_size = 6

In [9]:
for review, label in tfds.as_numpy(ds_train.take(5)):
    print('review', review.decode()[0:50], label)

review This was an absolutely terrible movie. Don't be lu 0
review I have been known to fall asleep during films, but 0
review Mann photographs the Alberta Rocky Mountains in a  0
review This is the kind of film for a snowy Sunday aftern 1
review As others have mentioned, all the women that go nu 1


In [17]:
def convert_example_to_feature(review):
  
  # combine step for tokenization, WordPiece vector mapping, adding special tokens as well as truncating reviews longer than the max length
  
  return tokenizer.encode_plus(review, 
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = max_length, # max length of the text that can go to BERT
                pad_to_max_length = True, # add [PAD] tokens
                return_attention_mask = True, # add attention mask to not focus on pad tokens
              )


In [19]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }, label
def encode_examples(ds, limit=-1):
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if (limit > 0):
        ds = ds.take(limit)
    
    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)


In [26]:
# train dataset
ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(batch_size)
# test dataset
ds_test_encoded = encode_examples(ds_test).batch(batch_size)


In [27]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5

# we will do just 1 epoch for illustration, though multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 1

# model initialization
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# optimizer Adam
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_test_encoded)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method








Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.










  23/4167 [..............................] - ETA: 11:24:19 - loss: 0.7153 - accuracy: 0.4783

KeyboardInterrupt: 