# **Global Sentiment**

En este cuaderno mostraremos la forma de extraer el *Sentimiento Global*, métrica que hemos desarrollado para este proyecto.

## Imports y Parámetros

In [6]:
import datetime
from datetime import timedelta

import os
import re

import snscrape.modules.twitter as snstwitter

import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

Podemos escoger las fechas que queremos analizar. Para poder analizarlas, debemos tener descargados los archivos de las fechas correspondientes; si no, no podremos ejecutar el análisis.

In [7]:
# Analysis window as ISO dates (YYYY-MM-DD). The loader walks day by day
# from date_init and stops before reaching date_limit.
date_init, date_limit = "2014-01-01", "2018-05-30"

## **Read Databases**

Definimos la carpeta donde se encuentran los datos así como los nombres de los archivos.

In [8]:
# Data location: a root folder with one sub-folder per day, and the names of
# the two CSV files expected inside each daily folder.
t_path, t_file, s_file = (
    "../JABA/data/tweets",         # root of the per-day folders
    "tweet_list.csv",              # per-day tweet file
    "tweet_sentiment_nltk.csv",    # per-day sentiment file (presumably NLTK scores - confirm)
)

La base de datos está formada por millones de filas y no usaremos todas las columnas, por lo que, para acelerar el proceso, eliminaremos las columnas no usadas.

In [9]:
def get_all_data(start=None, end=None, data_dir=None, tweet_filename=None):
    """Load the daily tweet CSVs of a date range into a single DataFrame.

    Every day in the half-open interval ``[start, end)`` is looked up as
    ``<data_dir>/<YYYY-MM-DD>/<tweet_filename>`` and read as a ';'-separated
    CSV. Days whose file does not exist are skipped and summarised in one
    printed line instead of aborting the whole load. Only the tweet file is
    read; the per-day sentiment file is not needed to build this frame.

    Args:
        start: First day to load, as 'YYYY-MM-DD'. Defaults to the
            notebook-level ``date_init``.
        end: Day at which loading stops (exclusive), as 'YYYY-MM-DD'.
            Defaults to the notebook-level ``date_limit``.
        data_dir: Folder containing one sub-folder per day. Defaults to the
            notebook-level ``t_path``.
        tweet_filename: Name of the CSV inside each daily folder. Defaults
            to the notebook-level ``t_file``.

    Returns:
        The concatenation of all daily frames. Each daily frame keeps its own
        row index, so index values repeat across days. An empty DataFrame is
        returned when the range is empty or no file was found.

    Raises:
        ValueError: If ``start`` or ``end`` is not a 'YYYY-MM-DD' string.
    """
    # Fall back to the notebook configuration at call time, so re-running the
    # configuration cells is enough to change what gets loaded.
    start = date_init if start is None else start
    end = date_limit if end is None else end
    data_dir = t_path if data_dir is None else data_dir
    tweet_filename = t_file if tweet_filename is None else tweet_filename

    current_day = datetime.datetime.strptime(start, '%Y-%m-%d').date()
    stop_day = datetime.datetime.strptime(end, '%Y-%m-%d').date()

    frames = []
    missing_days = []

    while current_day < stop_day:
        # Lightweight progress marker: one line per calendar year.
        if current_day.day == 1 and current_day.month == 1:
            print(f"Current Date {str(current_day)}")

        tweet_file = os.path.join(data_dir, str(current_day), tweet_filename)
        if os.path.isfile(tweet_file):
            frames.append(pd.read_csv(tweet_file, sep=";"))
        else:
            missing_days.append(str(current_day))

        current_day += timedelta(days=1)

    if missing_days:
        print(
            f"Skipped {len(missing_days)} day(s) without {tweet_filename} "
            f"(first: {missing_days[0]}, last: {missing_days[-1]})"
        )

    # pd.concat raises on an empty list, so handle "nothing loaded" explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=False)

In [10]:
# Load every daily tweet file of the configured date range into one DataFrame.
df = get_all_data()
print("Extraction Completed!")

Current Date 2014-01-01
Current Date 2015-01-01
Current Date 2016-01-01
Current Date 2017-01-01
Current Date 2018-01-01
Extraction Completed!


In [11]:
# Total number of rows loaded.
len(df) 

18477101

In [12]:
# Literal emoticons/emojis used as sentiment markers. These plain strings are
# what remove_emojis() strips from the text with str.replace().
positive_text_f = [':)', ':D', '=D','=)','😊', '🚀', '🔥','😋', '💰', '📈','💯']
negative_text_f = [':(','=(', ':c', ':C', '☹️', '😢', '😭', '🙁', '😟', '😒', '😔', '📉','💀']

# Regex-safe versions of the same markers, for '|'.join(...) patterns passed to
# str.contains(). They are derived with re.escape so the two pairs of lists
# cannot drift apart and no hand-written backslash escapes (invalid escape
# sequences in normal string literals) are needed.
positive_text = [re.escape(marker) for marker in positive_text_f]
negative_text = [re.escape(marker) for marker in negative_text_f]

In [13]:
# Flag tweets containing at least one positive / negative marker. Each regex
# scan over the whole frame is done once; na=False makes a missing text count
# as "no match" instead of producing a NaN mask that cannot be used to index.
positive_mask = df['Text'].str.contains('|'.join(positive_text), na=False)
negative_mask = df['Text'].str.contains('|'.join(negative_text), na=False)

positive = df.loc[positive_mask, 'Text'].tolist()
negative = df.loc[negative_mask, 'Text'].tolist()
# "Contains any marker" is the union of the two masks, so no third scan is needed.
all_sent = df.loc[positive_mask | negative_mask, 'Text'].tolist()

In [14]:
# Subset sizes: positive, negative, then the tweets with any marker next to
# the plain sum of both subsets. The last two numbers differ when tweets
# carry both a positive and a negative marker.
print(
    len(positive),
    len(negative),
    "---",
    len(all_sent),
    len(positive) + len(negative),
    sep="\n",
)

358465
74601
---
415587
433066


In [15]:
def remove_emojis(data, filters = None):
    ''' Removes the sentiment markers (emoticons/emojis) from the sentences.

    Args:
        data: Mutable sequence of strings. It is updated in place.
        filters: Iterable of marker groups, each an iterable of literal
            strings to delete. When None, the notebook-level lists
            ``positive_text_f`` and ``negative_text_f`` are used.

    Returns:
        The same ``data`` object, with every marker occurrence removed from
        each sentence.
    '''
    if filters is None:
        # Resolved at call time (not at definition time) so that re-running the
        # cell defining the marker lists takes effect without redefining this
        # function.
        filters = [positive_text_f, negative_text_f]

    # Flatten once instead of walking the nested groups for every sentence.
    markers = [marker for group in filters for marker in group]

    for position, sentence in enumerate(data):
        for marker in markers:
            sentence = sentence.replace(marker, "")
        data[position] = sentence
    return data

## **Metric and Distance**

In [16]:
from sklearn.cluster import DBSCAN
import numpy as np
from math import ceil

In [17]:
def _as_index(value):
    """Convert a scalar or a one-element array row into a plain int position.

    Array rows (e.g. the shape-(1,) rows a clustering routine hands to a
    callable metric) are unpacked explicitly, because calling ``int()`` on an
    array with one or more dimensions is deprecated in recent NumPy.
    """
    if getattr(value, 'ndim', 0):
        value = value.item(0)
    return int(value)


def _word_sets(index1, index2, corpus=None):
    """Return the word sets (split on single spaces) of two texts of a corpus."""
    texts = positive if corpus is None else corpus
    return (
        set(texts[_as_index(index1)].split(' ')),
        set(texts[_as_index(index2)].split(' ')),
    )


def jacard_t(txt1, txt2):
    """Jaccard distance between two texts: 1 - |A ∩ B| / |A ∪ B| over their word sets."""
    words1 = set(txt1.split(' '))
    words2 = set(txt2.split(' '))
    return 1 - len(words1 & words2) / len(words1 | words2)


def jacard(index1, index2, corpus=None):
    """Jaccard distance between the texts at two positions of a corpus.

    Args:
        index1, index2: Positions of the texts (scalars or one-element arrays).
        corpus: Sequence of texts to index. Defaults to the notebook-level
            ``positive`` list.

    Returns:
        1 - |A ∩ B| / |A ∪ B|, where A and B are the word sets of both texts.
    """
    words1, words2 = _word_sets(index1, index2, corpus)
    return 1 - len(words1 & words2) / len(words1 | words2)


def soronsen(index1, index2, corpus=None):
    """Sørensen–Dice distance between the texts at two positions of a corpus.

    Args:
        index1, index2: Positions of the texts (scalars or one-element arrays).
        corpus: Sequence of texts to index. Defaults to the notebook-level
            ``positive`` list.

    Returns:
        1 - 2|A ∩ B| / (|A| + |B|), where A and B are the word sets of both texts.
    """
    words1, words2 = _word_sets(index1, index2, corpus)
    return 1 - 2 * len(words1 & words2) / (len(words1) + len(words2))


def overlap(index1, index2, corpus=None):
    """Overlap-coefficient distance between the texts at two positions of a corpus.

    Args:
        index1, index2: Positions of the texts (scalars or one-element arrays).
        corpus: Sequence of texts to index. Defaults to the notebook-level
            ``positive`` list.

    Returns:
        1 - |A ∩ B| / min(|A|, |B|), where A and B are the word sets of both texts.
    """
    words1, words2 = _word_sets(index1, index2, corpus)
    return 1 - len(words1 & words2) / min(len(words1), len(words2))

In [18]:
def _jaccard_by_position(batch, offset):
    """Build a Jaccard metric over positions ``offset .. offset + len(batch) - 1``.

    The word sets are computed once per batch instead of on every pairwise call.
    """
    word_sets = [set(text.split(' ')) for text in batch]

    def distance(row1, row2):
        # Rows are one-element arrays holding a (possibly float) position.
        words1 = word_sets[int(row1[0]) - offset]
        words2 = word_sets[int(row2[0]) - offset]
        return 1 - len(words1 & words2) / len(words1 | words2)

    return distance


def filter_spam(data, batch_size = 5000, verbose = False, metric = None, eps = 0.3):
    ''' Filters spam based on text similarity.

    The texts are processed in consecutive batches of ``batch_size``. Each
    batch is clustered with DBSCAN and only the texts labelled as noise (-1),
    i.e. the ones that did not end up in any cluster of similar texts, are kept.

    Args:
        data: Sequence of texts. It is not modified.
        batch_size: Number of texts clustered together.
        verbose: When True, prints a progress line with an ETA per batch.
        metric: Distance handed to DBSCAN. When None (default), the Jaccard
            distance between the word sets of the texts of ``data`` is used.
            A callable receives two one-element rows holding positions in
            ``data`` and must look the texts up itself.
        eps: DBSCAN neighbourhood radius.

    Returns:
        A new list with the texts that were not clustered, in input order.
    '''
    import time

    batches = ceil(len(data)/batch_size)
    filtered_data = []
    start_time = time.monotonic()

    for n_batch in range(batches):
        offset = batch_size * n_batch

        if verbose:
            if n_batch:
                # Average time of the completed batches times the batches left.
                elapsed = time.monotonic() - start_time
                eta_time = elapsed / n_batch * (batches - n_batch)
                time_output = 'ETA : %i:%02i' % ( eta_time//60, int(eta_time)%60 )
            else:
                time_output = 'ETA : --:--'
            batch_output = f'Current batch {n_batch+1} of {batches}.'
            # Pad so a shorter line fully overwrites the previous one after '\r'.
            print((batch_output + ' ' + time_output).ljust(60), end='\r')

        # Slicing past the end is safe, so the last batch needs no special case.
        batch = data[offset:offset + batch_size]

        # The default metric is bound to this batch of `data`, so the distances
        # always refer to the texts actually being filtered.
        if metric is None:
            batch_metric = _jaccard_by_position(batch, offset)
        else:
            batch_metric = metric

        # DBSCAN needs numeric samples: feed it the positions and let the
        # metric translate positions back into texts.
        X = np.arange(offset, offset + len(batch)).reshape(-1, 1)
        db = DBSCAN(eps=eps,  metric=batch_metric).fit(X)

        filtered_data.extend(
            text for text, label in zip(batch, db.labels_) if label == -1
        )

    if verbose and batches:
        print()  # finish the '\r' progress line

    return filtered_data

In [None]:
# Run the similarity-based spam filter on each class separately;
# verbose=True prints a per-batch progress line.
end_positive = filter_spam(positive, verbose = True)
end_negative = filter_spam(negative, verbose = True)

Current batch 47 of 72. ETA : 92:215

In [None]:
# For each class: size before filtering, size after filtering, retained
# fraction. max(..., 1) avoids a ZeroDivisionError when a class is empty.
print(len(positive))
print(len(end_positive))
print(len(end_positive)/max(len(positive), 1))
print("-"*10)
print(len(negative))
print(len(end_negative))
print(len(end_negative)/max(len(negative), 1))

In [None]:
# Labels: 1 for every kept positive tweet followed by 0 for every kept
# negative tweet, in the same order as the texts built below.
all_sentiment = [1] * len(end_positive) + [0] * len(end_negative)

# Texts of both classes with the emoticon/emoji markers removed.
all_sentence = remove_emojis(end_positive + end_negative)

# Assemble the labelled dataset and write it to disk.
all_map = dict(text=all_sentence, sentiment=all_sentiment)
final_df = pd.DataFrame(data=all_map)
final_df.to_csv('filter_sentiment.csv')

In [None]:
# Spot-check a slice of the cleaned sentences.
all_sentence[17960:18000]