# **Global Sentiment**

En este cuaderno mostraremos la forma de extraer el *Sentimiento Global*, métrica que hemos desarrollado para este proyecto.

## Imports y Parámetros

In [1]:
import datetime
from datetime import timedelta

import os
import re

import snscrape.modules.twitter as snstwitter

import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

Podemos escoger las fechas que queremos analizar. Para poder analizarlas, debemos tener descargados los archivos de las fechas correspondientes; si no, no podremos ejecutar el análisis. La fecha final (`date_limit`) es exclusiva: el último día procesado es el anterior.

In [2]:
# Analysis window, written as "YYYY-MM-DD" strings.
date_init = "2014-01-01"
date_limit = "2014-05-10"

# Parse both bounds into datetime.date objects in a single pass.
date_from, date_until = (
    datetime.datetime.strptime(bound, '%Y-%m-%d').date()
    for bound in (date_init, date_limit)
)

## **Read Databases**

Definimos la carpeta donde se encuentran los datos así como los nombres de los archivos.

In [3]:
# Root folder of the tweet data; get_positive_negative_data joins it with one
# sub-folder per date (named str(date)).
t_path = "../JABA/data/tweets"
# Name of the tweet CSV read from each per-date folder (parsed with sep=";").
t_file = "tweet_list.csv"
# Presumably the per-date sentiment file; it is not referenced by any code
# visible in this notebook -- TODO confirm it is still needed.
s_file = "tweet_sentiment_nltk.csv"

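La base de datos está formada por millones de filas y no usaremos todas las columnas, por lo que, para acelerar el proceso, eliminaremos las columnas no usadas. Para etiquetar los tweets, definimos a continuación listas de emoticonos y emojis positivos y negativos, y seleccionamos los textos que contienen alguno de ellos.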

In [4]:
# Regex alternatives marking a tweet as positive / negative. Parentheses are
# escaped because these strings are joined into a regular expression; raw
# strings keep the backslash literal without an invalid-escape-sequence warning.
positive_text = [r':\)', ':D', '=D', r'=\)', '😊', '🚀', '🔥', '😋', '💰', '📈', '💯']
negative_text = [r':\(', r'=\(', ':c', ':C', '☹️', '😢', '😭', '🙁', '😟', '😒', '😔', '📉', '💀']

# The same markers as plain (unescaped) strings, for literal str.replace removal.
positive_text_f = [':)', ':D', '=D','=)','😊', '🚀', '🔥','😋', '💰', '📈','💯']
negative_text_f = [':(','=(', ':c', ':C', '☹️', '😢', '😭', '🙁', '😟', '😒', '😔', '📉','💀']

def apply_filter(data):
    """Split tweet texts by the emoticon/emoji markers they contain.

    Args:
        data: DataFrame with a string column named 'Text'.

    Returns:
        (positive, negative): two lists of texts, in row order. The two
        checks are independent, so a text holding both a positive and a
        negative marker appears in both lists.
    """
    texts = data['Text']
    # na=False treats a missing Text as "no match"; without it the mask holds
    # NaN and boolean indexing raises.
    positive_mask = texts.str.contains('|'.join(positive_text), na=False)
    negative_mask = texts.str.contains('|'.join(negative_text), na=False)

    return texts[positive_mask].tolist(), texts[negative_mask].tolist()

In [5]:
def get_positive_negative_data(date_from, date_until):
    """Collect marker-tagged tweet texts for each day in [date_from, date_until).

    For every day, reads ``<t_path>/<str(day)>/<t_file>`` (semicolon-separated)
    and passes it through ``apply_filter``.

    Args:
        date_from: first day to process (inclusive).
        date_until: day at which to stop (exclusive).

    Returns:
        (end_positive, end_negative, total_number): the accumulated positive
        texts, the accumulated negative texts, and the total number of rows
        read across all daily files.

    Raises:
        Any error ``pd.read_csv`` raises for a missing or unreadable daily
        file; no day is skipped.
    """
    end_positive, end_negative = [], []
    total_number = 0

    # Walk the window with a local cursor instead of rebinding the parameter.
    current = date_from
    while current < date_until:

        # Sparse progress log: only on the first day of each year.
        if current.day == 1 and current.month == 1:
            print(f"Current Date {current}")

        tweet_file = os.path.join(t_path, str(current), t_file)
        # Only 'Text' is used downstream; skipping the other columns cuts
        # parsing time and memory on these large files.
        tweet_df = pd.read_csv(tweet_file, sep=";", usecols=['Text'])

        total_number += len(tweet_df)

        positive, negative = apply_filter(tweet_df)
        end_positive.extend(positive)
        end_negative.extend(negative)

        current += timedelta(days=1)

    return end_positive, end_negative, total_number

In [6]:
# Run the extraction over the configured window. df_len receives the total
# number of rows read across all daily files.
positive, negative, df_len = get_positive_negative_data(date_from, date_until)
print("Extraction Completed!")

Current Date 2014-01-01
Extraction Completed!


In [7]:
# Positive count, negative count, a separator, then their sum (one per line).
# A tweet carrying both kinds of marker is in both lists, so the sum counts it twice.
print(len(positive), len(negative), "---", len(positive) + len(negative), sep="\n")

15349
3949
---
19298


In [8]:
def remove_emojis(data, filters = [positive_text_f, negative_text_f]):
    """Strip every marker listed in ``filters`` from each sentence of ``data``.

    Markers are removed one at a time, in list order, by literal string
    replacement. ``data`` is modified in place and the same object is returned.
    """
    for position, text in enumerate(data):
        # Flatten the groups so each marker is applied in its original order.
        for marker in (token for group in filters for token in group):
            text = text.replace(marker, "")
        data[position] = text
    return data

## **Metric and Distance**

In [9]:
from JABA.service.scraper.spam import filtering, metrics

In [None]:
# NOTE(review): Pool is imported here but not used by any code visible in this notebook.
from multiprocessing import Pool

# NOTE(review): the return value of this call is discarded, and a later cell
# runs the same filter on `positive` again -- confirm whether this cell is
# still needed.
filtering.filter_spam(positive, verbose = True)    

In [19]:
# Run the spam filter on both lists. filter_spam must be reached through the
# imported `filtering` module: the bare name is never defined in this
# notebook, so calling it unqualified fails on a fresh kernel.
end_positive = filtering.filter_spam(positive, verbose = True)
end_negative = filtering.filter_spam(negative, verbose = True)

Current batch 15 of 15. ETA : 4: 965

In [20]:
# Positive tweets: count before filtering, count after, and the kept fraction.
print(len(positive), len(end_positive), len(end_positive)/len(positive), sep="\n")
print("-"*10)
# Negative tweets: count before filtering and the kept fraction.
# NOTE(review): unlike the positive case, len(end_negative) itself is not printed.
print(len(negative), len(end_negative)/len(negative), sep="\n")

358465
281398
0.78500829927608
----------
74601
0.7715982359485798


In [21]:
# Positives first, then negatives. The concatenation builds a new list, so
# remove_emojis edits that copy and leaves end_positive / end_negative untouched.
all_sentence = remove_emojis(end_positive + end_negative)

# Labels aligned with all_sentence: 1 for each positive text, 0 for each negative one.
all_sentiment = [1] * len(end_positive)
all_sentiment += [0] * len(end_negative)

# Assemble the labelled dataset and write it to CSV in the working directory.
all_map = dict(text=all_sentence, sentiment=all_sentiment)
final_df = pd.DataFrame(all_map)
final_df.to_csv('filter_sentiment.csv')

In [None]:
# Spot-check 40 cleaned sentences (indices 17960 to 17999).
all_sentence[17960:18000]