# Tweet Scraper

Este es el notebook que usaremos para el scrapeo de datos


## Librerías

Para el scrapeo usaremos las siguientes librerías:
* snscrape (módulo `snscrape.modules.twitter`)
* Pandas

In [None]:
import datetime
from datetime import timedelta
import snscrape.modules.twitter as snstwitter
import pandas as pd
from tqdm import tqdm
import seaborn as sns

## Parámetros



In [None]:
# Column labels for the scraped rows; the order must match the order in which
# get_tweets() appends the tweet attributes to each row.
columnNames = [
    'Datetime', 'Tweet Id', 'Text', 
    'NumReplies', 'NumRetweets', 'NumLikes', 
    'IDOriginalRetweeted', 'Username', 'isVerified'
]

# Search query template. `since`/`until` bound the day being scraped and `lang`
# restricts the tweet language (the operator is `lang:`; it was misspelled
# `land:`, which left the language placeholder without any effect).
condition_query = '"BTC" OR "bitcoin" since:{since} until:{until} lang:{lang}'

## Función obtener tweets


In [None]:
def get_tweets(date_from, date_until, max_tweets, lang="en"):
    """
        Scrape tweets day by day between two dates.

        Parameters:
        date_from (datetime.date): First day to scrape (inclusive).
        date_until (datetime.date): Day on which scraping stops (exclusive).
        max_tweets (int): Maximum number of tweets collected for EACH day,
            not for the whole date range.
        lang (str): Language code used for the search `lang:` operator.

        Returns:
        list: One row (list) per tweet with, in order: date, id, content,
        reply count, retweet count, like count, id of the retweeted tweet
        (None when the tweet is not a retweet), username and verified flag.
        An empty list is returned when date_from is not before date_until.
    """
    tweet_list = []
    one_day = timedelta(days=1)
    lang_filter = "lang:" + lang
    current_day = date_from

    # `<` rather than `!=`: a start date after the end date must yield no work
    # instead of looping forever.
    while current_day < date_until:
        print("Day " + str(current_day))
        started_at = datetime.datetime.now()

        query = condition_query.format(
            since=str(current_day),
            until=str(current_day + one_day),
            lang=lang
        )
        # Honour the requested language (it used to be hard-coded to "en").
        # Only append the operator when the template did not already produce it.
        if lang_filter not in query.split():
            query += " " + lang_filter

        for i, tweet in enumerate(snstwitter.TwitterSearchScraper(query).get_items()):
            if i >= max_tweets:
                break
            if i % 2500 == 0:
                print(i , " / " , max_tweets)

            # Store the id of the retweeted tweet, not the nested tweet object,
            # so the value matches the 'IDOriginalRetweeted' column.
            retweeted = tweet.retweetedTweet
            retweeted_id = retweeted.id if retweeted is not None else None

            tweet_list.append(
                [
                    tweet.date, tweet.id, tweet.content,
                    tweet.replyCount, tweet.retweetCount,
                    tweet.likeCount, retweeted_id,
                    tweet.user.username, tweet.user.verified
                ]
            )

        current_day += one_day
        # Elapsed wall-clock time for the day that was just scraped.
        print(datetime.datetime.now() - started_at)

    return tweet_list

## Ejemplo de uso

In [None]:
# Scraping window and the tweet cap that is handed to get_tweets().
date_from = datetime.date(2021, 1, 1)
date_until = datetime.date(2021, 1, 3)
max_tweet = 3000

# Output location, keyed by the start and end date of the window.
directory = f"data/tweets/{date_from}/{date_until}"
file_name = f"{directory}/tweet_list.csv"

In [None]:
# Scrape the configured window and load the rows straight into a DataFrame;
# the raw row list is not kept around afterwards.
tweet_df = pd.DataFrame(
    get_tweets(date_from, date_until, max_tweet),
    columns=columnNames,
)

In [None]:
from pathlib import Path

# Make sure the output folder (and any missing parents) exists, then persist
# the scraped tweets as a comma-separated file without the index column.
output_dir = Path(directory)
output_dir.mkdir(exist_ok=True, parents=True)
tweet_df.to_csv(file_name, index=False, sep=',')

### Lectura de ficheros ya existentes

In [None]:
# Reload a previously saved scrape; comma is pandas' default separator.
tweet_df = pd.read_csv(file_name)

## Análisis de sentimientos

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

### Preparación del análisis

En caso de ser nuestra primera ejecución, deberemos descargar un conjunto de datasets útiles para *nltk*.

In [None]:
# NLTK corpora / models fetched on first use of this notebook.
NLTK_RESOURCES = [
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]
nltk.download(NLTK_RESOURCES)

### Limpieza de dataset

In [None]:
import re
import emoji

def clean_tweet(tweet):
    """
    Normalise a tweet's text before sentiment scoring.

    Removes @mentions, links and emoji, drops the '#' sign while keeping the
    hashtag text, turns underscores into spaces and finally collapses all
    runs of whitespace into single spaces.

    Parameters:
    tweet (str): Raw tweet text.

    Returns:
    str: The cleaned text. Non-string input (e.g. None, or the NaN that
    pandas produces for an empty CSV cell) yields an empty string.
    """
    if not isinstance(tweet, str):
        return ""

    # Drop whole @mentions (handle included), not just the '@' sign.
    tweet = re.sub(r"@[A-Za-z0-9_]+", "", tweet)
    # Drop links and any leftover '@' token. `www\.` (instead of bare `www`)
    # keeps ordinary words that merely start with "www".
    tweet = re.sub(r"(?:@|https?://|www\.)\S+", "", tweet)

    # emoji.get_emoji_regexp() no longer exists in emoji 2.x; prefer
    # replace_emoji() and fall back to the old API on older installations.
    if hasattr(emoji, "replace_emoji"):
        tweet = emoji.replace_emoji(tweet, replace="")
    else:
        tweet = emoji.get_emoji_regexp().sub("", tweet)

    # Remove the hashtag sign but keep its text.
    tweet = tweet.replace("#", "").replace("_", " ")

    # Collapse whitespace last so the removals above cannot leave double spaces.
    return " ".join(tweet.split())

In [None]:
def get_tweet_sentiment_summary(data=None, n=100, verbose=False):
    """
    Average VADER compound sentiment of the first tweets in a DataFrame.

    Parameters:
    data (pandas.DataFrame): Frame with a 'Text' column. When omitted, the
        notebook-level `tweet_df` is looked up at call time.
    n (int): Number of leading rows to score. Values larger than the frame
        are clamped to the number of available rows.
    verbose (bool): When True, print each tweet before and after cleaning
        together with its polarity scores.

    Returns:
    float: Mean of the 'compound' scores over the rows actually scored, or
    NaN when there is nothing to score.
    """
    if data is None:
        # Resolved here rather than in the signature so that re-running the
        # cell that builds tweet_df is picked up without redefining this function.
        data = tweet_df

    sia = SentimentIntensityAnalyzer()

    # Slicing (unlike indexing row by row) silently stops at the end of the frame.
    texts = data['Text'].iloc[:max(n, 0)]
    scored = len(texts)
    if scored == 0:
        return float("nan")

    total_compound = 0.0
    for tweet in texts:
        clean_tweet_str = clean_tweet(tweet)
        polarity = sia.polarity_scores(clean_tweet_str)

        total_compound += polarity['compound']

        if verbose:
            print("\n")
            print(f"Before : {tweet}")
            print(f"After : {clean_tweet_str}")
            print(polarity)
            print("\n")
            print("-"*20)

    # Divide by the rows actually scored, which may be fewer than n.
    return total_compound / scored


In [None]:
# Mean compound sentiment over the first 2000 tweets of the loaded dataset.
get_tweet_sentiment_summary(tweet_df, n=2000)