# Tweet Scraper

Este es el notebook que usaremos para el scrapeo de datos de Twitter.


## Librerías

Para el scrapeo usaremos las siguientes librerías:
* snscrape (módulo `snscrape.modules.twitter`, importado como `sntwitter`)
* Pandas

In [63]:
import datetime
from datetime import timedelta
import snscrape.modules.twitter as sntwitter
import pandas as pd
from tqdm import tqdm
import seaborn as sns

## Parámetros



In [64]:
# Column labels for the tweet DataFrames. The order has to match the order of
# the per-tweet values collected by get_tweets.
columnNames = [
    'Datetime',
    'Tweet Id',
    'Text',
    'NumReplies',
    'NumRetweets',
    'NumLikes',
    'IDOriginalRetweeted',
    'Username',
    'isVerified',
]

# Search query template; `since` and `until` are filled in with dates through
# str.format before each search.
condition_query = '"BTC" OR "bitcoin" since:{since} until:{until}'

## Función para obtener tweets


In [65]:
def get_tweets(date_from, date_until, max_tweets=20000):
    """
    Scrape tweets day by day over a date range.

    One search is run per calendar day, using the module-level
    ``condition_query`` template filled in with that day (``since``) and the
    following day (``until``). Progress and the elapsed time of each day are
    printed as the scrape advances.

    Parameters:
    date_from (datetime.date): First day to scrape (inclusive).
    date_until (datetime.date): Day on which scraping stops (exclusive).
        If it is not later than ``date_from`` nothing is scraped.
    max_tweets (int): Maximum number of tweets collected for each day.
        Defaults to 20000.

    Returns:
    list: One inner list per tweet holding, in order: date, id, content,
        reply count, retweet count, like count, id of the retweeted tweet
        (None when the tweet has no retweeted tweet), username and the
        user's verified flag.
    """
    one_day = timedelta(days=1)
    tweet_list = []
    current_day = date_from

    # `<` rather than `!=`: with `!=` a start date later than the end date
    # would step forward forever without ever meeting the stop condition.
    while current_day < date_until:
        print("Day " + str(current_day))
        started_at = datetime.datetime.now()

        next_day = current_day + one_day
        query = condition_query.format(since=str(current_day), until=str(next_day))

        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i >= max_tweets:
                break
            if i % 2500 == 0:
                print(i , " / " , max_tweets)

            # The target column is an id column, so keep only the id of the
            # retweeted tweet instead of the whole nested tweet object.
            retweeted = tweet.retweetedTweet
            retweeted_id = retweeted.id if retweeted is not None else None

            tweet_list.append(
                [
                    tweet.date, tweet.id, tweet.content,
                    tweet.replyCount, tweet.retweetCount,
                    tweet.likeCount, retweeted_id,
                    tweet.user.username, tweet.user.verified
                ]
            )

        current_day = next_day
        # Elapsed wall-clock time for this day.
        print(datetime.datetime.now() - started_at)

    return tweet_list

In [43]:
# Scrape from 2021-01-01 up to, but not including, 2021-05-21.
date_from = datetime.date(2021, 1, 1)
date_until = datetime.date(2021, 5, 21)
# The per-day cap must be passed to get_tweets; 20000 is the cap shown in
# this cell's recorded progress output.
tweet_list = get_tweets(date_from, date_until, 20000)

Day 2021-01-01
0  /  20000
1000  /  20000
2000  /  20000
3000  /  20000
4000  /  20000
5000  /  20000
6000  /  20000
7000  /  20000
8000  /  20000
9000  /  20000
10000  /  20000
11000  /  20000
12000  /  20000
13000  /  20000
14000  /  20000
15000  /  20000
16000  /  20000
17000  /  20000
18000  /  20000
19000  /  20000
20000  /  20000
0:04:36.663125
Day 2021-01-02
0  /  20000
1000  /  20000
2000  /  20000
3000  /  20000
4000  /  20000
5000  /  20000
6000  /  20000
7000  /  20000
8000  /  20000
9000  /  20000
10000  /  20000
11000  /  20000
12000  /  20000
13000  /  20000
14000  /  20000
15000  /  20000
16000  /  20000
17000  /  20000
18000  /  20000
19000  /  20000
20000  /  20000
0:04:28.352077
Day 2021-01-03
0  /  20000
1000  /  20000
2000  /  20000
3000  /  20000
4000  /  20000
5000  /  20000
6000  /  20000
7000  /  20000
8000  /  20000
9000  /  20000
10000  /  20000
11000  /  20000
12000  /  20000
13000  /  20000
14000  /  20000
15000  /  20000
16000  /  20000
17000  /  20000
1800

In [44]:
# Turn the scraped rows into a table, labelling the columns with columnNames.
tweet_df = pd.DataFrame(data=tweet_list, columns=columnNames)

# Preview the first five rows of the table.
tweet_df.head(n=5)

Unnamed: 0,Datetime,Tweet Id,Text,NumReplies,NumRetweets,NumLikes,IDOriginalRetweeted,Username,isVerified
0,2021-05-20 23:59:59+00:00,1395529770847416327,#XRP/USDT #Binance #Kripto #Otomatik \n#Sinyal...,1,0,0,,kucuktakipci,False
1,2021-05-20 23:59:56+00:00,1395529758625173504,@Melt_Dem @compass_mining He is still in Seaso...,0,0,0,,proofofwork1,False
2,2021-05-20 23:59:56+00:00,1395529756003782657,@TJCobain #Gracie #btc 3DezN9dYRDQKmWt3hqT58SF...,0,0,0,,OldtimeComputer,False
3,2021-05-20 23:59:55+00:00,1395529753919164416,@DenizSaaat @microbtminer @bitmain @CanaanInc1...,1,1,2,,Jermeh8,False
4,2021-05-20 23:59:54+00:00,1395529747854200834,@glasstoken @shibinformer The only token where...,0,0,1,,Jack53792519,False


In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
tweet_df.describe()

Unnamed: 0,Tweet Id,NumReplies,NumRetweets,NumLikes
count,2800140.0,2800140.0,2800140.0,2800140.0
mean,1.395508e+18,0.9935882,1.283625,9.0653
std,12060030000000.0,12.47524,22.69849,174.9703
min,1.395488e+18,0.0,0.0,0.0
25%,1.395497e+18,0.0,0.0,0.0
50%,1.395507e+18,0.0,0.0,0.0
75%,1.395518e+18,1.0,0.0,2.0
max,1.39553e+18,961.0,1995.0,20968.0


In [46]:
# Save the scraped tweets as a comma-separated file, leaving out the DataFrame index.
tweet_df.to_csv('tweets_2021.csv', sep=',', index=False)

In [50]:
# Also write the DataFrame's plain-text rendering to a .txt file.
tweet_df.to_string('tweets_2021.txt')

In [None]:
# Scrape January 2021 only; the end date is exclusive, so 2021-01-31 is the last day covered.
date_from = datetime.date(year=2021, month=1, day=1)
date_until = datetime.date(year=2021, month=2, day=1)
# Collect at most 20000 tweets for each day in the range.
tweet_list2021 = get_tweets(date_from, date_until, max_tweets=20000)

Day 2021-01-01
0  /  20000
2500  /  20000
5000  /  20000
7500  /  20000
10000  /  20000
12500  /  20000
15000  /  20000
17500  /  20000


In [58]:
# Turn the January rows into a table, labelling the columns with columnNames.
tweet_df2021 = pd.DataFrame(data=tweet_list2021, columns=columnNames)

# Preview the first five rows of the table.
tweet_df2021.head(n=5)

Unnamed: 0,Datetime,Tweet Id,Text,NumReplies,NumRetweets,NumLikes,IDOriginalRetweeted,Username,isVerified
0,2021-01-01 23:59:59+00:00,1345157857575129090,@bitcoin_qr #BTC,0,0,1,,jaydoamongus,False
1,2021-01-01 23:59:58+00:00,1345157850478170112,@coloradotravis @Talk_BTC @grove654 This is so...,0,0,1,,CynicalIndn,False
2,2021-01-01 23:59:58+00:00,1345157850440544258,Crypto adoption in 2021: Top trends and predic...,0,0,1,,WoodooProd,False
3,2021-01-01 23:59:57+00:00,1345157849329098754,@nic__carter @LynAldenContact From Lyn's artic...,1,0,0,,HerfinMN,False
4,2021-01-01 23:59:56+00:00,1345157843721269250,THINGS THAT MAKE YOU GO HMMMMM\n#SEC Will Inau...,0,0,0,,opalessense,False


In [59]:
# Save the January tweets as a comma-separated file, leaving out the DataFrame index.
tweet_df2021.to_csv('tweets_2021_ENERO.csv', sep=',', index=False)