# Tweet Scraper

Este es el notebook que usaremos para el scrapeo de datos


## Librerías

Para el scrapeo usaremos las siguientes librerías:
* snscrape (módulo `snscrape.modules.twitter`)
* pandas

In [None]:
import datetime
from datetime import timedelta

import snscrape.modules.twitter as snstwitter

import pandas as pd
import seaborn as sns

from tqdm import tqdm

from src.JATS import JATS

## Parámetros



In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from JATS.src.JATS.analyzer import Analyzer

# Sentiment analyzer instance reused by the timing cells that follow.
a = Analyzer()
# English stop words held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

In [None]:
# Sample sentence used to eyeball the stop-word filter: split on whitespace,
# drop every token found in stop_words, and print what is left.
text = "Hi, today I hate bitcoin and all their products. Call me Mark Tyson"
print(" ".join(w for w in text.split() if w not in stop_words))

In [None]:
%%timeit text = "Hi, today I hate bitcoin and all their products"
# Timed body: remove stop words first, then score the shortened sentence.
kept_words = [w for w in text.split() if w not in stop_words]
filtered_sentence = " ".join(kept_words)

a.get_sentiment(filtered_sentence)

In [None]:
%%timeit text = "Hi, today I hate bitcoin and all their products"
# Timed body: score the raw sentence with no stop-word filtering, as the
# counterpart to the cell that filters before calling get_sentiment.
a.get_sentiment(text)

In [None]:
# Search expression handed to the scraper.
query = '"BTC" OR "bitcoin"'

# Scraping window.
# NOTE(review): whether date_until is inclusive is decided by JATS.get_tweets — confirm.
date_from = datetime.date(year=2018, month=4, day=1)
date_until = datetime.date(year=2018, month=4, day=30)

tweet_list = JATS.get_tweets(query, date_from, date_until, verbose=True)

In [None]:
# Column labels applied, in order, to the scraped tweet records.
columnNames = [
    'Datetime', 'Tweet Id', 'Text',
    'NumReplies', 'NumRetweets', 'NumLikes',
    'IDOriginalRetweeted', 'Username', 'isVerified',
]
tweet_df = pd.DataFrame(data=tweet_list, columns=columnNames)

### Lectura de ficheros ya existentes

In [None]:
# Load a previously saved ';'-separated tweet dump and convert its 'Datetime'
# column to pandas datetimes. This rebinds tweet_df.
file_name = "data/tweets/2018-03-02/2018-04-03/tweet_list.csv"
tweet_df = pd.read_csv(file_name, sep=';').assign(
    Datetime=lambda frame: pd.to_datetime(frame["Datetime"])
)

## Análisis de sentimientos

### Preparación del análisis

Si es nuestra primera ejecución, deberemos instalar un conjunto de datasets útiles para *nltk*.

In [None]:
from JATS.src.JATS.analyzer import Analyzer

## Análisis de sentimiento

Eliminaremos los valores nulos (sentimiento igual a 0), ya que parece que, cuando el algoritmo no es capaz de determinar el sentimiento, tiende a asignarle un 0, lo que desvía el resultado respecto al sentimiento real.

Lo primero que haremos será mostrar la **media del sentimiento** y una **gráfica de distribución del sentimiento**.

In [None]:
# Create the Analyzer used for the sentiment pass (rebinds the notebook-level name `a`).
a = Analyzer()

In [None]:
# Run the analyzer over tweet_df; the second argument is a path and, per the
# inline note, the result is saved to a CSV.
# NOTE(review): this path (2018-04-01/2018-04-02) differs from the folder the
# tweets were read from (2018-03-02/2018-04-03) — confirm it is intended.
a.analyze(tweet_df, "data/tweets/2018-04-01/2018-04-02") # Saved to a CSV

## Análisis de similitudes

Tenemos que comprobar la existencia de tweets similares para evitar el SPAM formado por mensajes que no son completamente idénticos.

Para ello haremos uso de la métrica de similitud del coseno (Cosine Similarity) y después aplicaremos DBSCAN para asignar clusters a esos tweets.

In [None]:
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

def get_cosine_similarity(cleaned_texts):
    """Return the pairwise cosine-similarity matrix of a collection of texts.

    Each text is turned into a bag-of-words count vector with a freshly
    fitted ``CountVectorizer``; entry ``[i, j]`` of the result is the cosine
    similarity between the count vectors of text ``i`` and text ``j``.

    Args:
        cleaned_texts: Iterable of strings (for example a pandas Series of
            tweet texts). It must not contain missing values.

    Returns:
        A dense square array with one row and one column per input text.
    """
    # Keep the document-term matrix sparse: cosine_similarity accepts sparse
    # input directly, so materialising an (n_texts x vocabulary) dense array
    # first only costs memory.
    term_counts = CountVectorizer().fit_transform(cleaned_texts)
    return cosine_similarity(term_counts)

In [None]:

# Missing-value count per column before cleaning.
print(tweet_df.isna().sum())
# Discard the rows whose 'Text' is missing; the other columns may keep nulls.
tweet_df = tweet_df.dropna(subset=['Text'])
# Same report after cleaning.
print(tweet_df.isna().sum())

# Pairwise similarity between all remaining tweet texts.
csim = get_cosine_similarity(tweet_df['Text'])

In [None]:
from sklearn.cluster import DBSCAN
import numpy as np
# NOTE(review): DBSCAN is fitted with its default metric, so each row of the
# similarity matrix is treated as a feature vector rather than as precomputed
# distances — confirm this is intended and that eps=1.04 was tuned for it.
# With min_samples=1 every sample is a core point, so no sample is labelled noise.
clustering = DBSCAN(eps=1.04, min_samples=1).fit(csim)
print(type(clustering.labels_))

# Cluster ids (first row) and the number of tweets in each (second row).
unique_elements, counts_elements = np.unique(clustering.labels_, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
print(csim)

In [None]:
# Attach each tweet's DBSCAN cluster label as a new column.
tweet_df['Prediction'] = clustering.labels_.tolist()
# Preview the tweets whose cluster label is below 1.
tweet_df.loc[tweet_df['Prediction'] < 1].head()

In [None]:
import pandas as pd

# Empty frame that only declares a two-column schema.
column = ["DateTime", "Cosa"]
df = pd.DataFrame(data=None, columns=column)

In [None]:
df.count().reset_index()

In [None]:
class Foo:
    """Root of a toy hierarchy used to experiment with ``__subclasses__``."""


class Bar(Foo):
    """First direct subclass of ``Foo``."""


class Bar2(Foo):
    """Second direct subclass of ``Foo``."""


# Rebinds the name ``Bar``: the new class derives from the previous ``Bar``,
# so it is a grandchild of ``Foo`` and not one of its direct subclasses.
class Bar(Bar):
    """Subclass of the earlier ``Bar``; shadows that name from here on."""

In [None]:
Foo.__subclasses__()

In [None]:
[ q.__name__ for q in Foo.__subclasses__()]


In [None]:
[ q() for q in Foo.__subclasses__() if q.__name__ == "Bar"]

In [None]:
import pandas as pd


# Small sample frame: three integer columns plus one column of numeric-looking strings.
d = {
    'col1': [1, 2, 1, 2, 4, 7, 1],
    'col2': [1, 2, 1, 1, 4, 1, 1],
    'col3': [11, 32, 41, 14, 4, 4, 18],
    'col5': ['11', '32', '41', '14', '4', '4', '18'],
}
df = pd.DataFrame(d)

In [None]:
# Columns that survive a per-'col1' mean. numeric_only=True skips the string
# column 'col5' explicitly; without it, recent pandas versions raise a
# TypeError when asked to average strings.
df.groupby('col1', as_index=False).mean(numeric_only=True).columns



## Separar los datos de bitcoin día a día

In [5]:
import os
from pathlib import Path
import pandas as pd
import datetime

In [10]:
# Load the raw bitcoin price file (comma-separated, pandas' default delimiter).
df = pd.read_csv("bitcoinData1M.csv")

In [12]:
# Interpret the raw 'Timestamp' values as Unix epoch seconds.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

In [15]:
# Discard the price and volume columns that are dropped here by name.
df = df.drop(
    columns=['Open', 'High', 'Low', 'Weighted_Price', 'Volume_(BTC)', 'Volume_(Currency)']
)

In [19]:
df.head()

Unnamed: 0,Timestamp,Close
0,2011-12-31 07:52:00,4.39
1,2011-12-31 07:53:00,0.0
2,2011-12-31 07:54:00,0.0
3,2011-12-31 07:55:00,0.0
4,2011-12-31 07:56:00,0.0


In [11]:
# NOTE(review): this cell reads a lowercase 'timestamp' column, while the
# bitcoin cells earlier in this section build the frame with 'Timestamp' —
# confirm which frame it is meant to run against.
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Floor every timestamp to the start of its day and use that as the index.
df["timestamp_round"] = df["timestamp"].dt.floor("d")
df = df.set_index("timestamp_round")

In [12]:
# Keep only these three columns.
# NOTE(review): 'close' and 'volume' are lowercase here, but the bitcoin frame
# prepared earlier in this section has 'Close' and had its volume columns
# dropped — confirm the source frame.
df = df[["timestamp", "close", "volume"]]

In [25]:
# Normalise the timestamp column to the lowercase name that the following cells
# read (df['timestamp'] when flooring to the day and when exporting per-day
# files). rename ignores missing labels, so this is a no-op when the column is
# already called 'timestamp'.
df = df.rename(columns={'Timestamp': 'timestamp'})


In [43]:
# Add the day-floored timestamp as 'round_datetime' and make it the index, so
# that all rows from the same calendar day share one index value.
df = df.assign(round_datetime=df['timestamp'].dt.floor('D')).set_index('round_datetime')

In [44]:
df.head()

Unnamed: 0_level_0,timestamp,Close
round_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-12-31,2011-12-31 07:52:00,4.39
2011-12-31,2011-12-31 07:53:00,0.0
2011-12-31,2011-12-31 07:54:00,0.0
2011-12-31,2011-12-31 07:55:00,0.0
2011-12-31,2011-12-31 07:56:00,0.0


In [None]:
# Split the bitcoin frame into one ';'-separated CSV per index value, written
# to <base_dir_btc>/<YYYY-MM-DD>/bitcoin.csv.
base_dir_btc = "JABA/data/bitcoin"

# Grouping on the index yields one sub-frame per distinct index value; this
# assumes the index holds the day-floored timestamp.
for i, (_, day_df) in enumerate(df.groupby(df.index), start=1):
    # The folder name is taken from the first row's own timestamp.
    datee = str(day_df.iloc[0]["timestamp"].to_pydatetime().date())
    path = os.path.join(base_dir_btc, datee)
    file_name = os.path.join(path, "bitcoin.csv")
    Path(path).mkdir(parents=True, exist_ok=True)
    # Progress report every 100 groups.
    if i % 100 == 0:
        print(file_name)
        print(f"Paso {i%365} del año {i//365}")
    day_df.to_csv(file_name, sep=";")

In [19]:
df = pd.read_csv("JABA/data/tweets/2021-05-13/tweet_list.csv", sep=";")

In [20]:
# Persist the loaded tweets as a gzip-compressed parquet file.
df.to_parquet('df2.parquet.gzip', compression='gzip')

In [23]:
%%timeit
pd.read_parquet('df2.parquet.gzip')  

257 ms ± 7.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit
df = pd.read_csv("JABA/data/tweets/2021-05-13/tweet_list.csv", sep=";")

524 ms ± 4.96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# NOTE(review): this reads 'df.parquet.gzip', whereas the parquet file written
# in this notebook is 'df2.parquet.gzip', and the shapes shown below differ
# (102660 vs 208080 rows) — confirm which file is intended.
data = pd.read_parquet('df.parquet.gzip')  

In [17]:
data.shape

(102660, 9)

In [18]:
df.shape

(208080, 9)

In [8]:
import pandas as pd

# Load one day of scraped tweets and display the 50 with the most likes,
# restricted to the text, the author and the engagement counters.
df = pd.read_csv("JABA/data/tweets/2018-01-11/tweet_list.csv", sep=";")
(
    df.sort_values(by='NumLikes', ascending=False)
    [["Text", "Username", "NumLikes", "NumRetweets", "NumReplies"]]
    .head(50)
)

Unnamed: 0,Text,Username,NumLikes,NumRetweets,NumReplies
6554,"kfc canada presents the bitcoin bucket. sure, ...",kfc_canada,6603,3505,575
824,"dear noobs, bitcoin is not naturally going dow...",ProfesorCrypto,2486,1669,147
2795,kodak makes a scam miner. kfc makes a bitcoin ...,CryptoCobain,2230,390,128
21854,bitcoin conference won't accept bitcoin becaus...,paulkrugman,1681,702,173
19530,"the facts: as measured in usd, btc break was h...",PeterLBrandt,1299,510,78
12380,you buy btc to buy alts to make more btc to bu...,bonzocorleonee,1260,399,61
20410,"keep in mind, buffett has never created or inv...",maxkeiser,1130,349,69
17325,btc is officially looking sexy. i've moved the...,parabolictrav,852,295,89
6037,finally a a crypto to rival bitcoin,pigdotavi,832,111,7
23462,i still have half a bitcoin and i will give it...,ashleyfeinberg,817,120,10


In [3]:
# Keep only the 50 most-liked tweets. This rebinds df, so the full frame is no
# longer reachable under that name.
df = df.sort_values(by='NumLikes', ascending=False).iloc[:50]

In [4]:
df.head(n=50)

Unnamed: 0,Datetime,Tweet Id,Text,NumReplies,NumRetweets,NumLikes,IDOriginalRetweeted,Username,isVerified
100155,2021-05-13 12:08:33+00:00,1392814015181635584,MicroStrategy has purchased an additional 271 ...,2667,8442,42354,,michael_saylor,True
124477,2021-05-13 09:34:38+00:00,1392775281631391744,ELON VS. BITCOIN,729,2921,24010,,beeple,True
195706,2021-05-13 00:38:21+00:00,1392640323721912322,We’re supposed to believe Elon JUST realized m...,467,1614,16794,,andrewschulz,True
5045,2021-05-13 23:16:37+00:00,1392982140929855492,Fuck Elon Musk for fucking up the crypto marke...,289,870,15966,,OrdinaryGamers,False
171732,2021-05-13 02:48:57+00:00,1392673187222065152,bitcoin does not derive its value from Tesla o...,746,1674,14954,,TheCryptoLark,False
25810,2021-05-13 20:46:21+00:00,1392944325995212813,Bitcoin offers 8 billion people a superior te...,699,2492,13419,,michael_saylor,True
28187,2021-05-13 20:26:51+00:00,1392939420169515008,Can confirm Bitcoin uses a lot of energy. I’ve...,293,1193,12841,,coffeencrypt0,False
37286,2021-05-13 19:10:47+00:00,1392920275184988161,The estimated electricity consumption per YT...,487,1555,12514,,michael_saylor,True
187678,2021-05-13 01:11:28+00:00,1392648654805082113,I refuse to believe one of the smartest people...,845,1081,12200,,andreijikh,False
90977,2021-05-13 13:03:31+00:00,1392827848654721027,Laser eyes channel action even as they protect...,406,1103,11853,,michael_saylor,True
