In [34]:
import pandas as pd

from textblob import TextBlob
import re
import string

import numpy as np

from numpy import mean
from numpy import median

from statistics import mode



In [10]:
def preprocess_tweet(tweet):
    # Rimuove i caratteri di nuova riga e a capo
    tweet = tweet.replace("\n", " ").replace("\r", " ")
    
    # Rimuove i link
    tweet = re.sub(r"http\S+", "", tweet)
    
    # Rimuove gli hashtag
    tweet = re.sub(r"#\S+", "", tweet)
    
    # Rimuove le menzioni
    tweet = re.sub(r"@\S+", "", tweet)
    
    # Rimuove i caratteri di punteggiatura
    tweet = tweet.translate(str.maketrans("", "", string.punctuation))
    
    # Converti il testo in minuscolo
    tweet = tweet.lower()
    
    return tweet


In [6]:
df = pd.read_csv('data/bitcoin_tweets.csv', index_col='created_at')
print(df.shape)
df.tail()

(100264, 3)


Unnamed: 0_level_0,tweet_id,text,language
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-24 12:38:24+00:00,1529079365765578752,Bitcoin dives to fill CME gap amid claim new a...,en
2022-05-24 12:38:24+00:00,1529079366650585095,RT @JohanPe44499068: This current week can bec...,en
2022-05-24 12:38:24+00:00,1529079366793039872,RT @DMM_Bitcoin: ／\n毎日参加！その場で当たる！\n🎉 #フォローRTキャ...,ja
2022-05-24 12:38:24+00:00,1529079366889746432,RT @bip_show: ARTE è una TV pubblica Franco-Te...,it
2022-05-24 12:38:24+00:00,1529079367460003840,RT @Blockworks_: Ray Dalio: #Bitcoin is still ...,en


In [7]:
tweets = df['text']

In [11]:
# Crea una copia della colonna 'text' del dataframe
df['text_clean'] = df['text'].copy()

# Sostituisci i testi elaborati nella colonna 'text_clean' del dataframe
df['text_clean'] = df['text_clean'].apply(preprocess_tweet)

In [12]:
df.head()

Unnamed: 0_level_0,tweet_id,text,language,text_clean
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-12-21 12:35:33+00:00,1605542489414602752,Giant Bitcoin Miner Core Scientific Files For ...,en,giant bitcoin miner core scientific files for ...
2022-12-21 12:35:33+00:00,1605542489447747584,#ETC\n\nTradebot Strategies ETC Teknik Analizi...,tr,tradebot strategies etc teknik analizini olu...
2022-12-21 12:35:33+00:00,1605542490643353600,RT @BillyM2k: @elonmusk according to fox busin...,en,rt according to fox business i’m an “anonymo...
2022-12-21 12:35:34+00:00,1605542491251527680,RT @BitcoinII: BitBull is the safest and most ...,en,rt bitbull is the safest and most reliable cr...
2022-12-21 12:35:34+00:00,1605542491406872576,Giant Bitcoin Miner Core Scientific Files For ...,en,giant bitcoin miner core scientific files for ...


In [13]:
def get_sentiment(tweet_text):
    analysis = TextBlob(tweet_text)
    return analysis.sentiment.polarity

In [14]:
df['sentiment'] = df['text_clean'].apply(get_sentiment)

In [16]:
df.tail()

Unnamed: 0_level_0,tweet_id,text,language,text_clean,sentiment
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-05-24 12:38:24+00:00,1529079365765578752,Bitcoin dives to fill CME gap amid claim new a...,en,bitcoin dives to fill cme gap amid claim new a...,0.136364
2022-05-24 12:38:24+00:00,1529079366650585095,RT @JohanPe44499068: This current week can bec...,en,rt this current week can become bullish for b...,0.0
2022-05-24 12:38:24+00:00,1529079366793039872,RT @DMM_Bitcoin: ／\n毎日参加！その場で当たる！\n🎉 #フォローRTキャ...,ja,rt ／ 毎日参加！その場で当たる！ 🎉 🎉 400名様に500円分のデジタルギフトをプ...,0.0
2022-05-24 12:38:24+00:00,1529079366889746432,RT @bip_show: ARTE è una TV pubblica Franco-Te...,it,rt arte è una tv pubblica francotedesca a fin...,0.416667
2022-05-24 12:38:24+00:00,1529079367460003840,RT @Blockworks_: Ray Dalio: #Bitcoin is still ...,en,rt ray dalio is still part of my portfolio,0.0


In [22]:
df = df.reset_index()
df['created_at'] = pd.to_datetime(df['created_at'])

In [23]:
# Crea una nuova colonna 'date' che estrae la data dalla colonna 'created_at'
df['date'] = df['created_at'].dt.date

In [24]:
df.head()

Unnamed: 0,index,created_at,tweet_id,text,language,text_clean,sentiment,date
0,0,2022-12-21 12:35:33+00:00,1605542489414602752,Giant Bitcoin Miner Core Scientific Files For ...,en,giant bitcoin miner core scientific files for ...,0.0,2022-12-21
1,1,2022-12-21 12:35:33+00:00,1605542489447747584,#ETC\n\nTradebot Strategies ETC Teknik Analizi...,tr,tradebot strategies etc teknik analizini olu...,0.0,2022-12-21
2,2,2022-12-21 12:35:33+00:00,1605542490643353600,RT @BillyM2k: @elonmusk according to fox busin...,en,rt according to fox business i’m an “anonymo...,0.0,2022-12-21
3,3,2022-12-21 12:35:34+00:00,1605542491251527680,RT @BitcoinII: BitBull is the safest and most ...,en,rt bitbull is the safest and most reliable cr...,0.5,2022-12-21
4,4,2022-12-21 12:35:34+00:00,1605542491406872576,Giant Bitcoin Miner Core Scientific Files For ...,en,giant bitcoin miner core scientific files for ...,0.0,2022-12-21


In [35]:
# raggruppa i dati per data utilizzando la colonna 'data' come chiave di raggruppamento
grouped_data = df.groupby('date')

# per ogni gruppo di dati, calcola la mediana del sentiment e aggiungi il valore alla lista medie
medie = []
for name, group in grouped_data:
    medie.append(mean(group['sentiment']))

# stampa le medie calcolate
print(medie)


[0.07844298811478614, 0.07671463978408416, 0.053116319816505456, 0.08964688758500029, 0.08136759192129345, 0.06273947033358797, 0.07986098213718863, 0.04869874825224048, 0.08974624994675032, 0.10212446215400751, 0.07824566523530894, 0.10207643711566419, 0.0949485114468558, 0.09717434411226351, 0.059055202251876576, 0.07779988926262395, 0.0849533175672064, 0.09145284859173744, 0.08442521899295032, 0.045943426379156216, 0.039120231178703724, 0.04805293076355854, 0.06352405623587633, 0.07021760940722804, 0.06377590147423133, 0.05664362562027108, 0.06863283551865802, 0.07212946833172397, 0.05818634627244586, 0.07585630319653841, 0.06968248486577007, 0.08128068968011126, 0.030215539928352908, 0.0747785515611602, -0.00022166926880041356, 0.05930925950978031, 0.06130138518697402, 0.04903604141104141, 0.0704195539353434, 0.045749268583036844, 0.0733169268938897, 0.10788274153808372, 0.08971649198754467, 0.0849480894170285, 0.07694170674313239, 0.09410815222878868, 0.11442100682822329, 0.146692

In [59]:
# crea il dizionario con le date come chiavi e i valori medi come valori
dati = {key: value for key, value in zip(grouped_data.groups.keys(), medie)}

# crea il DataFrame utilizzando il dizionario appena creato
df_medie = pd.DataFrame.from_dict(dati, orient='index')

# rinomina la colonna del DataFrame con il nome desiderato
df_medie.rename(columns={0: 'sentiment_medio'}, inplace=True)

# visualizza il DataFrame con i valori medi del sentiment per ogni giorno
df_medie

Unnamed: 0,sentiment_medio
2022-05-24,0.078443
2022-05-25,0.076715
2022-05-26,0.053116
2022-05-27,0.089647
2022-05-28,0.081368
...,...
2022-12-17,0.062978
2022-12-18,0.066380
2022-12-19,0.061157
2022-12-20,0.056468


In [60]:
df_medie = df_medie.reset_index()
df_medie.head()

Unnamed: 0,index,sentiment_medio
0,2022-05-24,0.078443
1,2022-05-25,0.076715
2,2022-05-26,0.053116
3,2022-05-27,0.089647
4,2022-05-28,0.081368


In [61]:
df_medie.rename(columns={'index': 'time'}, inplace=True)
df_medie.head()

Unnamed: 0,time,sentiment_medio
0,2022-05-24,0.078443
1,2022-05-25,0.076715
2,2022-05-26,0.053116
3,2022-05-27,0.089647
4,2022-05-28,0.081368


In [62]:
df_medie = df_medie.iloc[::-1]
df_medie.head()

Unnamed: 0,time,sentiment_medio
211,2022-12-21,0.049291
210,2022-12-20,0.056468
209,2022-12-19,0.061157
208,2022-12-18,0.06638
207,2022-12-17,0.062978


In [63]:
#trasformo la colonna created_at in datetime
df_medie['time'] = pd.to_datetime(df_medie['time'])

In [64]:
df_medie = df_medie.set_index('time')
df_medie.head()

Unnamed: 0_level_0,sentiment_medio
time,Unnamed: 1_level_1
2022-12-21,0.049291
2022-12-20,0.056468
2022-12-19,0.061157
2022-12-18,0.06638
2022-12-17,0.062978


In [65]:
df_medie = df_medie.loc['2022-12-11':'2022-06-01']

In [70]:
df_medie.head()

Unnamed: 0,index,time,sentiment_medio
0,0,2022-12-11,0.057855
1,1,2022-12-10,0.023664
2,2,2022-12-09,0.079547
3,3,2022-12-08,0.098894
4,4,2022-12-07,0.098525


**Importo il dataset dei bitcoin**

In [67]:
df = pd.read_csv('data/bitcoin.csv')
print(df.shape)
df.head()

(194, 8)


Unnamed: 0,time,low,high,open,close,volume,greed_and_fear,trend
0,2022-12-11,17073.19,17271.92,17128.1,17085.21,9948.85267,26,30
1,2022-12-10,17093.42,17227.64,17130.49,17128.1,7860.586876,27,28
2,2022-12-09,17060.69,17352.62,17226.03,17130.59,20976.636999,26,28
3,2022-12-08,16738.0,17300.59,16839.76,17226.01,23533.234537,25,28
4,2022-12-07,16679.52,17140.22,17089.18,16840.0,22635.468488,29,28


In [69]:
# Rendi le date presenti nell'indice del dataframe df_sentiment come colonne
df_medie = df_medie.reset_index()

#trasforma df['time'] in datetime
df['time'] = pd.to_datetime(df['time'])

# Unisci i due dataframe utilizzando la colonna "time" come chiave di unione
df_merged = pd.merge(df, df_medie, on='time')

Unnamed: 0,time,low,high,open,close,volume,greed_and_fear,trend,index,sentiment_medio
0,2022-12-11,17073.19,17271.92,17128.1,17085.21,9948.85267,26,30,0,0.057855
1,2022-12-10,17093.42,17227.64,17130.49,17128.1,7860.586876,27,28,1,0.023664
2,2022-12-09,17060.69,17352.62,17226.03,17130.59,20976.636999,26,28,2,0.079547
3,2022-12-08,16738.0,17300.59,16839.76,17226.01,23533.234537,25,28,3,0.098894
4,2022-12-07,16679.52,17140.22,17089.18,16840.0,22635.468488,29,28,4,0.098525


In [73]:
df_merged = df_merged.set_index('time')
df_merged.head()

Unnamed: 0_level_0,low,high,open,close,volume,greed_and_fear,trend,index,sentiment_medio
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-12-11,17073.19,17271.92,17128.1,17085.21,9948.85267,26,30,0,0.057855
2022-12-10,17093.42,17227.64,17130.49,17128.1,7860.586876,27,28,1,0.023664
2022-12-09,17060.69,17352.62,17226.03,17130.59,20976.636999,26,28,2,0.079547
2022-12-08,16738.0,17300.59,16839.76,17226.01,23533.234537,25,28,3,0.098894
2022-12-07,16679.52,17140.22,17089.18,16840.0,22635.468488,29,28,4,0.098525


In [75]:
df_merged.rename(columns={'trend': 'google_trend'}, inplace=True)
df_merged.head()

Unnamed: 0_level_0,low,high,open,close,volume,greed_and_fear,google_trend,index,sentiment_medio
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-12-11,17073.19,17271.92,17128.1,17085.21,9948.85267,26,30,0,0.057855
2022-12-10,17093.42,17227.64,17130.49,17128.1,7860.586876,27,28,1,0.023664
2022-12-09,17060.69,17352.62,17226.03,17130.59,20976.636999,26,28,2,0.079547
2022-12-08,16738.0,17300.59,16839.76,17226.01,23533.234537,25,28,3,0.098894
2022-12-07,16679.52,17140.22,17089.18,16840.0,22635.468488,29,28,4,0.098525


In [76]:
df = df_merged.drop("index", axis=1)
df.head()

Unnamed: 0_level_0,low,high,open,close,volume,greed_and_fear,google_trend,sentiment_medio
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-12-11,17073.19,17271.92,17128.1,17085.21,9948.85267,26,30,0.057855
2022-12-10,17093.42,17227.64,17130.49,17128.1,7860.586876,27,28,0.023664
2022-12-09,17060.69,17352.62,17226.03,17130.59,20976.636999,26,28,0.079547
2022-12-08,16738.0,17300.59,16839.76,17226.01,23533.234537,25,28,0.098894
2022-12-07,16679.52,17140.22,17089.18,16840.0,22635.468488,29,28,0.098525


**salvo il dataset finale nel csv originale**

In [77]:
df.to_csv('data/bitcoin.csv')