In [1]:
import pandas as pd
import datetime
import numpy as np
import bs4
import requests
import os
import datetime as dt
import re

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
def getting_longtext(url, timeout=30):
    """Download a page and return the text content of its ``<body>``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may wait indefinitely on an
        unresponsive host, which would stall a whole ``progress_apply`` run.

    Returns
    -------
    str
        The stripped text fragments of the page body joined by single
        spaces, or ``""`` when the page could not be fetched or parsed.
    """
    try:
        response = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout
        )
        soup = bs4.BeautifulSoup(response.text, 'lxml')
        # "\r" keeps the progress on one console line: each URL overwrites the last.
        print(url, end="\r", flush=True)
        return soup.body.get_text(' ', strip=True)
    except Exception:
        # Best-effort scraping: network errors, timeouts and pages without a
        # <body> (``soup.body`` is None) all yield an empty text. Catching
        # ``Exception`` instead of using a bare ``except`` keeps Ctrl-C /
        # kernel interrupts working during long runs.
        return ""

In [4]:
def longtext_parser(text, news_portal=None):
    """Cut the article body out of the full text of a scraped page.

    The input is converted to ``str`` and lower-cased. The result is the part
    that follows the first occurrence of the portal's start marker, truncated
    at the next occurrence of that start marker (if any) and at the first
    occurrence of the end marker.

    Parameters
    ----------
    text : object
        Scraped page text; non-string values are passed through ``str``.
    news_portal : str, optional
        ``"coindesk"`` selects the CoinDesk markers; any other value
        (including the default ``None``) selects the
        ``"markets pro"`` / ``"related news"`` markers.

    Returns
    -------
    str
        The extracted lower-cased article text, or the whole lower-cased
        text when the start marker does not occur.
    """
    if news_portal == "coindesk":
        start_marker = "% crypto prices coindesk"
        end_marker = "view all prices sign up for crypto long & short"
    else:
        start_marker = "markets pro"
        end_marker = "related news"

    lowered = str(text).lower()
    pieces = lowered.split(start_marker)
    if len(pieces) < 2:
        # Start marker absent: nothing to cut, keep the lower-cased text.
        return lowered
    # A missing end marker is harmless: split() then returns the piece unchanged.
    return pieces[1].split(end_marker)[0]

In [5]:
def merge_csv_files(folder_path, date=None):
    """Concatenate the non-empty CSV files of a folder into one DataFrame.

    Parameters
    ----------
    folder_path : str
        Directory scanned (non-recursively) for files ending in ``.csv``.
    date : str, optional
        When given, only files whose *name* contains this substring are read,
        and the ``"Unnamed: 0"`` column is removed from the result if it is
        present. When ``None`` every CSV file is read and no column is
        removed (kept that way so existing callers see the same columns).

    Returns
    -------
    pandas.DataFrame
        Rows of all readable files with a fresh ``0..n-1`` index. An empty
        DataFrame is returned when no file could be read.

    Notes
    -----
    Zero-byte files are skipped. Files that pandas fails to parse are
    skipped as well; their count is printed before returning.
    """
    # Sorted so that the row order does not depend on the directory listing order.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if date is not None:
        csv_files = [f for f in csv_files if date in f]

    frames = []
    bad_files = []
    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        if os.path.getsize(file_path) == 0:
            # Skip the empty file only; the remaining files must still be read.
            continue
        print(file_path, end="\r", flush=True)
        try:
            frames.append(pd.read_csv(file_path, encoding="utf-8", sep=","))
        except Exception:
            bad_files.append(file_path)

    # One concat at the end instead of re-copying the growing frame per file.
    merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if date is not None:
        # errors="ignore": the column is absent when nothing was read.
        merged_df = merged_df.drop(columns="Unnamed: 0", errors="ignore")

    print(len(bad_files))
    return merged_df

### Flair Sentiment Analyser Module
Details: https://github.com/flairNLP/flair/blob/master/resources/docs/TUTORIAL_TAGGING_SENTIMENT.md

In [6]:
from flair.models import TextClassifier
from flair.data import Sentence

In [7]:
## Setting max rows, widths, columns in this Notebook
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)

In [8]:
## Load text classifier model (flair)
classifier = TextClassifier.load('sentiment')

In [9]:
def flair_sentiment(text):
    """Score ``text`` with the notebook-level flair ``classifier``.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    float
        Score of the first predicted label rounded to four decimals, negated
        when that label's value is ``"NEGATIVE"``. ``np.nan`` is returned
        when the text cannot be scored (e.g. no label was produced).
    """
    try:
        sentence = Sentence(text)
        classifier.predict(sentence)
        top_label = sentence.labels[0]
        score = np.round(top_label.score, 4)
        return -score if top_label.value.upper() == "NEGATIVE" else score
    except Exception:
        # Best-effort per-row scoring; ``Exception`` (not a bare ``except``)
        # so that a kernel interrupt still stops a long ``progress_apply``.
        return np.nan

In [10]:
#Example
flair_sentiment("The price of ether has fallen 30% in the last week")

-0.9999

-0.9999

### TextBlob
Details: https://textblob.readthedocs.io/en/dev/quickstart.html

In [11]:
from textblob import TextBlob

In [12]:
def textblob_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity``, or ``np.nan`` when TextBlob
        cannot process the value.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # Best-effort per-row scoring; ``Exception`` (not a bare ``except``)
        # so that a kernel interrupt still stops a long ``progress_apply``.
        return np.nan

In [13]:
#Example
textblob_sentiment("The price of ether has fallen 30% in the last week")

0.0

0.0

### Vader Sentiment module
Details: https://github.com/cjhutto/vaderSentiment

In [14]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [15]:
def vader_sentiment(sentence):
    """Return VADER's compound polarity score for ``sentence``.

    The ``SentimentIntensityAnalyzer`` is created on the first call and then
    reused. This function is applied row by row to large frames, so building
    a new analyzer (and repeating its constructor's set-up work) for every
    row would be wasted effort.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    float
        The ``"compound"`` entry of the dictionary returned by
        ``polarity_scores`` (which also holds pos, neg and neu scores).
    """
    # The analyzer is cached on the function object so no extra module-level
    # name is needed; re-running this cell simply resets the cache.
    analyzer = getattr(vader_sentiment, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        vader_sentiment._analyzer = analyzer
    return analyzer.polarity_scores(sentence)["compound"]


In [16]:
#Example
vader_sentiment("The price of ether has fallen 30% in the last week")

-0.3612

-0.3612

In [None]:
df_coindesk = pd.read_excel("../src/section2/eth_news/eth_results_coindesk.xlsx")

In [None]:
# Fixed seed so the 100-row comparison sample is the same on every run.
df_coindesk = df_coindesk.sample(100, random_state=42)

### Long text test

In [None]:
df_coindesk["sentiment_textblob"] = df_coindesk.long_text.progress_apply(lambda x: flair_sentiment(x))

In [None]:
df_coindesk["sentiment_flair"] = df_coindesk.long_text.progress_apply(lambda x: textblob_sentiment(x))

In [None]:
df_coindesk["sentiment_vader"] = df_coindesk.long_text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_coindesk[["headline", "sentiment_textblob", "sentiment_flair", "sentiment_vader"]].sort_values(by="sentiment_textblob")

In [None]:
df_coindesk[["headline", "sentiment_textblob", "sentiment_flair", "sentiment_vader"]].sort_values(by="sentiment_vader").to_csv("headline_test.csv")

# Findings
### TextBlob worked best with `long_text`

### Headline test

In [None]:
df_coindesk["sentiment_textblob"] = df_coindesk.headline.progress_apply(lambda x: flair_sentiment(x))

In [None]:
df_coindesk["sentiment_flair"] = df_coindesk.headline.progress_apply(lambda x: textblob_sentiment(x))

In [None]:
df_coindesk["sentiment_vader"] = df_coindesk.headline.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_coindesk[["headline", "sentiment_textblob", "sentiment_flair", "sentiment_vader"]].sort_values(by="sentiment_vader")

## ETH news (ETH related)

### - Coindesk (biggest news site)

### - Cointelegraph

### - Google News

In [None]:
## Reading excel files
df_coindesk = pd.read_excel("../src/section2/eth_news/eth_results_coindesk.xlsx")
df_cointelegraph = pd.read_excel("../src/section2/eth_news/eth_results_cointelegraph.xlsx")
df_googlenews = pd.read_excel("../src/section2/google_news/ethereum_result.xlsx")

In [None]:
## Apply getting longtext function on datasets
#df_coindesk.long_text = df_coindesk.link.progress_apply(lambda x: getting_longtext("https://www.coindesk.com"+ x))
#df_cointelegraph.long_text = df_cointelegraph.link.progress_apply(lambda x: getting_longtext("https://www.cointelegraph.com"+ x))
#df_googlenews["long_text"] = df_googlenews.link.progress_apply(lambda x: getting_longtext(x))

In [None]:
df_coindesk.head()

In [None]:
df_cointelegraph.head()

In [None]:
df_googlenews.head()

In [None]:
#df_coindesk.long_text = df_coindesk.long_text.progress_apply(lambda x: longtext_parser(x, news_portal="coindesk"))
#df_cointelegraph.long_text = df_cointelegraph.long_text.progress_apply(lambda x: longtext_parser(x, news_portal="cointelegraph"))

In [None]:
## Score the headline column of every news frame with VADER
for news_frame, headline_column in (
    (df_coindesk, "headline"),
    (df_cointelegraph, "headline"),
    (df_googlenews, "post_header"),
):
    # The result is stored next to the source data as `final_sentiment`.
    news_frame["final_sentiment"] = news_frame[headline_column].progress_apply(vader_sentiment)

In [None]:
print(f"Length of Coindesk dataframe: {len(df_coindesk)}")
print(f"Length of CoinTelegraph dataframe: {len(df_cointelegraph)}")
print(f"Length of Google News dataframe: {len(df_googlenews)}")

In [None]:
df_coindesk.to_csv("../src/section2/final_dataframes/df_coindesk.csv", index=False)
df_cointelegraph.to_csv("../src/section2/final_dataframes/df_cointelegraph.csv", index=False)
df_googlenews.to_csv("../src/section2/final_dataframes/df_googlenews.csv", index=False)

## Google Trends

In [None]:
df_googletrends = pd.read_csv("src/section2/google_trends/google_trends.csv")

In [None]:
df_googletrends.rename(columns={"Hónap": "month", "Ethereum: (Világszerte)": "ethereum_search_trend"}, inplace=True)

In [None]:
df_googletrends.to_csv("src/section2/final_dataframes/df_google_trends.csv", index=False)

## Reddit comments (ETH related)

In [None]:
folder_path = '../src/section2/reddit/scrapes/comments'
df_reddit = merge_csv_files(folder_path)

In [None]:
df_reddit.shape

In [None]:
df_reddit.body = df_reddit.body.progress_apply(lambda x: x.replace("&gt;", ""))
df_reddit.body = df_reddit.body.progress_apply(lambda x: x.replace("\n", ""))
df_reddit.body = df_reddit.body.progress_apply(lambda x: x.replace("\n;", ""))
df_reddit.body = df_reddit.body.progress_apply(lambda x: x.replace("\t", ""))
df_reddit.body = df_reddit.body.progress_apply(lambda x: x.replace("--------", ""))

In [None]:
df_reddit.body = df_reddit.body.progress_apply(lambda x: re.sub('\s+',' ', x))

In [None]:
df_reddit = df_reddit[["created_utc", "subreddit", "body", "score","stickied"]]

In [None]:
df_reddit.head()

In [None]:
df_reddit["created_date"] = df_reddit.created_utc.progress_apply(lambda x: datetime.datetime.utcfromtimestamp(x))

In [None]:
df_reddit.sort_values(by="created_date", inplace=True)

In [None]:
df_reddit.drop(columns=["created_utc"], inplace=True)

In [None]:
df_reddit.head()

In [None]:
df_reddit.created_date = df_reddit.created_date.progress_apply(lambda x: x.date())

In [None]:
df_reddit = df_reddit[["created_date"]+[i for i in df_reddit if i != "created_date"]]

In [None]:
df_reddit["final_sentiment"] = df_reddit.body.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_reddit.set_index("created_date", drop=True, inplace=True)

In [None]:
# NOTE(review): the next cell replaces this index with
# `pd.to_datetime(df_reddit.index)`, so a freq set on the current index object
# presumably does not carry over — confirm this line is still needed.
df_reddit.index.freq="D"

In [None]:
df_reddit.index = pd.to_datetime(df_reddit.index)

In [None]:
daily_total_scores = df_reddit.copy()
daily_total_scores.score = abs(daily_total_scores.score)

In [None]:
daily_total_scores = pd.DataFrame(daily_total_scores.groupby(daily_total_scores.index)["score"].sum())

In [None]:
daily_total_scores.columns=["total_score"]

In [None]:
df_reddit = pd.merge(df_reddit, daily_total_scores, how="left", left_index=True, right_index=True)

In [None]:
#df_reddit.loc[pd.to_datetime("2017-11-29")]

In [None]:
### Weight by daily total score of reddit comments
# Each comment's sentiment is scaled by its share of the day's total score.
# NOTE(review): `total_score` was summed from abs(score), while `score` here
# is still signed, so a comment with a negative score flips the sign of its
# sentiment — confirm this is intended.
# NOTE(review): a day whose scores are all 0 has total_score == 0, which makes
# this division produce NaN — confirm how such days should be handled.
df_reddit['final_sentiment'] = (df_reddit["score"] / df_reddit["total_score"]) * df_reddit["final_sentiment"]

In [None]:
df_reddit['final_sentiment'].plot()

In [None]:
df_reddit.to_csv("../src/section2/final_dataframes/df_reddit.csv", index=True)

## Twitter tweets (about Ethereum or ETH or ether)

### 2017

In [None]:
folder_path = '../src/section2/twitter/scrapes'
df_twitter17 = merge_csv_files(folder_path, date="2017")

In [None]:
df_twitter17["date"] = df_twitter17.datetime.progress_apply(lambda x: x.split(" ")[0])

In [None]:
df_twitter17.set_index("date", inplace=True)

In [None]:
df_twitter17.index = pd.to_datetime(df_twitter17.index)

In [None]:
df_twitter17.index

In [None]:
df_twitter17.head()

In [None]:
df_twitter17.info()

In [None]:
df_twitter17.shape

In [None]:
df_twitter17.iloc[0].text

In [None]:
# Collapse whitespace runs; raw string because "\s" is an invalid escape
# sequence in a normal string literal (warned about by recent Python versions).
df_twitter17.text = df_twitter17.text.progress_apply(lambda x: re.sub(r'\s+', ' ', x))

In [None]:
df_twitter17["final_sentiment_test"] = df_twitter17.text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_twitter17.head(50)

In [None]:
df_twitter17.to_csv("twitter_data/twitter_2017.csv", index=True)

In [None]:
del df_twitter17

### 2018

In [None]:
folder_path = '../src/section2/twitter/scrapes'
df_twitter18 = merge_csv_files(folder_path, date="2018")

In [None]:
df_twitter18.shape

In [None]:
df_twitter18["date"] = df_twitter18.datetime.progress_apply(lambda x: x.split(" ")[0])

In [None]:
df_twitter18.set_index("date", inplace=True)

In [None]:
df_twitter18.index = pd.to_datetime(df_twitter18.index)

In [None]:
df_twitter18.index

In [None]:
df_twitter18.head()

In [None]:
df_twitter18.info()

In [None]:
df_twitter18.shape

In [None]:
df_twitter18.iloc[0].text

In [None]:
# Collapse whitespace runs; raw string because "\s" is an invalid escape
# sequence in a normal string literal (warned about by recent Python versions).
df_twitter18.text = df_twitter18.text.progress_apply(lambda x: re.sub(r'\s+', ' ', x))

In [None]:
df_twitter18["final_sentiment_test"] = df_twitter18.text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_twitter18.head(50)

In [None]:
df_twitter18.to_csv("twitter_data/twitter_2018.csv", index=True)

In [None]:
del df_twitter18

### 2019

In [None]:
folder_path = '../src/section2/twitter/scrapes'
df_twitter19 = merge_csv_files(folder_path, date="2019")

In [None]:
df_twitter19.shape

In [None]:
df_twitter19["date"] = df_twitter19.datetime.progress_apply(lambda x: x.split(" ")[0])

In [None]:
df_twitter19.set_index("date", inplace=True)

In [None]:
df_twitter19.index = pd.to_datetime(df_twitter19.index)

In [None]:
df_twitter19.index

In [None]:
df_twitter19.head()

In [None]:
df_twitter19.info()

In [None]:
df_twitter19.shape

In [None]:
df_twitter19.iloc[0].text

In [None]:
# Collapse whitespace runs; raw string because "\s" is an invalid escape
# sequence in a normal string literal (warned about by recent Python versions).
df_twitter19.text = df_twitter19.text.progress_apply(lambda x: re.sub(r'\s+', ' ', x))

In [None]:
df_twitter19["final_sentiment_test"] = df_twitter19.text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_twitter19.head(50)

In [None]:
df_twitter19.to_csv("twitter_data/twitter_2019.csv", index=True)

In [None]:
del df_twitter19

### 2020

In [None]:
folder_path = '../src/section2/twitter/scrapes'
df_twitter20 = merge_csv_files(folder_path, date="2020")

In [None]:
df_twitter20.shape

In [None]:
df_twitter20["date"] = df_twitter20.datetime.progress_apply(lambda x: x.split(" ")[0])

In [None]:
df_twitter20.set_index("date", inplace=True)

In [None]:
df_twitter20.index = pd.to_datetime(df_twitter20.index)

In [None]:
df_twitter20.index

In [None]:
df_twitter20.head()

In [None]:
df_twitter20.info()

In [None]:
df_twitter20.shape

In [None]:
df_twitter20.iloc[0].text

In [None]:
# Collapse whitespace runs; raw string because "\s" is an invalid escape
# sequence in a normal string literal (warned about by recent Python versions).
df_twitter20.text = df_twitter20.text.progress_apply(lambda x: re.sub(r'\s+', ' ', x))

In [None]:
df_twitter20["final_sentiment_test"] = df_twitter20.text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_twitter20.head(50)

In [None]:
df_twitter20.to_csv("twitter_data/twitter_2020.csv", index=True)

In [None]:
del df_twitter20

### 2021

In [None]:
folder_path = '../src/section2/twitter/scrapes'
df_twitter21 = merge_csv_files(folder_path, date="2021")

In [None]:
df_twitter21.shape

In [None]:
df_twitter21["date"] = df_twitter21.datetime.progress_apply(lambda x: x.split(" ")[0])

In [None]:
df_twitter21.set_index("date", inplace=True)

In [None]:
df_twitter21.index = pd.to_datetime(df_twitter21.index)

In [None]:
df_twitter21.index

In [None]:
df_twitter21.head()

In [None]:
df_twitter21.info()

In [None]:
df_twitter21.shape

In [None]:
df_twitter21.iloc[0].text

In [None]:
# Collapse whitespace runs; raw string because "\s" is an invalid escape
# sequence in a normal string literal (warned about by recent Python versions).
df_twitter21.text = df_twitter21.text.progress_apply(lambda x: re.sub(r'\s+', ' ', x))

In [None]:
df_twitter21["final_sentiment_test"] = df_twitter21.text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_twitter21.head(50)

In [None]:
df_twitter21.to_csv("twitter_data/twitter_2021.csv", index=True)

In [None]:
del df_twitter21

### 2022

In [None]:
folder_path = '../src/section2/twitter/scrapes'
df_twitter22 = merge_csv_files(folder_path, date="2022")

In [None]:
df_twitter22.shape

In [None]:
df_twitter22["date"] = df_twitter22.datetime.progress_apply(lambda x: x.split(" ")[0])

In [None]:
df_twitter22.set_index("date", inplace=True)

In [None]:
df_twitter22.index = pd.to_datetime(df_twitter22.index)

In [None]:
df_twitter22.index

In [None]:
df_twitter22.head()

In [None]:
df_twitter22.info()

In [None]:
df_twitter22.shape

In [None]:
df_twitter22.iloc[0].text

In [None]:
# Collapse whitespace runs; raw string because "\s" is an invalid escape
# sequence in a normal string literal (warned about by recent Python versions).
df_twitter22.text = df_twitter22.text.progress_apply(lambda x: re.sub(r'\s+', ' ', x))

In [None]:
df_twitter22["final_sentiment_test"] = df_twitter22.text.progress_apply(lambda x: vader_sentiment(x))

In [None]:
df_twitter22.head(50)

In [None]:
df_twitter22.to_csv("twitter_data/twitter_2022.csv", index=True)

In [None]:
del df_twitter22