# Highlight

The results in this notebook are based on the application of a simple sentiment analysis method and common ML models. This approach is performed on a small dataset of crypto-related Reddit posts during 2022-01-10 to 2022-02-08. Addresing the obtained results the following points can be highlighted:

- We need to make sure about compability of the sentiment analasis ??? with the case.
- We need to elaborate our prediction modles using macro data.
- We need to improve the  ??? or data.
- We need to extract our social media dataset particularly for each cryptocurrency.


# Importing Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import (LogisticRegression, RidgeClassifier, SGDClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, recall_score, confusion_matrix, f1_score, 
                             roc_auc_score, classification_report)


from sklearn.dummy import DummyClassifier

from sklearn.pipeline import Pipeline


# Avoiding warnings
import warnings
import os
########### Prevent Warnings ###########
warnings.filterwarnings(action='ignore')
########### Prevent Warnings ###########

import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import timedelta
from datetime import datetime
from sklearn.preprocessing import StandardScaler, Normalizer

# Importing CSV Files

In [None]:
social_df = pd.read_csv('SocialData.csv')
btc_df = pd.read_csv('BTC-Market Data.csv')
eth_df = pd.read_csv('ETH-Market Data.csv')
doge_df = pd.read_csv('DOGE-Market Data.csv')
xrp_df = pd.read_csv('XRP-Market Data.csv')

social_df["text"] = social_df["Title"].astype(str) + social_df["Post Text"].astype(str)

social_df["text"] = social_df["text"].str.replace('nan', '')
# social_df.head()

# Calculating VADER Sntiment Score

In [None]:
def sentiment_scores(sentence):

    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(sentence)

    return {'neg': sentiment_dict['neg'], 'neu': sentiment_dict['neu'], 'pos': sentiment_dict['pos']}

social_df['sentDict'] = [sentiment_scores(x) for x in social_df["text"]]

In [None]:
social_df['neg'] = [x['neg'] for x in social_df["sentDict"]]
social_df['neu'] = [x['neu'] for x in social_df["sentDict"]]
social_df['pos'] = [x['pos'] for x in social_df["sentDict"]]

social_df['date'] = pd.to_datetime(social_df["Creation Date"]).dt.date
# social_df.head()

# Merging Datasets to Create a Social Media Dataset

In [None]:
social_summary_df = pd.DataFrame(social_df['date'].unique())
social_summary_df.columns = ['Date']

postCount = social_df.groupby('date').Title.count().reset_index()
postCount.columns = ['Date','PostCount']

NegMean = social_df.groupby('date').neg.mean().reset_index()
NegMean.columns = ['Date','NegMean']

NeuMean = social_df.groupby('date').neu.mean().reset_index()
NeuMean.columns = ['Date','NeuMean']

PosMean = social_df.groupby('date').pos.mean().reset_index()
PosMean.columns = ['Date','PosMean']

social_summary_df = pd.merge(social_summary_df, postCount[['Date','PostCount']], on='Date')
social_summary_df = pd.merge(social_summary_df, NegMean[['Date','NegMean']], on='Date')
social_summary_df = pd.merge(social_summary_df, NeuMean[['Date','NeuMean']], on='Date')
social_summary_df = pd.merge(social_summary_df, PosMean[['Date','PosMean']], on='Date')

# social_summary_df.sort_values('Date').head()
# social_summary_df.dtypes

### Labeling Days with Positive, Negative, or Zero for Each Cryptocurrency based on % Daily Changes and Creating Market Dataset

In [None]:
def MarketLabeler(change):

    change = change[0]
    status = ''

    if change>0:
        status = 'p'
    elif change<0:
        status = 'n'
    else:
        status = 'z'

    return status
    

market_df = pd.DataFrame(social_df['date'].unique())
market_df.columns = ['Date']

market_df['Date'] = pd.to_datetime(market_df['Date']).dt.date
btc_df['Date'] = pd.to_datetime(btc_df['Date']).dt.date
eth_df['Date'] = pd.to_datetime(eth_df['Date']).dt.date
doge_df['Date'] = pd.to_datetime(doge_df['Date']).dt.date
xrp_df['Date'] = pd.to_datetime(xrp_df['Date']).dt.date


market_df = pd.merge(market_df, btc_df[['Date','Change %']], on='Date')
market_df = pd.merge(market_df, eth_df[['Date','Change %']], on='Date')
market_df = pd.merge(market_df, doge_df[['Date','Change %']], on='Date')
market_df = pd.merge(market_df, xrp_df[['Date','Change %']], on='Date')


market_df.columns = [['Date','btcChange','ethChange','dogeChange','xrpChange']]


market_df[['btcChange','ethChange','dogeChange','xrpChange']] = market_df[['btcChange','ethChange','dogeChange','xrpChange']].replace('%', '',regex=True)
market_df[['btcChange','ethChange','dogeChange','xrpChange']] = market_df[['btcChange','ethChange','dogeChange','xrpChange']].astype(float)

market_df['btcChange'] = [MarketLabeler(x) for x in market_df['btcChange'].values.tolist()]
market_df['ethChange'] = [MarketLabeler(x) for x in market_df['ethChange'].values.tolist()]
market_df['dogeChange'] = [MarketLabeler(x) for x in market_df['dogeChange'].values.tolist()]
market_df['xrpChange'] = [MarketLabeler(x) for x in market_df['xrpChange'].values.tolist()]

# market_df['Date'] = pd.to_datetime(market_df['Date']).dt.date

# market_df.dtypes

# Mearging Market and Social Media Datasets on Date Considering One Day Lag

In [None]:
social_summary_df["LagDate"] = social_summary_df["Date"] + timedelta(days=-1)
market_df["LagDate"] = market_df['Date'] + timedelta(days=0)

market_df.columns = [['Date','btcChange','ethChange','dogeChange','xrpChange','LagDate']]

market_df = market_df.drop(market_df[(market_df.LagDate < datetime.strptime('2022-01-10', '%Y-%m-%d').date())].index)
market_df = market_df.drop(market_df[(market_df.LagDate > datetime.strptime('2022-02-08', '%Y-%m-%d').date())].index)

social_summary_df = social_summary_df.drop(social_summary_df[(social_summary_df.LagDate < datetime.strptime('2022-01-10', '%Y-%m-%d').date())].index)
social_summary_df = social_summary_df.drop(social_summary_df[(social_summary_df.LagDate > datetime.strptime('2022-02-08', '%Y-%m-%d').date())].index)


df = pd.merge(social_summary_df, market_df[['Date','btcChange','ethChange','dogeChange','xrpChange','LagDate']], on='LagDate', how='inner')


# market_df.to_csv('market_df.csv', index=False)
# social_summary_df.to_csv('social_summary_df.csv', index=False)

# Training and Evaluating ML Models

In [97]:
SGD = Pipeline([
    ('normalizer', Normalizer()), 
    ('SGD', SGDClassifier())
])

DT = Pipeline([
    ('normalizer', Normalizer()),
    ('DT', DecisionTreeClassifier())
])

GD = Pipeline([
    ('normalizer', Normalizer()),
    ('GD', GradientBoostingClassifier())
])

NB = Pipeline([
    ('normalizer', Normalizer()),
    ('NB', BernoulliNB())
])

RF = Pipeline([
    ('normalizer', Normalizer()),
    ('RF', RandomForestClassifier())
])


LR = Pipeline([
    ('normalizer', Normalizer()),
    ('LR', LogisticRegression())
])

SVC = Pipeline([
    ('normalizer', Normalizer()),
    ('SVC', NuSVC())
])

MLP = Pipeline([
    ('normalizer', Normalizer()),
    ('MLP', MLPClassifier())
])

DM = Pipeline([
    ('normalizer', Normalizer()),
    ('Dummy', DummyClassifier())
])



clfs = [NB, RF, SGD, LR, SVC, MLP, DM]

In [98]:
features = ['NegMean', 'NeuMean', 'PosMean'] 
label = ['btcChange','ethChange','dogeChange','xrpChange']

X_train, X_test, Y_train, Y_test = train_test_split(df[features], df[label], test_size=0.30, train_size=0.70)

In [100]:
for currency in label:

    X_train, X_test, Y_train, Y_test = train_test_split(df[features], df[currency], test_size=0.30, train_size=0.70)
    
    for clf in clfs:
        
        start = time.time()
        
        clf.fit(X_train, Y_train)
        preds = clf.predict(X_test)
        
        print('\t \t      **** %s:  %s ****'% (currency[0:3].upper(), clf.steps[1][0]) )
        print('')
        
        # Print the confusion matrix
        #print(confusion_matrix(Y_test, preds))

        # Print the precision and recall, among other metrics
        print(classification_report(Y_test, preds, digits=3))
        
        stop = time.time()
        duration = stop-start
        print("Execution Time: ",round(duration, 4),"sec")
        print('')
        print('-------------------------------------------------------')

	 	      **** BTC:  NB ****

              precision    recall  f1-score   support

           n      0.000     0.000     0.000         4
           p      0.556     1.000     0.714         5

    accuracy                          0.556         9
   macro avg      0.278     0.500     0.357         9
weighted avg      0.309     0.556     0.397         9

Execution Time:  0.0271 sec

-------------------------------------------------------
	 	      **** BTC:  RF ****

              precision    recall  f1-score   support

           n      0.500     0.250     0.333         4
           p      0.571     0.800     0.667         5

    accuracy                          0.556         9
   macro avg      0.536     0.525     0.500         9
weighted avg      0.540     0.556     0.519         9

Execution Time:  0.2487 sec

-------------------------------------------------------
	 	      **** BTC:  SGD ****

              precision    recall  f1-score   support

           n      0.000     0.000