In [1]:
import pickle
from datetime import timedelta, datetime

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Classifier classes (not instances) compared in every experiment below;
# each evaluation cell instantiates them with default hyper-parameters.
models = [
    LinearDiscriminantAnalysis,
    KNeighborsClassifier,
    RandomForestClassifier,
    GaussianNB,
]

# Coin names used to build the per-coin CSV file names
# ('<coin>_twitter_signals.csv' and '<coin>_news_signals.csv').
coin_names = [
    'Bitcoin',
    'Litecoin',
    'Ethereum',
    'Dogecoin',
    'XRP',
]

Part 1 - Twitter signals only

In [3]:
# Build the Part 1 dataset: for every coin, join the Twitter signals with the
# news file after stripping the news sentiment columns.
# NOTE(review): the news file is presumably still needed here because it
# carries 'label', 'Returns' and 'Adj Close' -- confirm against the CSVs.
NEWS_SENTIMENT_COLUMNS = ['compound', 'positive', 'neutral', 'negative', 'polarity', 'subjective']

per_coin_frames = []
for coin_name in coin_names:
    twitter_signals = pd.read_csv('{0}_twitter_signals.csv'.format(coin_name))
    # Rewrite dates from YYYY-MM-DD to MM-DD-YYYY in a single pass so they can
    # be used as the join key (presumably the news file's format -- TODO confirm).
    twitter_signals['Date'] = twitter_signals['Date'].map(
        lambda raw: datetime.strptime(raw, '%Y-%m-%d').strftime('%m-%d-%Y')
    )

    news_signals = pd.read_csv('{0}_news_signals.csv'.format(coin_name))
    news_signals = news_signals.drop(columns=NEWS_SENTIMENT_COLUMNS)

    # Inner join: only dates present in both files survive.
    per_coin_frames.append(twitter_signals.merge(news_signals, on='Date'))

# Concatenate once instead of growing a frame inside the loop:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
all_crypto = pd.concat(per_coin_frames, ignore_index=True)

# Features are everything except the join key, the target and the price columns.
X = all_crypto.drop(columns=['Date', 'label', 'Returns', 'Adj Close'])
y = all_crypto['label']

In [4]:
# Score every candidate classifier with repeated stratified 5-fold CV.
# The splitter is seeded, so every model is evaluated on the same folds.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

# NOTE(review): 'neg_mean_squared_error' and 'r2' are regression metrics. The
# recorded outputs show loss == -(1 - accuracy), which suggests 0/1 labels;
# if so they add nothing beyond accuracy -- consider precision/recall/F1.
scoring = ['neg_mean_squared_error', 'r2', 'accuracy']

for model in models:
    # One cross_validate pass fits each fold once and scores it with all three
    # metrics. Previously each metric triggered its own full cross-validation
    # with a fresh (and, for RandomForest, differently randomised) model.
    scores = cross_validate(model(), X, y, scoring=scoring, cv=cv, n_jobs=-1)
    # Mean squared error is a loss, so scikit-learn reports it negated.
    loss = scores['test_neg_mean_squared_error']
    r2 = scores['test_r2']
    accuracy = scores['test_accuracy']
    print('model:' + str(model))
    print('Mean loss for: %.3f (+- %.3f)' % (np.mean(loss), np.std(loss)))
    print('Mean r2 for: %.3f (+- %.3f)' % (np.mean(r2), np.std(r2)))
    print('Mean Accuracy for: %.3f (+- %.3f)' % (np.mean(accuracy), np.std(accuracy)))
    print('--------------------------------------------------')

model:<class 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis'>
Mean loss for: -0.309 (+- 0.022)
Mean r2 for: -0.035 (+- 0.074)
Mean Accuracy for: 0.691 (+- 0.022)
--------------------------------------------------
model:<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Mean loss for: -0.415 (+- 0.075)
Mean r2 for: -0.387 (+- 0.251)
Mean Accuracy for: 0.641 (+- 0.041)
--------------------------------------------------
model:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Mean loss for: -0.429 (+- 0.116)
Mean r2 for: -0.499 (+- 0.365)
Mean Accuracy for: 0.653 (+- 0.034)
--------------------------------------------------
model:<class 'sklearn.naive_bayes.GaussianNB'>
Mean loss for: -0.404 (+- 0.111)
Mean r2 for: -0.352 (+- 0.371)
Mean Accuracy for: 0.640 (+- 0.060)
--------------------------------------------------


Part 2 - News signals only

In [5]:
# Build the Part 2 dataset: news signals only, stacked across all coins.
# The Twitter file is not read here because nothing in this part uses it.
per_coin_frames = [
    pd.read_csv('{0}_news_signals.csv'.format(coin_name))
    for coin_name in coin_names
]

# Concatenate once instead of growing a frame inside the loop:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
all_crypto = pd.concat(per_coin_frames, ignore_index=True)

# Features are everything except the date, the target and the price columns.
X = all_crypto.drop(columns=['Date', 'label', 'Returns', 'Adj Close'])
y = all_crypto['label']

In [6]:
# Score every candidate classifier with repeated stratified 5-fold CV.
# The splitter is seeded, so every model is evaluated on the same folds.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

# NOTE(review): 'neg_mean_squared_error' and 'r2' are regression metrics. The
# recorded outputs show loss == -(1 - accuracy), which suggests 0/1 labels;
# if so they add nothing beyond accuracy -- consider precision/recall/F1.
scoring = ['neg_mean_squared_error', 'r2', 'accuracy']

for model in models:
    # One cross_validate pass fits each fold once and scores it with all three
    # metrics. Previously each metric triggered its own full cross-validation
    # with a fresh (and, for RandomForest, differently randomised) model.
    scores = cross_validate(model(), X, y, scoring=scoring, cv=cv, n_jobs=-1)
    # Mean squared error is a loss, so scikit-learn reports it negated.
    loss = scores['test_neg_mean_squared_error']
    r2 = scores['test_r2']
    accuracy = scores['test_accuracy']
    print('model:' + str(model))
    print('Mean loss for: %.3f (+- %.3f)' % (np.mean(loss), np.std(loss)))
    print('Mean r2 for: %.3f (+- %.3f)' % (np.mean(r2), np.std(r2)))
    print('Mean Accuracy for: %.3f (+- %.3f)' % (np.mean(accuracy), np.std(accuracy)))
    print('--------------------------------------------------')

model:<class 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis'>
Mean loss for: -0.353 (+- 0.064)
Mean r2 for: -0.182 (+- 0.214)
Mean Accuracy for: 0.663 (+- 0.051)
--------------------------------------------------
model:<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Mean loss for: -0.408 (+- 0.087)
Mean r2 for: -0.365 (+- 0.291)
Mean Accuracy for: 0.628 (+- 0.055)
--------------------------------------------------
model:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Mean loss for: -0.369 (+- 0.068)
Mean r2 for: -0.245 (+- 0.191)
Mean Accuracy for: 0.645 (+- 0.035)
--------------------------------------------------
model:<class 'sklearn.naive_bayes.GaussianNB'>
Mean loss for: -0.671 (+- 0.187)
Mean r2 for: -1.244 (+- 0.624)
Mean Accuracy for: 0.489 (+- 0.110)
--------------------------------------------------


Part 3 - Twitter and news signals combined (merged on Date)

In [7]:
# Build the Part 3 dataset: for every coin, join the full news signals with
# the Twitter signals on 'Date', then stack all coins into one frame.
per_coin_frames = []
for coin_name in coin_names:
    twitter_signals = pd.read_csv('{0}_twitter_signals.csv'.format(coin_name))
    # Rewrite dates from YYYY-MM-DD to MM-DD-YYYY in a single pass so they can
    # be used as the join key (presumably the news file's format -- TODO confirm).
    twitter_signals['Date'] = twitter_signals['Date'].map(
        lambda raw: datetime.strptime(raw, '%Y-%m-%d').strftime('%m-%d-%Y')
    )

    news_signals = pd.read_csv('{0}_news_signals.csv'.format(coin_name))

    # Inner join: only dates present in both files survive.
    per_coin_frames.append(news_signals.merge(twitter_signals, on='Date'))

# Concatenate once instead of growing a frame inside the loop:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
all_crypto = pd.concat(per_coin_frames, ignore_index=True)

# Features are everything except the join key, the target and the price columns.
X = all_crypto.drop(columns=['Date', 'label', 'Returns', 'Adj Close'])
y = all_crypto['label']

In [8]:
# Score every candidate classifier with repeated stratified 5-fold CV.
# The splitter is seeded, so every model is evaluated on the same folds.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

# NOTE(review): 'neg_mean_squared_error' and 'r2' are regression metrics. The
# recorded outputs show loss == -(1 - accuracy), which suggests 0/1 labels;
# if so they add nothing beyond accuracy -- consider precision/recall/F1.
scoring = ['neg_mean_squared_error', 'r2', 'accuracy']

for model in models:
    # One cross_validate pass fits each fold once and scores it with all three
    # metrics. Previously each metric triggered its own full cross-validation
    # with a fresh (and, for RandomForest, differently randomised) model.
    scores = cross_validate(model(), X, y, scoring=scoring, cv=cv, n_jobs=-1)
    # Mean squared error is a loss, so scikit-learn reports it negated.
    loss = scores['test_neg_mean_squared_error']
    r2 = scores['test_r2']
    accuracy = scores['test_accuracy']
    print('model:' + str(model))
    print('Mean loss for: %.3f (+- %.3f)' % (np.mean(loss), np.std(loss)))
    print('Mean r2 for: %.3f (+- %.3f)' % (np.mean(r2), np.std(r2)))
    print('Mean Accuracy for: %.3f (+- %.3f)' % (np.mean(accuracy), np.std(accuracy)))
    print('--------------------------------------------------')

model:<class 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis'>
Mean loss for: -0.371 (+- 0.076)
Mean r2 for: -0.240 (+- 0.253)
Mean Accuracy for: 0.645 (+- 0.067)
--------------------------------------------------
model:<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Mean loss for: -0.429 (+- 0.077)
Mean r2 for: -0.436 (+- 0.257)
Mean Accuracy for: 0.635 (+- 0.049)
--------------------------------------------------
model:<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Mean loss for: -0.368 (+- 0.071)
Mean r2 for: -0.312 (+- 0.232)
Mean Accuracy for: 0.687 (+- 0.025)
--------------------------------------------------
model:<class 'sklearn.naive_bayes.GaussianNB'>
Mean loss for: -0.753 (+- 0.179)
Mean r2 for: -1.520 (+- 0.599)
Mean Accuracy for: 0.443 (+- 0.109)
--------------------------------------------------
