In [1]:
# basic libraries to work on the dataframe
import pandas as pd
import numpy as np
# libraries
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

In [2]:
# Load the dataset into a DataFrame. The path is relative, so the CSV must be in
# the notebook's working directory; otherwise pandas raises FileNotFoundError.
DATASET_PATH = 'FinalDataset.csv'
data1 = pd.read_csv(DATASET_PATH)


FileNotFoundError: [Errno 2] No such file or directory: 'FinalDataset.csv'

In [None]:
# Lift pandas' row-display limit so that DataFrames are rendered with every row
pd.options.display.max_rows = None

## Data pre-processing

In [None]:
# Aggregate the raw rows to one record per (STOCK, DATE) pair.
# Every frame below holds the per-group mean of its source column, except
# `nrtweets`, which counts the non-null LAST_PRICE entries of each group.
# reset_index() turns the STOCK / DATE group keys back into ordinary columns.
_per_stock_day = data1.groupby(["STOCK", "DATE"])


def _daily_mean(column):
    """Return the per-(STOCK, DATE) mean of `column` as a flat DataFrame."""
    return _per_stock_day[column].mean().reset_index()


averagewords = _daily_mean("WordsPerTweet")
averageletters = _daily_mean("LettersPerTweet")
nrtweets = _per_stock_day["LAST_PRICE"].count().reset_index()
nrnouns = _daily_mean("NOUN")
nrverbs = _daily_mean("VERB")
nradj = _daily_mean("ADJ")
polarity = _daily_mean("LSTM_POLARITY")
textblob = _daily_mean("TEXTBLOB_POLARITY")
volatility10 = _daily_mean("VOLATILITY_10D")
volatility30 = _daily_mean("VOLATILITY_30D")

In [None]:
# Composite (STOCK, DATE) key for every row; later cells use it to look up the
# other per-group features.
averagewords['tuple'] = list(
    averagewords[['STOCK', 'DATE']].itertuples(index=False, name=None)
)

In [None]:
# Build one {(STOCK, DATE): value} lookup table per feature.
def _by_stock_date(frame, value_column):
    """Map each (STOCK, DATE) pair found in `frame` to its `value_column` entry."""
    keys = zip(frame['STOCK'], frame['DATE'])
    return dict(zip(keys, frame[value_column]))


avglettersdict = _by_stock_date(averageletters, 'LettersPerTweet')
nrtweetsdict = _by_stock_date(nrtweets, 'LAST_PRICE')
noundict = _by_stock_date(nrnouns, 'NOUN')
verbdict = _by_stock_date(nrverbs, 'VERB')
adjdict = _by_stock_date(nradj, 'ADJ')
# Built from the raw rows rather than an aggregate: when a (STOCK, DATE) pair
# occurs on several rows, the LAST_PRICE of the last such row is the one kept.
closingpricedict = _by_stock_date(data1, 'LAST_PRICE')
polaritydict = _by_stock_date(polarity, 'LSTM_POLARITY')
textblobdict = _by_stock_date(textblob, 'TEXTBLOB_POLARITY')
volatility10dict = _by_stock_date(volatility10, 'VOLATILITY_10D')
volatility30dict = _by_stock_date(volatility30, 'VOLATILITY_30D')

In [None]:
# Attach every feature to `averagewords` by looking up each row's (STOCK, DATE) key.
# The insertion order below fixes the column order of the frame, which the
# label-based column slices in the modelling cells rely on.
_feature_lookups = {
    'Tweets Per Day': nrtweetsdict,
    'Average Letters Per Tweet': avglettersdict,
    'Average Nouns Per Tweet': noundict,
    'Average Verbs Per Tweet': verbdict,
    'Average Adjectives Per Tweet': adjdict,
    'Average Sentiment Per Tweet': polaritydict,
    'Average Textblob Sentiment Per Tweet': textblobdict,
    'Average 10Day Volatilty': volatility10dict,
    'Average 30Day Volatilty': volatility30dict,
    'Closing Price': closingpricedict,
}

for _column, _lookup in _feature_lookups.items():
    # a missing key raises KeyError, exactly like a direct dict lookup
    averagewords[_column] = averagewords['tuple'].apply(lambda key: _lookup[key])

In [None]:
# Remove the helper key column and both volatility features in one go.
# Note: a wider experiment also dropped 'WordsPerTweet', 'Average Letters Per Tweet',
# 'Average Nouns Per Tweet' and the tweet-count column; extend the list below to repeat it.
averagewords = averagewords.drop(
    columns=['tuple', 'Average 10Day Volatilty', 'Average 30Day Volatilty']
)
# Parse the 'DATE' column into datetime values
averagewords['DATE'] = pd.to_datetime(averagewords['DATE'])

In [None]:
# Order the rows by stock and, within each stock, by date (both ascending),
# then renumber the index 0..n-1 so positions and labels coincide.
_sorted_rows = averagewords.sort_values(by=['STOCK', 'DATE'])
averagewords = _sorted_rows.reset_index(drop=True)

## Creating a class attribute

In [None]:
# Label every row with the direction of a later closing price of the same stock:
#   Class = 1 when the closing price `interval` rows further down (same STOCK)
#             is strictly higher than the row's own closing price,
#   Class = 0 otherwise (including when either price is NaN).
# Rows that have no such later row for their stock cannot be labelled and are dropped.
# This relies on the frame being sorted by STOCK and DATE with a 0..n-1 index,
# which the sorting cell above establishes.
interval = 1

_by_stock = averagewords.groupby('STOCK', sort=False)
# Closing price `interval` rows ahead within the same stock (NaN where none exists).
_future_price = _by_stock['Closing Price'].shift(-interval)
# cumcount(ascending=False) gives the number of rows that still follow within the
# stock, so a row has a usable future price only when at least `interval` rows remain.
_has_future = _by_stock.cumcount(ascending=False) >= interval

# Index labels of the rows that could not be labelled (kept for inspection).
not_found = averagewords.index[~_has_future].tolist()

_price_rises = (_future_price - averagewords['Closing Price']) > 0
Prediction = _price_rises[_has_future].astype(int).tolist()

# Keep only the labelled rows and actually store the renumbered index
# (reset_index returns a new frame; it does not modify the frame in place).
averagewords = averagewords.loc[_has_future].reset_index(drop=True)
averagewords["Class"] = np.array(Prediction)

# Both numbers must match: one label per remaining row.
print(len(averagewords), len(Prediction))

In [None]:
# Print the DataFrame.info() summary of the labelled frame
averagewords.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row (0 means none)
averagewords.duplicated().sum()

In [None]:
# Count of missing values in each column
averagewords.isna().sum()

In [None]:
# (number of rows, number of columns) of the labelled frame
averagewords.shape

In [None]:
# Class distribution: number of rows for each value of the 'Class' label
averagewords["Class"].groupby(averagewords["Class"]).count()

## Creating the classifiers and performing experiments

### Random Forest

In [None]:
# Repeated hold-out evaluation of a random forest.
# Predictors: every column from 'WordsPerTweet' through 'Closing Price'
# (label-based slice, both ends included); target: the 'Class' column.
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

# Number of train/test repetitions; also the divisor of the final average.
n_runs = 30
scores = 0

for i in range(n_runs):
    # a different 80/20 split per run, seeded with the run number
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # seed the forest too, otherwise the averaged score changes on every execution
    forest = RandomForestClassifier(random_state=i)
    forest.fit(X_train, y_train)
    # accumulate the accuracy on the held-out 20%
    scores += forest.score(X_test, y_test)

# mean test accuracy over all runs
print(scores / n_runs)

### Decision Tree

In [None]:
# Repeated hold-out evaluation of a decision tree.
# Predictors: every column from 'WordsPerTweet' through 'Closing Price'
# (label-based slice, both ends included); target: the 'Class' column.
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

# Number of train/test repetitions; also the divisor of the final average.
n_runs = 30
scores = 0

for i in range(n_runs):
    # a different 80/20 split per run, seeded with the run number
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # seed the tree as well: equally good candidate splits are otherwise chosen at
    # random, so the averaged score would not be reproducible
    tree = DecisionTreeClassifier(random_state=i)
    tree.fit(X_train, y_train)
    # accumulate the accuracy on the held-out 20%
    scores += tree.score(X_test, y_test)

# mean test accuracy over all runs
print(scores / n_runs)

### Support Vector Machine

In [None]:
# Repeated hold-out evaluation of a support vector classifier (default settings).
# Predictors: the columns 'WordsPerTweet' .. 'Closing Price'; target: 'Class'.
y = averagewords['Class']
X = averagewords.loc[:, 'WordsPerTweet':'Closing Price']

n_runs = 30
scores = 0

for i in range(n_runs):
    # new 80/20 split for every run, seeded with the run number
    split = train_test_split(X, y, test_size=0.2, random_state=i)
    X_train, X_test, y_train, y_test = split
    # fit a fresh classifier on the training part
    svm = SVC()
    svm.fit(X_train, y_train)
    # add this run's accuracy on the test part to the running total
    scores = scores + svm.score(X_test, y_test)

# mean test accuracy over the 30 runs
print(scores / n_runs)

### Stratified k-fold cross-validation

In [None]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and return its score on the test split.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``score(X, y)`` methods.
        It is fitted in place, so the caller's object is modified.
    X_train, y_train : training predictors and labels passed to ``fit``.
    X_test, y_test : held-out predictors and labels passed to ``score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score


In [None]:
# Stratified cross-validation splitter with ten folds; the bare name on the
# last line displays its configuration.
N_SPLITS = 10
kf = StratifiedKFold(n_splits=N_SPLITS)
kf

In [None]:
# Predictors: all columns from 'WordsPerTweet' through 'Closing Price'
# (label-based slice, both end columns included). Target: the 'Class' column.
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

In [None]:
# One list of per-fold scores for each classifier
scores_rf = []
scores_dTree = []
scores_svm = []

# kf.split yields positional row indices for the training and test part of each fold
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The forest and the tree are seeded with the fold number so that re-running
    # the cell reproduces the same scores.
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=80, random_state=fold), X_train, X_test, y_train, y_test))
    scores_dTree.append(get_score(DecisionTreeClassifier(random_state=fold), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))

In [None]:
def Average(lst):
    """Return the arithmetic mean of `lst`: the sum of its items divided by their count.

    An empty `lst` raises ZeroDivisionError because the count is zero.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

In [None]:
# Display the per-fold scores collected for the random forest
scores_rf

In [None]:
# Mean of the per-fold random forest scores, printed rounded to two decimals
average = Average(scores_rf)
print("Average of RF Scores =", round(average, 2))

In [None]:
# Standard deviation of the per-fold random forest scores
# (np.std with its default ddof=0, i.e. the population standard deviation)
st_dev1 = np.std(scores_rf)
print("Standard deviation of RF: " + str(st_dev1))

In [None]:
# Display the per-fold scores collected for the decision tree
scores_dTree

In [None]:
# Mean of the per-fold decision tree scores, printed rounded to two decimals
# (note: this overwrites the `average` variable set for the previous classifier)
average = Average(scores_dTree)
print("Average of Decision Tree Scores =", round(average, 2))

In [None]:
# Standard deviation of the per-fold decision tree scores
# (np.std with its default ddof=0, i.e. the population standard deviation)
st_dev2 = np.std(scores_dTree)
print("Standard deviation of Decision Tree: " + str(st_dev2))

In [None]:
# Display the per-fold scores collected for the support vector machine
scores_svm

In [None]:
# Mean of the per-fold SVM scores, printed rounded to two decimals
# (note: this overwrites the `average` variable set for the previous classifier)
average = Average(scores_svm)
print("Average of SVM Scores =", round(average, 2))

In [None]:
# Standard deviation of the per-fold SVM scores
# (np.std with its default ddof=0, i.e. the population standard deviation)
st_dev3 = np.std(scores_svm)
print("Standard deviation of SVM: " + str(st_dev3))