In [1]:
# basic libraries to work on the dataframe
import pandas as pd
import numpy as np
# libraries
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

In [2]:
# Reading the data file using pandas
data1 = pd.read_csv('FinalDataset.csv')


In [3]:
#viewing all rows of a pandas dataframe
pd.set_option('display.max_rows', None)

## Data pre-processing

In [4]:
#grouping the important attributes by 'STOCK' and 'DATE'
#calculating the mean of the attributes
#resetting the index after each calculation
averagewords = data1.groupby(["STOCK", "DATE"])["WordsPerTweet"].mean().reset_index()
averageletters = data1.groupby(["STOCK", "DATE"])["LettersPerTweet"].mean().reset_index()
nrtweets = data1.groupby(["STOCK", "DATE"])["LAST_PRICE"].count().reset_index()
nrnouns = data1.groupby(["STOCK", "DATE"])["NOUN"].mean().reset_index()
nrverbs = data1.groupby(["STOCK", "DATE"])["VERB"].mean().reset_index()
nradj = data1.groupby(["STOCK", "DATE"])["ADJ"].mean().reset_index()
polarity = data1.groupby(["STOCK", "DATE"])["LSTM_POLARITY"].mean().reset_index()
textblob = data1.groupby(["STOCK", "DATE"])["TEXTBLOB_POLARITY"].mean().reset_index()
volatility10 = data1.groupby(["STOCK", "DATE"])["VOLATILITY_10D"].mean().reset_index()
volatility30 = data1.groupby(["STOCK", "DATE"])["VOLATILITY_30D"].mean().reset_index()

In [5]:
averagewords['tuple'] = list(zip(averagewords['STOCK'], averagewords['DATE']))

In [6]:
#adding the features to a dictionary
avglettersdict = dict(zip(list(zip(averageletters['STOCK'], averageletters['DATE'])), averageletters['LettersPerTweet']))
nrtweetsdict = dict(zip(list(zip(nrtweets['STOCK'], nrtweets['DATE'])), nrtweets['LAST_PRICE']))
noundict = dict(zip(list(zip(nrnouns['STOCK'], nrnouns['DATE'])), nrnouns['NOUN']))
verbdict = dict(zip(list(zip(nrverbs['STOCK'], nrverbs['DATE'])), nrverbs['VERB']))
adjdict = dict(zip(list(zip(nradj['STOCK'], nradj['DATE'])), nradj['ADJ']))
closingpricedict =  dict(zip(list(zip(data1['STOCK'], data1['DATE'])), data1['LAST_PRICE']))
polaritydict =  dict(zip(list(zip(polarity['STOCK'], polarity['DATE'])), polarity['LSTM_POLARITY']))
textblobdict =  dict(zip(list(zip(textblob['STOCK'], textblob['DATE'])), textblob['TEXTBLOB_POLARITY']))
volatility10dict =  dict(zip(list(zip(volatility10['STOCK'], volatility10['DATE'])), volatility10['VOLATILITY_10D']))
volatility30dict =  dict(zip(list(zip(volatility30['STOCK'], volatility30['DATE'])), volatility30['VOLATILITY_30D']))

In [7]:
#adding the important attributes to the 'averagewords' dataframe
averagewords['Tweets Per Day'] = averagewords['tuple'].apply(lambda x : nrtweetsdict[x])
averagewords['Average Letters Per Tweet'] = averagewords['tuple'].apply(lambda x : avglettersdict[x])
averagewords['Average Nouns Per Tweet'] = averagewords['tuple'].apply(lambda x : noundict[x])
averagewords['Average Verbs Per Tweet'] = averagewords['tuple'].apply(lambda x : verbdict[x])
averagewords['Average Adjectives Per Tweet'] = averagewords['tuple'].apply(lambda x : adjdict[x])
averagewords['Average Sentiment Per Tweet'] = averagewords['tuple'].apply(lambda x : polaritydict[x])
averagewords['Average Textblob Sentiment Per Tweet'] = averagewords['tuple'].apply(lambda x : textblobdict[x])
averagewords['Average 10Day Volatilty'] = averagewords['tuple'].apply(lambda x : volatility10dict[x])
averagewords['Average 30Day Volatilty'] = averagewords['tuple'].apply(lambda x : volatility30dict[x])
averagewords['Closing Price'] = averagewords['tuple'].apply(lambda x : closingpricedict[x])

In [8]:
averagewords = averagewords.drop(columns = ['tuple']) 
#convert 'DATE' column to date type
averagewords['DATE'] = pd.to_datetime(averagewords['DATE'])
#performing experiments by adding or dropping specific columns to the dataset
#averagewords = averagewords.drop(columns = ['Average 10Day Volatilty','Average 30Day Volatilty', 'WordsPerTweet', 'Average Tweets Per Day', 'Average Letters Per Tweet', 'Average Nouns Per Tweet'])
averagewords = averagewords.drop(columns = ['Average 10Day Volatilty','Average 30Day Volatilty']) 

In [9]:
#Sorting the dataset by Stock and Date
averagewords = averagewords.sort_values(['STOCK', 'DATE'],
              ascending = [True, True]).reset_index(drop=True)

## Creating a class attribute

In [10]:
Prediction = []
count = 0
interval = 1
not_found = []

for index in range(len(averagewords)):
    currentStock = averagewords.loc[index, 'STOCK']
    future_index = index + interval
    
    while future_index < len(averagewords):
        #If statement that makes sure that the stocks are independent from each other
        if averagewords.loc[future_index, 'STOCK'] == currentStock:
            price = averagewords.iloc[index]['Closing Price']
            future_price = averagewords.iloc[future_index]['Closing Price'] 
            #if the difference between the future and current price is greater than 0
            #assign the class variable to 1
            #else assign the class variable to 0
            if future_price - price > 0:
                Prediction.append(1) 
            else:
                Prediction.append(0)
            break
        else:
            future_index += 1
          
    if future_index >= len(averagewords):
        not_found.append(index)

averagewords = averagewords.drop(not_found)
averagewords.reset_index(drop=True)

averagewords["Class"] = np.array(Prediction)

print(len(averagewords), len(Prediction))        

averagewords

503 503


Unnamed: 0,STOCK,DATE,WordsPerTweet,Tweets Per Day,Average Letters Per Tweet,Average Nouns Per Tweet,Average Verbs Per Tweet,Average Adjectives Per Tweet,Average Sentiment Per Tweet,Average Textblob Sentiment Per Tweet,Closing Price,Class
0,Amazon,2017-01-02,17.5,12,92.25,2.5,1.833333,0.5,-0.166667,0.070778,832.35,1
1,Amazon,2017-01-03,15.666667,6,85.333333,1.333333,1.166667,0.333333,0.0,0.141667,853.08,1
2,Amazon,2017-01-04,14.8,5,83.2,1.0,0.4,0.0,0.6,0.06,886.54,1
3,Amazon,2017-01-06,14.0,2,88.5,2.5,1.0,0.5,0.0,0.0,995.95,0
4,Amazon,2017-01-07,16.777778,9,82.666667,1.888889,1.0,0.888889,-0.333333,0.005556,968.0,1
5,Amazon,2017-01-08,21.384615,13,99.538462,1.538462,1.846154,1.076923,-0.538462,0.001534,996.19,0
6,Amazon,2017-01-09,20.848485,33,109.424242,3.060606,4.757576,0.060606,0.939394,0.187879,978.25,1
7,Amazon,2017-01-11,17.428571,7,97.857143,4.0,0.857143,1.142857,-0.142857,0.14961,1103.68,1
8,Amazon,2017-01-12,16.416667,12,84.5,2.583333,1.0,1.0,0.5,0.030395,1162.35,0
9,Amazon,2017-01-31,15.826087,23,81.173913,2.391304,1.826087,0.652174,0.130435,0.047826,823.48,1


In [11]:
# information regarding the attributes
averagewords.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 503 entries, 0 to 507
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   STOCK                                 503 non-null    object        
 1   DATE                                  503 non-null    datetime64[ns]
 2   WordsPerTweet                         503 non-null    float64       
 3   Tweets Per Day                        503 non-null    int64         
 4   Average Letters Per Tweet             503 non-null    float64       
 5   Average Nouns Per Tweet               503 non-null    float64       
 6   Average Verbs Per Tweet               503 non-null    float64       
 7   Average Adjectives Per Tweet          503 non-null    float64       
 8   Average Sentiment Per Tweet           503 non-null    float64       
 9   Average Textblob Sentiment Per Tweet  503 non-null    float64       
 10  Cl

In [12]:
# check whether there are any duplicates
averagewords.duplicated().sum()

0

In [13]:
#checking the number of null values for each column
averagewords.isnull().sum()

STOCK                                   0
DATE                                    0
WordsPerTweet                           0
Tweets Per Day                          0
Average Letters Per Tweet               0
Average Nouns Per Tweet                 0
Average Verbs Per Tweet                 0
Average Adjectives Per Tweet            0
Average Sentiment Per Tweet             0
Average Textblob Sentiment Per Tweet    0
Closing Price                           0
Class                                   0
dtype: int64

In [14]:
# check the shape of the dataset
averagewords.shape

(503, 12)

In [15]:
# see the class distribution
averagewords["Class"].groupby(averagewords["Class"]).count()

Class
0    271
1    232
Name: Class, dtype: int64

## Creating the classifiers and performing experiments

### Random Forest

In [16]:
#prepare training data
#predictor and class attributes
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

scores = 0

for i in range(30):
    # splits the data in training + testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # builds the classifier
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    # evaluates on the test data
    scores += forest.score(X_test, y_test)
    
print(scores / 30)

0.5663366336633664


### Decision Tree

In [17]:
#prepare training data
#predictor and class attributes
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

scores = 0

for i in range(30):
    # splits the data in training + testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # builds the classifier
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    # evaluates on the test data
    scores += tree.score(X_test, y_test)
    
print(scores / 30) 

0.5250825082508251


 ### Support Vector Machine

In [18]:
#prepare training data
#predictor and class attributes
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

scores = 0

for i in range(30):
    # splits the data in training + testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # builds the classifier
    svm = SVC()
    svm.fit(X_train, y_train)
    # evaluates on the test data
    scores += svm.score(X_test, y_test)
    
print(scores / 30) 

0.5217821782178219


### StratifiedKfold cross-validation

In [19]:
#a function which gets the cross-validation score
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)


In [20]:
# stratified cross-validation
kf = StratifiedKFold(n_splits=10)
kf

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

In [21]:
# specifying the predictor and class attributes
X = averagewords.loc[:,'WordsPerTweet': 'Closing Price']
y = averagewords['Class']

In [22]:
#creating a list for each classifier
scores_rf = []
scores_dTree = [] 
scores_svm = []

for train_index, test_index, in kf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
   
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=80), X_train, X_test, y_train, y_test))
    scores_dTree.append(get_score(DecisionTreeClassifier(), X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), X_train, X_test, y_train, y_test))
  

In [23]:
#a function that divides the sum of the scores by the total amount of scores
def Average(lst):
    return sum(lst) / len(lst)

In [24]:
scores_rf

[0.6274509803921569,
 0.5686274509803921,
 0.49019607843137253,
 0.62,
 0.46,
 0.34,
 0.52,
 0.44,
 0.5,
 0.48]

In [25]:
#calculating the average of the 10 scores
average = Average(scores_rf)
print("Average of RF Scores =", round(average, 2))

Average of RF Scores = 0.5


In [26]:
#calculating the standard deviation of the scores
st_dev1 = np.std(scores_rf)
print("Standard deviation of RF: " + str(st_dev1))

Standard deviation of RF: 0.08175295914115437


In [27]:
scores_dTree

[0.5490196078431373,
 0.5098039215686274,
 0.49019607843137253,
 0.62,
 0.58,
 0.4,
 0.48,
 0.48,
 0.38,
 0.48]

In [28]:
#calculating the average of the 10 scores
average = Average(scores_dTree)
print("Average of Decision Tree Scores =", round(average, 2))

Average of Decision Tree Scores = 0.5


In [29]:
#calculating the standard deviation of the scores
st_dev2 = np.std(scores_dTree)
print("Standard deviation of Decision Tree: " + str(st_dev2))

Standard deviation of Decision Tree: 0.0699279466590599


In [30]:
scores_svm

[0.5294117647058824,
 0.5098039215686274,
 0.5294117647058824,
 0.54,
 0.54,
 0.54,
 0.54,
 0.54,
 0.54,
 0.54]

In [31]:
#calculating the average of the 10 scores
average = Average(scores_svm)
print("Average of SVM Scores =", round(average, 2))

Average of SVM Scores = 0.53


In [32]:
#calculating the standard deviation of the scores
st_dev3 = np.std(scores_svm)
print("Standard deviation of SVM: " + str(st_dev3))

Standard deviation of SVM: 0.009338686881160557


### Confusion Matrix

In [33]:
##Displaying a confusion matrix for random forest
predictions = forest.predict(X_test, X_train)
cm = confusion_matrix(y_test, predictions, labels=forest.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest.classes_)
_ = disp.plot()

TypeError: predict() takes 2 positional arguments but 3 were given

In [None]:
##Displaying a confusion matrix for decision tree
predictions = tree.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=tree.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
_ = disp.plot()

In [None]:
##Displaying a confusion matrix for support vector machine
import seaborn as sns

predictions = svm.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=svm.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svm.classes_)
_ = disp.plot()

In [None]:
#data1