In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import date
import pandas_datareader as web
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import os  
import glob 
import warnings

warnings.filterwarnings("ignore")

In [2]:
vitrox = pd.read_csv(r"Data/Stocks_WithPriceChange/0097.KL.csv", sep="\t")

In [3]:
vitrox = vitrox.drop(['Unnamed: 0'], axis=1)
vitrox = vitrox.drop(['Unnamed: 0.1'], axis=1)
vitrox["Date"] = pd.to_datetime(vitrox["Date"], format="%Y-%m-%d")
vitrox.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close,pct_change,quote
0,2019-07-01,3.54,3.51,3.51,3.53,98400.0,3.479728,,97
1,2019-07-02,3.535,3.525,3.535,3.53,126200.0,3.479728,0.0,97
2,2019-07-03,3.515,3.5,3.515,3.51,32400.0,3.460013,-0.566562,97
3,2019-07-04,3.5,3.48,3.5,3.495,22800.0,3.445226,-0.427367,97
4,2019-07-05,3.5,3.475,3.485,3.495,20600.0,3.445226,0.0,97


In [4]:
start = dt.datetime(2020,1,1)
end = dt.datetime(2020,12,31)

In [5]:
vitrox = vitrox[(vitrox['Date'] >= start) & (vitrox['Date'] <= end)]
vitrox = vitrox.set_index('Date')
vitrox.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,pct_change,quote
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-02,3.96,3.955,3.955,3.96,6000.0,3.910993,0.0,97
2020-01-03,4.035,3.96,3.96,4.0,704600.0,3.950498,1.010096,97
2020-01-06,4.025,3.975,3.975,4.015,694800.0,3.965312,0.374988,97
2020-01-07,4.02,4.0,4.0,4.01,583600.0,3.960374,-0.124515,97
2020-01-08,4.01,4.0,4.0,4.0,1107800.0,3.950498,-0.249383,97


In [6]:
sentiment_data = pd.read_csv(r"Data/sentiment_result/sentiment_lagged_with_category.csv", sep="\t")

In [7]:
sentiment_data = sentiment_data.drop(['Unnamed: 0'], axis=1)
sentiment_data["date"] = pd.to_datetime(sentiment_data["date"], format="%Y-%m-%d")
sentiment_data = sentiment_data.rename(columns={'date':'Date'})
sentiment_data = sentiment_data[(sentiment_data['Date'] >= start) & (sentiment_data['Date'] <= end)]
sentiment_data.head()

Unnamed: 0,Date,sentiment_score,sentiment_score_1,category
30,2020-01-17,0.018119,0.028988,Axiata
31,2020-01-23,-0.334419,0.018119,Axiata
32,2020-01-24,-0.119017,-0.334419,Axiata
33,2020-01-29,0.146429,-0.119017,Axiata
34,2020-02-10,0.028221,0.146429,Axiata


In [8]:
pivot_sentiment_data = sentiment_data.pivot(index="Date", columns="category", values="sentiment_score")

In [9]:
pivot_sentiment_data = pivot_sentiment_data.fillna(0)
pivot_sentiment_data.head()

category,Axiata,Bank Negara Malaysia,Bumi Armada,CIMB,Construction,Consumer,Covid-19 Malaysia,Digi,Econpile,Ecoworld,...,Technology,Telecommunication,Tenaga,Topglove,Transportation,UOA,Utilities,Vitrox,YTL,Yinson
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,0.0,0.0,0.0,0.0,0.0,0.015264,0.0,0.0,0.0,0.334765,...,0.06819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-02,0.0,0.261956,0.0,0.052813,0.01492,0.0,0.0,0.0,0.0,0.0,...,0.016034,0.0,0.0,0.595648,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-03,0.0,-0.620632,0.0,0.022332,0.0,0.01497,0.0,0.023489,0.0,0.0,...,0.018728,0.0,0.0,0.0,0.033127,0.0,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-05,0.0,0.0,0.589797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.043314,0.511417,0.0,0.0,0.019548,0.0,0.0,0.0,0.0,0.0


In [10]:
vitrox_sentiment = pd.concat([vitrox["pct_change"], pivot_sentiment_data], axis=1)
vitrox_sentiment = vitrox_sentiment.dropna(subset=["pct_change"])
vitrox_sentiment = vitrox_sentiment.drop(vitrox_sentiment[vitrox_sentiment["pct_change"] == 0.000000].index)

In [11]:
vitrox_sentiment

Unnamed: 0_level_0,pct_change,Axiata,Bank Negara Malaysia,Bumi Armada,CIMB,Construction,Consumer,Covid-19 Malaysia,Digi,Econpile,...,Technology,Telecommunication,Tenaga,Topglove,Transportation,UOA,Utilities,Vitrox,YTL,Yinson
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-03,1.010096,0.000000,-0.620632,0.000000,0.022332,0.000000,0.014970,0.0,0.023489,0.000000,...,0.018728,0.000000,0.000000,0.000000,0.033127,0.000000,0.000000,0.000000,0.000000,0.0
2020-01-06,0.374988,0.000000,0.494941,0.000000,0.088340,0.011502,0.000000,0.0,0.122144,0.000000,...,0.392524,0.000000,0.000000,0.000000,0.000000,0.000000,0.067325,0.000000,0.000000,0.0
2020-01-07,-0.124515,0.000000,0.000000,0.000000,0.000000,0.116439,0.007940,0.0,0.000000,0.000000,...,0.027872,0.009140,0.000000,0.000000,0.000000,0.000000,-0.280261,0.016780,0.423123,0.0
2020-01-08,-0.249383,0.000000,0.000000,-0.054008,0.000000,0.000000,0.000000,0.0,0.000000,0.944624,...,0.024867,0.000000,0.000000,0.000000,0.000000,-0.860914,0.000000,0.000000,0.000000,0.0
2020-01-10,3.624984,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.808618,0.402126,...,0.000000,0.000000,0.000000,0.811845,0.000000,0.000000,0.026633,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,0.416671,-0.180722,-0.181543,0.000000,0.525665,0.000000,0.199491,0.0,0.000000,0.587467,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2020-12-28,1.659756,0.686088,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.031618,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2020-12-29,0.816316,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.040920,0.000000,0.025345,0.000000,0.000000,0.000000,0.000000,0.011372,0.000000,0.0
2020-12-30,-1.097782,0.024076,0.000000,0.000000,0.000000,0.000000,0.007368,0.0,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.820407,0.000000,0.009453,0.472429,0.010213,0.000000,0.0


In [12]:
#convert positive as 1 and negative as 0
vitrox_sentiment[vitrox_sentiment<=-0.5] = -1
vitrox_sentiment[vitrox_sentiment>=0.5] = 1
vitrox_sentiment[(vitrox_sentiment>-0.5) & ((vitrox_sentiment<-0.5)] = 0

In [23]:
vitrox_sentiment

Unnamed: 0_level_0,pct_change,Axiata,Bank Negara Malaysia,Bumi Armada,CIMB,Construction,Consumer,Covid-19 Malaysia,Digi,Econpile,...,Technology,Telecommunication,Tenaga,Topglove,Transportation,UOA,Utilities,Vitrox,YTL,Yinson
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-03,1.0,0.0,-1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2020-01-06,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2020-01-07,-1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0
2020-01-08,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
2020-01-10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-24,1.0,-1.0,-1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-28,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-12-29,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2020-12-30,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0


In [34]:
total_count = vitrox_sentiment.shape[0]

# Compare the values in the "column1" and "column2" columns
match = (vitrox_sentiment["Covid-19 Malaysia"] == vitrox_sentiment["pct_change"])
zero = (vitrox_sentiment["Covid-19 Malaysia"] == 0)
not_match = (vitrox_sentiment["Covid-19 Malaysia"] != vitrox_sentiment["pct_change"])
# Count the number of matches
count = match.sum()
count_zero = zero.sum()
not_match_count = not_match.sum() - count_zero
# Print the count
print(count)
print(not_match_count)

18
18


In [35]:
accurate = count / (count+not_match_count)
print(accurate)

0.5


In [2]:
def evaluate_match(stock_sentiment, stock_name):
    evaluate_data = pd.DataFrame()
    fieldnames = ['quote','category', 'accuracy']
    # Iterate over column names
    for column in stock_sentiment:
        # Compare the values in the "column1" and "column2" columns
        match = (stock_sentiment[column] == stock_sentiment["pct_change"])
        zero = (stock_sentiment[column] == 0)
        not_match = (stock_sentiment[column] != stock_sentiment["pct_change"])
    
        # Count the number of matches
        count = match.sum()
        count_zero = zero.sum()
        not_match_count = not_match.sum() - count_zero
    
        accuracy = count / (count+not_match_count)
        category = column
    
        record = [(stock_name, category, accuracy)]
        records = pd.DataFrame(record, columns = fieldnames)
        evaluate_data = pd.concat([evaluate_data, records], ignore_index=True, axis=0)
        
    return evaluate_data
    

In [48]:
data = evaluate_match(vitrox_sentiment, "Vitrox")
print(data)

     quote              category  accuracy
0   Vitrox            pct_change  1.000000
1   Vitrox                Axiata  0.578125
2   Vitrox  Bank Negara Malaysia  0.555556
3   Vitrox           Bumi Armada  0.666667
4   Vitrox                  CIMB  0.571429
..     ...                   ...       ...
66  Vitrox                   UOA  0.576923
67  Vitrox             Utilities  0.566667
68  Vitrox                Vitrox  0.510204
69  Vitrox                   YTL  0.545455
70  Vitrox                Yinson  0.566038

[71 rows x 3 columns]


In [13]:
#filter data
start = dt.datetime(2020,1,1)
end = dt.datetime(2020,12,31)

#get sentiment result
sentiment_data = pd.read_csv(r"Data/sentiment_result/sentiment_lagged_with_category.csv", sep="\t")

sentiment_data = sentiment_data.drop(['Unnamed: 0'], axis=1)
sentiment_data["date"] = pd.to_datetime(sentiment_data["date"], format="%Y-%m-%d")
sentiment_data = sentiment_data.rename(columns={'date':'Date'})
sentiment_data = sentiment_data[(sentiment_data['Date'] >= start) & (sentiment_data['Date'] <= end)]

pivot_sentiment_data = sentiment_data.pivot(index="Date", columns="category", values="sentiment_score")
pivot_sentiment_data = pivot_sentiment_data.fillna(0)

#for lagged sentiment score
#pivot_sentiment_data = sentiment_data.pivot(index="Date", columns="category", values="sentiment_score_1")
#pivot_sentiment_data = pivot_sentiment_data.fillna(0)

#get stock data with percentage change
directory = "Data\Stocks_WithPriceChange"

all_evaluate_data = pd.DataFrame()

for filename in os.listdir(directory):  
    if filename.endswith(".csv"):    
        quote = filename.split(".")[0]
        data = pd.read_csv(r"Data/Stocks_WithPriceChange/{0}".format(filename), sep="\t")  
        data = data.drop(['Unnamed: 0'], axis=1)
        data = data.drop(['Unnamed: 0.1'], axis=1)
        data["Date"] = pd.to_datetime(data["Date"], format="%Y-%m-%d")
        
        stock_data = data[(data['Date'] >= start) & (data['Date'] <= end)]
        stock_data = stock_data.set_index('Date')
        
        #merge stock data and sentiment data
        stock_sentiment_data = pd.concat([stock_data["pct_change"], pivot_sentiment_data], axis=1)
        stock_sentiment_data = stock_sentiment_data.dropna(subset=["pct_change"])
        stock_sentiment_data = stock_sentiment_data.drop(stock_sentiment_data[stock_sentiment_data["pct_change"] == 0.000000].index)
        
        #pct_change
        pct_change = stock_sentiment_data['pct_change']
        pct_change[pct_change < 0] = -1
        pct_change[pct_change > 0] = 1
        pct_change[pct_change == 0] = 0
        
        #convert positive as 1 and negative as 0
        stock_sentiment_data[stock_sentiment_data<=-0.5] = -1
        stock_sentiment_data[stock_sentiment_data>=0.5] = 1
        stock_sentiment_data[(stock_sentiment_data>-0.5) & (stock_sentiment_data<0.5)] = 0
        
        #convert back pct_change 
        stock_sentiment_data['pct_change'] = pct_change
        
        evaluate_data = evaluate_match(stock_sentiment_data, quote)
        all_evaluate_data = all_evaluate_data.append(evaluate_data, ignore_index=True)

In [14]:
all_evaluate_data.to_csv(r"Data/sentiment_result/evaluate_sentiment_with_filter.csv", sep="\t")

In [15]:
all_evaluate_data

Unnamed: 0,quote,category,accuracy
0,0097,pct_change,1.000000
1,0097,Axiata,0.666667
2,0097,Bank Negara Malaysia,0.518519
3,0097,Bumi Armada,0.647059
4,0097,CIMB,0.500000
...,...,...,...
3758,^KLSE;1=9,UOA,0.727273
3759,^KLSE;1=9,Utilities,0.368421
3760,^KLSE;1=9,Vitrox,0.555556
3761,^KLSE;1=9,YTL,0.400000
