In [74]:
import pandas as pd
import numpy as np
import datetime as dt
from datetime import date
import pandas_datareader as web
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings("ignore")
# Generate Word Cloud image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [75]:
# Load the per-article sentiment table (one row per news item with its
# category, publisher, cleaned content and sentiment_score).
sentiment_news = pd.read_csv(r"Data/sentiment_result/sentiment.csv", sep=",")

In [76]:
# Preview the first rows of the raw table as loaded.
sentiment_news.head()

Unnamed: 0,Column1,date,category,publisher,content,sentiment_score
0,3386,1-Jul-19,Malaysia Government Policy,The Malaysian Reserve,malaysia economy grow 46 percent 2019 world re...,0.902186
1,3854,1-Jul-19,Nestle,USA Today,nestle toll house selling edible cookie wait n...,0.040744
2,3913,1-Jul-19,IOI,Yonhap News,girl band ioi get back together new album oct ...,0.031064
3,3930,1-Jul-19,IOI,Bandwagon Asia,k pop girl group ioi reunite october chung ha ...,0.113659
4,3942,1-Jul-19,IOI,CNN Indonesia,9 orang personel ioi reuni pada oktober 2019 i...,0.048623


In [77]:
# Inspect column dtypes; `date` is still a plain string (object) at this point.
sentiment_news.dtypes

Column1              int64
date                object
category            object
publisher           object
content             object
sentiment_score    float64
dtype: object

In [78]:
# Parse the day-month-year strings (e.g. "1-Jul-19") into pandas datetimes so
# later cells can group and sort by date.
sentiment_news["date"] = pd.to_datetime(sentiment_news["date"], format="%d-%b-%y")

In [79]:
# Confirm that `date` now displays as parsed dates.
sentiment_news.head()

Unnamed: 0,Column1,date,category,publisher,content,sentiment_score
0,3386,2019-07-01,Malaysia Government Policy,The Malaysian Reserve,malaysia economy grow 46 percent 2019 world re...,0.902186
1,3854,2019-07-01,Nestle,USA Today,nestle toll house selling edible cookie wait n...,0.040744
2,3913,2019-07-01,IOI,Yonhap News,girl band ioi get back together new album oct ...,0.031064
3,3930,2019-07-01,IOI,Bandwagon Asia,k pop girl group ioi reunite october chung ha ...,0.113659
4,3942,2019-07-01,IOI,CNN Indonesia,9 orang personel ioi reuni pada oktober 2019 i...,0.048623


In [6]:
def sentiment_groupby_day(data):
    """Average the sentiment score per day and attach a one-row lag.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least a ``date`` column and a numeric
        ``sentiment_score`` column. It is only read, never modified, so the
        cell that calls this function can be re-run safely.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` in ascending order, with two columns:
        ``sentiment_score`` (mean score of all rows on that date) and
        ``sentiment_score_1`` (the ``sentiment_score`` of the preceding row,
        i.e. the previous date present in the data; NaN for the first date).

    Raises
    ------
    KeyError
        If ``date`` or ``sentiment_score`` is missing from ``data``.
    """
    daily = (
        data.groupby("date")["sentiment_score"]
        .mean()
        # Order chronologically *before* shifting so the lag always refers to
        # the previous date, independent of groupby's sort setting.
        .sort_index()
        .to_frame()
    )
    # Shift the named Series rather than the whole frame: assigning a
    # DataFrame to a single column only works while it has exactly one column.
    daily["sentiment_score_1"] = daily["sentiment_score"].shift(1)
    return daily

In [7]:
# Collapse all articles, regardless of category, into one mean sentiment score
# per date plus the score of the previous date present in the data.
data = sentiment_groupby_day(sentiment_news)

In [8]:
# Preview the daily means; the first lagged value is empty because there is no
# earlier date to shift from.
data.head()

Unnamed: 0_level_0,sentiment_score,sentiment_score_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-07-01,0.149586,
2019-07-02,0.1018,0.149586
2019-07-03,0.260722,0.1018
2019-07-04,0.021849,0.260722
2019-07-05,0.104767,0.021849


In [9]:
# Persist the daily scores. NOTE: the file is tab-separated despite its .csv
# extension, so it has to be read back with sep="\t".
data.to_csv(r"Data/sentiment_result/sentiment_lagged.csv", sep="\t")

In [85]:
# (source `category` values, label written to the output) pairs, listed in the
# order in which their rows appear in the combined result. A label can collect
# several source spellings (e.g. "Dialog" and "Dialog Malaysia").
# NOTE(review): rows tagged 'MY EG' are emitted under the label "My EG"; the
# differing capitalisation is kept as-is so existing outputs do not change.
_NEWS_CATEGORIES = [
    (("Axiata",), "Axiata"),
    (("Bank Negara Malaysia",), "Bank Negara Malaysia"),
    (("Bumi Armada",), "Bumi Armada"),
    (("CIMB",), "CIMB"),
    (("Construction",), "Construction"),
    (("Consumer",), "Consumer"),
    (("Covid-19 Malaysia",), "Covid-19 Malaysia"),
    (("Dialog", "Dialog Malaysia"), "Dialog"),
    (("Digi",), "Digi"),
    (("Econpile",), "Econpile"),
    (("Ecoworld",), "Ecoworld"),
    (("Ekovest",), "Ekovest"),
    (("Energy",), "Energy"),
    (("Finance",), "Finance"),
    (("Fraser and Neave",), "Fraser and Neave"),
    (("Gadang",), "Gadang"),
    (("Genting",), "Genting"),
    (("Greentech",), "Greentech"),
    (("Hap Seng",), "Hap Seng"),
    (("Hartalega",), "Hartalega"),
    (("Healthcare",), "Healthcare"),
    (("Hong Leong",), "Hong Leong"),
    (("IGB",), "IGB"),
    (("IHH",), "IHH"),
    (("Inari",), "Inari"),
    (("Industrial Product",), "Industrial Product"),
    (("IOI",), "IOI"),
    (("KLCC",), "KLCC"),
    (("KLCI",), "KLCI"),
    (("Kossan",), "Kossan"),
    (("KPJ",), "KPJ"),
    (("Malakoff",), "Malakoff"),
    (("Malaysia Airport",), "Malaysia Airport"),
    (("Malaysia Economy",), "Malaysia Economy"),
    (("Malaysia Employment",), "Malaysia Employment"),
    (("Malaysia Government Policy",), "Malaysia Government Policy"),
    (("Malaysia Loan",), "Malaysia Loan"),
    (("Malaysia MCO",), "Malaysia MCO"),
    (("Malaysia Moratorium",), "Malaysia Moratorium"),
    (("Malaysia Recover Plan",), "Malaysia Recover Plan"),
    (("Malaysia Stimulus",), "Malaysia Stimulus"),
    (("Matrix",), "Matrix"),
    (("Maxis",), "Maxis"),
    (("Maybank",), "Maybank"),
    (("MISC",), "MISC"),
    (("MY EG",), "My EG"),
    (("Nestle",), "Nestle"),
    (("OSK",), "OSK"),
    (("Pavilion",), "Pavilion"),
    (("Petronas",), "Petronas"),
    (("Plantation",), "Plantation"),
    (("PPB Group",), "PPB Group"),
    (("Property",), "Property"),
    (("Public Bank",), "Public Bank"),
    (("QL Resources",), "QL Resources"),
    (("REIT",), "REIT"),
    (("RHB",), "RHB"),
    (("Sarawak Oil Palm",), "Sarawak Oil Palm"),
    (("Sime Darby",), "Sime Darby"),
    (("SP Setia",), "SP Setia"),
    (("Sunway",), "Sunway"),
    (("Technology",), "Technology"),
    (("Telecommunication",), "Telecommunication"),
    (("Telekom Malaysia", "TM"), "Telekom Malaysia"),
    (("Tenaga",), "Tenaga"),
    (("Topglove",), "Topglove"),
    (("Transportation",), "Transportation"),
    (("UOA",), "UOA"),
    (("Utilities",), "Utilities"),
    (("Vitrox",), "Vitrox"),
    (("Yinson",), "Yinson"),
    (("YTL",), "YTL"),
]


def filter_news_sentiment(start, end, data):
    """Build per-category daily sentiment scores and stack them into one table.

    For every entry of ``_NEWS_CATEGORIES`` the matching rows of ``data`` are
    passed to ``split_news_category`` and the per-category results are
    concatenated in table order.

    Parameters
    ----------
    start, end : datetime-like
        Accepted for interface compatibility but not used.
        NOTE(review): no date-window filtering has ever been applied here;
        confirm whether it was intended before relying on these arguments.
    data : pandas.DataFrame
        News table with ``date``, ``category``, ``sentiment_score`` and the
        other columns ``split_news_category`` expects. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The concatenated frames returned by ``split_news_category``, with a
        fresh 0..n-1 index. ``date`` holds ``datetime.date`` objects.

    Raises
    ------
    KeyError
        If ``date`` or ``category`` is missing from ``data``.
    """
    # Normalise timestamps to plain calendar dates on a new frame, so the
    # caller's `date` column keeps its dtype and the cell can be re-run.
    news = data.assign(
        date=pd.to_datetime(data["date"], unit="ns", utc=True).dt.floor("D").dt.date
    )

    # `isin` gives OR semantics across alternative spellings. Combining two
    # equality tests on the same column with `&` can never match a row.
    frames = [
        split_news_category(
            news[news["category"].isin(sources)].reset_index(drop=True), label
        )
        for sources, label in _NEWS_CATEGORIES
    ]

    # Leave empty categories out of the concat so they cannot influence the
    # resulting dtypes; fall back to all frames when nothing matched at all so
    # the caller still gets an empty table with the usual columns.
    non_empty = [frame for frame in frames if not frame.empty]
    # DataFrame.append was removed in pandas 2.0; a single concat replaces the
    # chain of appends and keeps the same row order.
    return pd.concat(non_empty or frames, ignore_index=True)

In [86]:
def split_news_category(data, category):
    """Average one category's sentiment per day and attach a one-row lag.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to a single news category. Only the ``date`` and
        ``sentiment_score`` columns are read; any other columns (such as
        ``content``) are ignored and need not be present. Not modified.
    category : str
        Label written to the ``category`` column of every output row.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``sentiment_score`` (mean score per date),
        ``sentiment_score_1`` (score of the previous date present for this
        category; NaN on the first row) and ``category``. Rows are ordered by
        date with a fresh 0..n-1 index. An empty input yields an empty frame
        with the same columns.

    Raises
    ------
    KeyError
        If ``date`` or ``sentiment_score`` is missing from ``data``.
    """
    daily = (
        data.groupby("date")["sentiment_score"]
        .mean()
        # Chronological order must be fixed before the lag is computed.
        .sort_index()
        .to_frame()
    )
    # Shift the named Series; assigning a whole DataFrame to one column only
    # works while that frame happens to have a single column.
    daily["sentiment_score_1"] = daily["sentiment_score"].shift(1)
    daily["category"] = category
    return daily.reset_index()

In [87]:
# Date window handed to filter_news_sentiment as its first two arguments.
# NOTE(review): the preview of the result below still starts in 2019-07, so
# this window does not appear to restrict the rows — confirm whether date
# filtering was intended.
start = dt.datetime(2020,1,1)
end = dt.datetime(2020,12,31)
# Rebinds `data`: from here on it holds the per-category table, not the
# all-news daily table computed earlier.
data = filter_news_sentiment(start, end, sentiment_news)

In [88]:
# Preview the per-category daily scores; the first row of a category has no
# lagged value.
data.head()

Unnamed: 0,date,sentiment_score,sentiment_score_1,category
0,2019-07-02,0.022497,,Axiata
1,2019-07-04,0.048938,0.022497,Axiata
2,2019-07-08,0.423183,0.048938,Axiata
3,2019-08-06,0.020717,0.423183,Axiata
4,2019-08-09,0.04724,0.020717,Axiata


In [89]:
# Persist the per-category scores. NOTE: the file is tab-separated despite its
# .csv extension, so it has to be read back with sep="\t".
data.to_csv(r"Data/sentiment_result/sentiment_lagged_with_category.csv", sep="\t")