## 1. Preparation

In [59]:
# load the packages 
import pandas as pd
import numpy as np
from scipy import spatial
import datetime

import beautifultools as bt
import qgrid
from pandas.core.common import flatten
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import scipy.stats
import spacy

from collections import Counter
import random
random.seed(3)

from sklearn.preprocessing import normalize
from RandomWalk import random_walk
import re
import string
import nltk
nltk.data.path.append('/home/ec2-user/SageMaker/nltk_data/')

In [2]:
# import the dataset
wsj = pd.read_csv('wsj_full1.csv') # wsj dataset
sp100 = pd.read_csv('..//data/LogReturnData.csv')

# select the relevant topics
tp_li = [0, 2, 3, 8, 9, 14, 16, 17, 19, 20, 21, 24]
wsj_selected = wsj[wsj['Topic_Num'].isin(tp_li)] 

# only the log returns of S&P100 is selected
oex = sp100[['Date', '^OEX']]



In [3]:
# label the return with positive & negative, 1 refers to positive log return, 0 refers to negative log return
oex['direction'] = 1
oex.loc[oex[oex['^OEX'] < 0].index, 'direction'] = -1

# drop NaN value
oex = oex.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [4]:
wsj1 = wsj_selected.copy() # make a copy of wsj_selected

# select relevant columns, polarity calculated with Mcdonald dict for future comparison
wsj1 = wsj1[['Title', 'Text', 'Date']]

# convert the date to datetime 
wsj1['Date'] = wsj1['Date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").date())
oex['Date'] = oex['Date'].apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").date())


## 2. Text Preparation

In [5]:
# load stopping words
sp = spacy.load('en_core_web_sm')
all_stopwords = sp.Defaults.stop_words # list of stop words

# remove 'up', 'down' from stop words
all_stopwords.remove('up')
all_stopwords.remove('down')

## Change Numbers info for placeholder keep signs 
txt_new = []
reg = re.compile(r"([\\+\\-])?[0-9]+[0-9\\.]*")
for lines in wsj1["Text"].values:
    txt_new.append(reg.sub(" \\1NUM", lines))

## Define punctuation to replace (Exclude +, -, and %)
new_punct = string.punctuation + "“”’"
for symb in ["%", "+", "-", "&"]:
    new_punct = new_punct.replace(symb, "")

## String list
txt_corp = []
for doc in txt_new:
    ## Change everything to lowercase and exclude string that are only punctuations and stop words
    aux = [elem.lower() for elem in doc.split() if elem not in set(new_punct)]
    nstop = [wo for wo in aux if wo not in all_stopwords]
    txt_corp.append(nstop)

## Remove strings that only have punctuation signs
exclude = [""]
txt_end = []
for doc in txt_corp:
    new_list = [elem.translate(str.maketrans('', '', new_punct)) for elem in doc]
    txt_end.append([elem for elem in new_list if elem not in exclude])
    

In [6]:
wsj1['corpus'] = txt_end
wsj1.head()

Unnamed: 0,Title,Text,Date,corpus
1,Rate for 30-Year Mortgage Falls to Lowest on R...,"\nIn a year of financial firsts, this one stan...",2020-07-17,"[year, financial, firsts, stands, out, mortgag..."
5,Dollar's Surge Is Hurdle for Shares,\nInvestors worried about the impact of the co...,2020-02-10,"[investors, worried, impact, coronavirus, worl..."
6,Banking &amp; Finance: Clayton Dubilier Fund H...,\nClayton Dubilier & Rice could collect about ...,2020-10-22,"[clayton, dubilier, &, rice, collect, num, bil..."
7,S&amp;P Edges Higher to Record Close,\nThe S&P 500 and Nasdaq Composite notched rec...,2020-08-26,"[s&p, num, nasdaq, composite, notched, records..."
9,Campbell And Loeb Are Close To a Deal,Campbell Soup Co. is nearing a settlement with...,2018-11-26,"[campbell, soup, co, nearing, settlement, inve..."


In [7]:
# label article with direction of sp100
wsj1['logDate'] = wsj1['Date'].apply(lambda x: x + datetime.timedelta(days=1))

In [8]:
wsj1.to_csv('cleaned_corpus.csv') # save the cleaned corpus to csv file

In [9]:
df1 = wsj1.set_index('logDate').join(oex.set_index('Date')) # with lag
df2 = wsj1.set_index('Date').join(oex.set_index('Date')) # without lag

# remove NaN value
df1 = df1.dropna()
df2 = df2.dropna()
# reset the index
df1 = df1.reset_index()
df2 = df2.reset_index()

df1 = df1.drop('Date', 1) # drop the date column
df2 = df2.drop('logDate', 1) # drop the date column

In [10]:
# rename the column
df1.columns = ['date', 'Title', 'Text', 'corpus', '^OEX', 'direction']
df2.columns = ['date', 'Title', 'Text', 'corpus', '^OEX', 'direction']

In [11]:
df1.groupby('date')['Title'].count().describe() # number of articles everyday, index column refers to date

count    472.000000
mean      30.023305
std        6.244954
min       13.000000
25%       26.000000
50%       30.000000
75%       34.000000
max       56.000000
Name: Title, dtype: float64

## 3. Predictive Screening to get the seed words
### 3.1 seed words with lag = 1

In [45]:
# split the data into training & testing dataset to avoid data learkage
train_lag = df1.groupby('date').apply(lambda x: x.sample(frac=0.1))
train_ind = [index[1] for index in train_lag.index.tolist()]

In [46]:
df1['data'] = 'test'
df1.loc[train_ind, 'data'] = 'train'

In [47]:
# create a datadframe that contains the positive/negative words
def create_df(i, train, df):
    words = df[(df['direction'] == i) & (df['data'] == train)].corpus.tolist()
    words = sum(words, []) # flattern list of lists
    word_dict = dict(Counter(words)) # word count
    count_df = pd.DataFrame.from_dict(word_dict, orient = 'index') # convert dict to df

    count_df = count_df.reset_index()
    count_df.columns = ['word', 'freq']
    return count_df
    

In [48]:
# for training dataset
pos_word = create_df(1, 'train', df1)
neg_word = create_df(-1, 'train', df1)

neg_word.columns = ['word', 'neg_freq']
# pos_word.columns = ['word', 'neg_freq']
word = pos_word.set_index('word').join(neg_word.set_index('word')) # join pos_word, neg_word dataframe

In [49]:
def filter_df(df, num):
    # replace NaN with 0
    df = df.fillna(0)
    
    # reset index
    df = df.reset_index()

    # select only the word with frequency higher than 50
    df['total_freq'] = df['freq'] + df['neg_freq']
    df = df[df['total_freq'] >= num]
    
    df['pos_prob'] = df['freq']/(df['freq'] + df['neg_freq']) # prob that specific word appear in a positive article
    df['neg_prob'] = 1 - df['pos_prob']
    
    return df

In [50]:
df_prob = filter_df(word, 50).sort_values(by = ['pos_prob'], ascending=False) 
df_prob.head()

Unnamed: 0,word,freq,neg_freq,total_freq,pos_prob,neg_prob
3677,etfs,103,5.0,108.0,0.953704,0.046296
1285,sears,60,3.0,63.0,0.952381,0.047619
1544,etf,79,4.0,83.0,0.951807,0.048193
5918,coffee,51,3.0,54.0,0.944444,0.055556
194,pipeline,51,3.0,54.0,0.944444,0.055556


### Determine the threshold with binomial Confidence interval

In [67]:
################# to be confirmed #################
import statsmodels.stats.proportion as smp

thres = 0.56
pos = df_prob[df_prob['pos_prob'] >= thres]
count = len(pos)
num = len(df_prob)

print('confidence interval of positive seed words: ', smp.proportion_confint (count, num, alpha=0.05, method='wilson'))
print('confidence interval of negative seed words: ', smp.proportion_confint (num - count, num, alpha=0.05, method='wilson'))

confidence interval of positive seed words:  (0.5473500022088275, 0.5924352982371425)
confidence interval of negative seed words:  (0.4075647017628575, 0.4526499977911725)


In [68]:
################## to be confirmed ###############
df_prob['polar'] = 'positive'
df_prob.loc[df_prob[df_prob['pos_prob'] < 0.56].index, 'polar'] = 'negative'

In [69]:
df_prob.to_csv('seed_words_lag.csv')

In [70]:
df_prob.head()

Unnamed: 0,word,freq,neg_freq,total_freq,pos_prob,neg_prob,polar
3677,etfs,103,5.0,108.0,0.953704,0.046296,positive
1285,sears,60,3.0,63.0,0.952381,0.047619,positive
1544,etf,79,4.0,83.0,0.951807,0.048193,positive
5918,coffee,51,3.0,54.0,0.944444,0.055556,positive
194,pipeline,51,3.0,54.0,0.944444,0.055556,positive


### 3.2 seed words without lag

In [52]:
train = df2.groupby('date').apply(lambda x: x.sample(frac=0.1))
train_ind = [index[1] for index in train_lag.index.tolist()]

df2['data'] = 'test'
df2.loc[train_ind, 'data'] = 'train'

# for training dataset
pos_word = create_df(1, 'train', df2)
neg_word = create_df(-1, 'train', df2)

neg_word.columns = ['word', 'neg_freq']
# pos_word.columns = ['word', 'neg_freq']
word = pos_word.set_index('word').join(neg_word.set_index('word')) # join pos_word, neg_word dataframe

# word

Unnamed: 0_level_0,freq,neg_freq
word,Unnamed: 1_level_1,Unnamed: 2_level_1
detroit,17,12.0
--,1385,1063.0
new,1314,1031.0
trade,592,430.0
agreement,156,84.0
...,...,...
income-based,1,
straits,1,
hot-spots,1,
face-coverings,1,


In [53]:
df_wolag = filter_df(word, 50).sort_values(by = ['pos_prob'], ascending=False)

In [54]:
df_wolag.head()

Unnamed: 0,word,freq,neg_freq,total_freq,pos_prob,neg_prob
17308,cocoa,78,1.0,79.0,0.987342,0.012658
3866,measles,81,13.0,94.0,0.861702,0.138298
1043,mcdonalds,59,12.0,71.0,0.830986,0.169014
2608,children,113,27.0,140.0,0.807143,0.192857
4113,senate,45,11.0,56.0,0.803571,0.196429


In [73]:
########### to be confirmed #############
import statsmodels.stats.proportion as smp

thres = 0.555
pos = df_wolag[df_wolag['pos_prob'] >= thres]
count = len(pos)
num = len(df_prob)

print('confidence interval of positive seed words: ', smp.proportion_confint (count, num, alpha=0.05, method='wilson'))
print('confidence interval of negative seed words: ', smp.proportion_confint (num - count, num, alpha=0.05, method='wilson'))

confidence interval of positive seed words:  (0.5337788246494162, 0.5790208965598802)
confidence interval of negative seed words:  (0.42097910344011996, 0.4662211753505839)


In [74]:
########### to be confirmed #############
df_wolag['polar'] = 'positive'
df_wolag.loc[df_wolag[df_wolag['pos_prob'] < 0.555].index, 'polar'] = 'negative'

In [75]:
df_wolag.to_csv('wsj_seed_word.csv')

## 4. Embedding

two possible ways to reduce the dimension of the embeddings before sentprop:
1. PCA https://towardsdatascience.com/dimension-reduction-techniques-with-python-f36ca7009e5c
2. t-SNE https://arxiv.org/abs/1708.03629; https://github.com/vyraun/Half-Size

In [77]:
# import the packages
import gensim.downloader as api
import tempfile
from gensim import corpora
from gensim.test.utils import datapath
from gensim import utils
from gensim.models import Word2Vec
import string
import json
from nltk.stem import WordNetLemmatizer

In [78]:
# text preparation
cleaned_cors = pd.read_csv('cleaned_corpus.csv') # import the cleaned dataframe

## Change Numbers info for placeholder keep signs 
txt_new = []
reg = re.compile(r"([\\+\\-])?[0-9]+[0-9\\.]*")
for lines in cleaned_cors["Text"].values:
    txt_new.append(reg.sub(" \\1NUM", lines))

## Define punctuation to replace (Exclude +, -, and %)
new_punct = string.punctuation + "“”’"
for symb in ["%", "+", "-", "&"]:
    new_punct = new_punct.replace(symb, "")

## String list
txt_corp = []
for doc in txt_new:
    ## Change everything to lowercase and exclude string that are only punctuations
    aux = [elem.lower() for elem in doc.split() if elem not in set(new_punct)]
    txt_corp.append(aux)

## Remove strings that only have punctuation signs
exclude = [""]
txt_end = []
for doc in txt_corp:
    new_list = [elem.translate(str.maketrans('', '', new_punct)) for elem in doc]
    txt_end.append([elem for elem in new_list if elem not in exclude])

In [79]:
dicts = corpora.Dictionary(txt_end)

## Define function to get embeddings to memory
def get_wv(model, dicts):
    """ Get word embeddings in memory"""
    w2v_embed = {}
    missing = []
    for val in dicts.values():
        try: 
            it = model.wv[val]
        except:
            missing.append(val)
            it = None 
        w2v_embed[val] = it
    return w2v_embed, missing

In [80]:
print('number of unique words: ', len(dicts))

dicts.filter_extremes(no_below=20, no_above=0.8, keep_n=None, keep_tokens=None)
print('number of unique words after fitlering: ', len(dicts))

number of unique words:  116106
number of unique words after fitlering:  17429


### 4.1 pre-trained word embedding

In [81]:
path = 'GoogleNews-vectors-negative300.bin'

model = Word2Vec(txt_corp, size = 300, min_count = 25)
model.intersect_word2vec_format(path,
                                lockf=1.0,
                                binary=True)

model.train(txt_corp, total_examples=model.corpus_count, epochs=25)
w2v_embed, mis  = get_wv(model, dicts)

In [82]:
embeds_1df = pd.DataFrame(w2v_embed)

In [83]:
embeds_1df.to_csv('pre_embedding.csv')


### 4.2 Self-trained embedding

In [84]:
model_t = Word2Vec(txt_corp, window=5, min_count=25, workers=4, size = 50)
model_t.train(txt_corp, epochs=50, total_words = model_t.corpus_total_words,
              total_examples = model_t.corpus_count)
embeds_2 = get_wv(model_t, dicts)
a, b = embeds_2

In [85]:
embeds_2df = pd.DataFrame(a)

In [86]:
# save the embedding to csv
embeds_2df.to_csv('self_embedding.csv')