In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import metrics
from gensim.models import Word2Vec
import re
import os

#### Load Data

In [2]:
# Lexicon CSV: rows whose 'Positive' / 'Negative' column equals 1 supply the two word lists.
word_dict = pd.read_csv('pos_neg_list.csv')
is_positive = word_dict['Positive'] == 1
is_negative = word_dict['Negative'] == 1
pos_list = word_dict.loc[is_positive, 'Word'].tolist()
neg_list = word_dict.loc[is_negative, 'Word'].tolist()

# Negation cues used later when scoring positive words.
negation_list = ['not', 'no', 'nobody', 'none', 'never', 'neither', 'cannot']

# Labelled data set; only the three columns used by this notebook are kept.
full_df = pd.read_csv("economic_sentiment_data.csv")[['sentence', 'sentiment', 'polarity']]

#### Apply word2vec to the sentences

In [3]:
def clean_text(raw):
    """Strip markup leftovers, numbers and a few symbols from a raw paragraph.

    The substitutions run in this order:

    1. every literal ``</br>`` becomes a full stop;
    2. runs of full stops collapse into a single full stop;
    3. numbers are deleted (decimals with an optional sign, or any run of digits);
    4. ``%`` and ``-`` characters are deleted.

    Parameters
    ----------
    raw : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched, so removing a number
        can leave two consecutive spaces behind.
    """
    raw = re.sub(r"</br>", ".", raw)
    raw = re.sub(r"[.]+", ".", raw)
    # The `\d+` alternative consumes every remaining digit run, so no separate
    # digit-stripping pass is needed afterwards (the former "\d" pass was a no-op
    # and used an invalid escape sequence in a non-raw string).
    raw = re.sub(r"[-+]?\d*\.\d+|\d+", "", raw)
    # A hyphen placed last in the character class is a literal hyphen.
    raw = re.sub(r"[%-]", "", raw)
    return raw

In [4]:
# Clean every entry of the 'sentence' column, then split it into sentences with NLTK.
paragraphs = [sent_tokenize(clean_text(text)) for text in full_df['sentence'].tolist()]

# Flatten the per-paragraph sentence lists into one list of sentences.
sentences = []
for para in paragraphs:
    sentences.extend(para)

In [5]:
def sentence_to_wordlist(raw):
    """Split a sentence into words made only of ASCII letters.

    Every character outside a-z / A-Z is turned into a space before splitting,
    so digits and punctuation act as separators and never appear in the result.
    """
    return re.sub("[^a-zA-Z]", " ", raw).split()

# Replace each sentence string by its list of words.
sentences = [sentence_to_wordlist(sent) for sent in sentences]

In [6]:
# Load the pre-trained IMF word2vec model.
# A raw string keeps the Windows backslashes literal: in a normal string,
# sequences such as "\M" or "\w" are invalid escapes that newer Python versions
# warn about. The resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider a configurable data directory.
data_path = os.path.join(r'U:\My Documents\Python\Text Mining\Data\w2v', 'imf_160.w2v')
imf_w2v = Word2Vec.load(data_path)

#### Generate new negative table

In [16]:
print(len(neg_list))
# Keep only the lexicon words known to the word2vec model. `word in model.wv`
# works on gensim 3.x and 4.x, whereas `model.wv.vocab` no longer exists in 4.x.
neg_list_int = [x for x in neg_list if x in imf_w2v.wv]
print(len(neg_list_int))

neg_list_table = pd.DataFrame(neg_list_int, columns=['original_word'])

# For every seed word: the model's most similar words as [word, score] pairs.
# NOTE(review): gensim's most_similar reports a similarity score (higher = closer),
# so the 'distance' column below is really a similarity; name kept for compatibility.
neg_list_aug = [
    [[word, score] for word, score in imf_w2v.wv.most_similar(seed)]
    for seed in neg_list_int
]

# One small frame per seed word, stacked; the repeating index is the neighbour's rank.
neg_table = pd.concat([pd.DataFrame(neighbours) for neighbours in neg_list_aug], axis=0)

neg_table['distance_rank'] = neg_table.index

neg_table = neg_table.rename(columns={0: 'word', 1: 'distance'})

# Position of the seed word for every neighbour row. Sized from the actual number
# of neighbours returned per seed rather than a hard-coded 10.
neg_table['original_word_rank'] = np.repeat(
    range(len(neg_list_aug)), [len(neighbours) for neighbours in neg_list_aug]
)

# Attach the seed word itself by matching its position to neg_list_table's index.
neg_table = neg_table.merge(neg_list_table, how='outer', left_on='original_word_rank', right_index=True)

295
288


#### Generate new positive table

In [18]:
print(len(pos_list))
# Keep only the lexicon words known to the word2vec model. `word in model.wv`
# works on gensim 3.x and 4.x, whereas `model.wv.vocab` no longer exists in 4.x.
pos_list_int = [x for x in pos_list if x in imf_w2v.wv]
print(len(pos_list_int))

pos_list_table = pd.DataFrame(pos_list_int, columns=['original_word'])

# For every seed word: the model's most similar words as [word, score] pairs.
# NOTE(review): gensim's most_similar reports a similarity score (higher = closer),
# so the 'distance' column below is really a similarity; name kept for compatibility.
pos_list_aug = [
    [[word, score] for word, score in imf_w2v.wv.most_similar(seed)]
    for seed in pos_list_int
]

# One small frame per seed word, stacked; the repeating index is the neighbour's rank.
pos_table = pd.concat([pd.DataFrame(neighbours) for neighbours in pos_list_aug], axis=0)

pos_table['distance_rank'] = pos_table.index

pos_table = pos_table.rename(columns={0: 'word', 1: 'distance'})

# Position of the seed word for every neighbour row. Sized from the actual number
# of neighbours returned per seed rather than a hard-coded 10.
pos_table['original_word_rank'] = np.repeat(
    range(len(pos_list_aug)), [len(neighbours) for neighbours in pos_list_aug]
)

# Attach the seed word itself by matching its position to pos_list_table's index.
pos_table = pos_table.merge(pos_list_table, how='outer', left_on='original_word_rank', right_index=True)

96
95


In [19]:
# Write both augmented lexicons into one workbook, one sheet each.
# The context manager saves and closes the file on exit; it replaces the explicit
# writer.save() call, which newer pandas releases no longer provide.
with pd.ExcelWriter('aug_pos_neg_list.xlsx') as writer:
    pos_table.to_excel(writer, sheet_name='pos')
    neg_table.to_excel(writer, sheet_name='neg')

In [3]:
def get_sentiment_score(paragraph, pos_list, neg_list, negation_list, window=3, tokenizer=None):
    """Return a bag-of-words sentiment score; only positive words are negated.

    The score is ``(n_pos - n_negation - n_neg) / n_total`` where

    * ``n_pos`` / ``n_neg`` count the lower-cased tokens found in ``pos_list`` /
      ``neg_list``;
    * ``n_negation`` counts the (positive token, negation token) pairs whose
      positions differ by less than ``window``; negative words are never flipped;
    * ``n_total`` is the number of tokens.

    Parameters
    ----------
    paragraph : str
        Text to score.
    pos_list, neg_list, negation_list : iterable of str
        Word lists. Tokens are lower-cased before the lookup, so entries are
        expected to be lower case. Duplicate entries have no effect.
    window : int, default 3
        A negation counts when its token offset from a positive word is
        strictly smaller than this value.
    tokenizer : callable, optional
        Function turning a string into a list of tokens. Defaults to NLTK's
        ``word_tokenize``.

    Returns
    -------
    float
        The sentiment score, or ``0.0`` when the text yields no tokens.
    """
    tokenize = word_tokenize if tokenizer is None else tokenizer
    tokens = [tok.lower() for tok in tokenize(paragraph)]

    n_total = len(tokens)
    if n_total == 0:
        # Nothing to score; avoids a division by zero on empty text.
        return 0.0

    # Sets give constant-time membership tests and make duplicated lexicon
    # entries harmless (each token is counted once).
    pos_words = set(pos_list)
    neg_words = set(neg_list)
    negation_words = set(negation_list)

    pos_index = np.array([i for i, tok in enumerate(tokens) if tok in pos_words], dtype=int)
    negation_index = np.array([i for i, tok in enumerate(tokens) if tok in negation_words], dtype=int)

    n_pos = len(pos_index)
    n_neg = sum(1 for tok in tokens if tok in neg_words)

    # Broadcasting builds the full (positive x negation) offset matrix, so every
    # pair is compared exactly once.
    offsets = negation_index[None, :] - pos_index[:, None]
    # NOTE(review): the comparison is strict, as in the original code, so with
    # window=3 only negations 1 or 2 tokens away count. The original comment
    # spoke of "+/-3"; confirm whether an inclusive window was intended.
    n_negation = int(np.sum((offsets > -window) & (offsets < window)))

    return (n_pos - n_negation - n_neg) / n_total

### Generate prediction and predict_label using the above bag-of-words (BoW) approach

In [4]:
# Score every sentence with the lexicon-based scorer; the extra keyword
# arguments are forwarded to get_sentiment_score by Series.apply.
# (The column name 'prediciton' is kept exactly as it is used elsewhere.)
full_df['prediciton'] = full_df['sentence'].apply(
    get_sentiment_score,
    pos_list=pos_list,
    neg_list=neg_list,
    negation_list=negation_list,
)

# Binary label: 1 for a strictly positive score, otherwise 0.
full_df['predict_label'] = full_df['prediciton'].map(lambda score: int(score > 0))

### Get accuracy for full sample and sub samples

In [5]:
# Accuracy of the lexicon-based labels on the full sample.
metrics.accuracy_score(y_true=full_df['polarity'], y_pred=full_df['predict_label'])

0.6605333333333333

In [6]:
# Accuracy on the first 3000 rows only.
head_sample = full_df.iloc[:3000]
metrics.accuracy_score(head_sample['polarity'], head_sample['predict_label'])

0.663

In [7]:
# Accuracy on the rows from position 3000 onwards.
tail_sample = full_df.iloc[3000:]
metrics.accuracy_score(tail_sample['polarity'], tail_sample['predict_label'])

0.6506666666666666

### Several Examples

In [8]:
# Example paragraphs for spot-checking get_sentiment_score (scored in the next cell).
# The strings are kept verbatim, including line breaks and stray punctuation,
# because the tokenizer sees them exactly as written.
# NOTE(review): the last example reads as if two sentences were interleaved
# during copy/paste; confirm against the source text before relying on its score.
pred_sentences = [
  '''While the RMB in 2017 was broadly in line with
economic fundamentals and desirable policies, the current account surplus was moderately
stronger. This reflects structural distortions and policies that cause excessive savings, such as low
social spending. Addressing these distortions and the resulting external imbalance would benefit
both China and the global economy.''',
  '''Favorable domestic and external conditions reduced capital outflows and exchange
rate pressure. The RMB was broadly stable against the basket published by the China Foreign
Exchange Trade System (CFETS) in 2017, but with more fluctuation versus the dollar, and it has
appreciated by about 2 percent in real effective terms in the first half of 2018. The current account
surplus continued to decline but, reflecting distortions and policy gaps that encourage excessive
savings, the external position for 2017 is assessed as moderately stronger than the level consistent
with medium-term fundamentals and desirable policies, with the exchange rate broadly in line
(Appendix I).''',
    '''Large outflows and pressure on
the exchange rate could resume due to tighter
and more volatile global financial conditions,
especially a surging dollar. Investor sentiment
towards emerging markets has recently
weakened, and this could intensify, potentially
spreading to China.''',
  '''. Uncoordinated financial and local government regulatory action could have
unintended consequences that trigger disorderly repricing of corporate/LGFV credit risks, losses
for investors, and rollover risks for financial institutions''',
  '''But a lack of decisive reforms in deleveraging and rebalancing would add to the
Faster reform progress could pave the way for higher and
more sustainable GDP growth, already-high stock of vulnerabilities and worsen resource allocation, leading to more rapidly
diminishing returns over the medium term. This scenario also raises the probability of a disruptive
adjustment to Chinese demand which would result in a contractionary impulse to the global
economy, as well as spillovers through commodity prices and financial markets. '''
]

In [13]:
# Score each example paragraph with the shared lexicons; the list is the cell's output.
[
    get_sentiment_score(paragraph=text, pos_list=pos_list, neg_list=neg_list, negation_list=negation_list)
    for text in pred_sentences
]

[-0.05172413793103448,
 0.008771929824561403,
 -0.045454545454545456,
 -0.06451612903225806,
 -0.047058823529411764]