In [1]:
import pandas as pd
import numpy as np
import re
import os

from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import metrics
from gensim.models import Word2Vec

#### load data: 1. lists with positive and negative words; 2. economic news dataset

In [2]:
# Show the working directory: the relative paths used below
# ('../Sentiment Analysis', 'model') are resolved against it.
os.getcwd()

'/Users/apple/Desktop/Python/Text Mining/Word Embedding'

In [3]:
# 1. sentiment lexicon: collect the words flagged Positive == 1 and Negative == 1
word_dict = pd.read_csv(
    os.path.join('../Sentiment Analysis', 'pos_neg_list.csv')
)
pos_list = word_dict.loc[word_dict['Positive'] == 1, 'Word'].tolist()
neg_list = word_dict.loc[word_dict['Negative'] == 1, 'Word'].tolist()

# negation cues looked up by the sentiment scorer
negation_list = [
    'not', 'no', 'nobody', 'none', 'never', 'neither', 'cannot',
]

# 2. economic news dataset, keeping only the sentence / sentiment / polarity columns
full_df = pd.read_csv(
    os.path.join('../Sentiment Analysis', "economic_sentiment_data.csv")
)[['sentence', 'sentiment', 'polarity']]

#### clean and tokenize sentences

In [4]:
def clean_text(raw):
    '''Clean a paragraph and break it into sentences.

    The cleaning steps run in this order:

    1. ``</br>`` tags become full stops.
    2. Runs of full stops collapse to a single one.
    3. Numbers (optionally signed decimals, and integers) are deleted.
    4. Any remaining digit is deleted.
    5. ``%`` and ``-`` characters are deleted.

    Parameters
    ----------
    raw : str
        The paragraph text.

    Returns
    -------
    The result of ``sent_tokenize`` applied to the cleaned text
    (one entry per detected sentence).
    '''
    raw = re.sub(r"</br>", ".", raw)
    raw = re.sub(r"[.]+", ".", raw)
    raw = re.sub(r"[-+]?\d*\.\d+|\d+", "", raw)
    # Raw string on purpose: "\d" in a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    raw = re.sub(r"\d", "", raw)
    raw = re.sub(r'[%-]', "", raw)
    return sent_tokenize(raw)

In [5]:
# Clean every news paragraph; each entry becomes that paragraph's list of sentences.
paragraphs = [clean_text(text) for text in full_df.sentence.tolist()]

# Flatten the per-paragraph lists into one flat list of sentences.
sentences = []
for para in paragraphs:
    sentences.extend(para)

In [6]:
def sentence_to_wordlist(raw):
    '''Tokenize one sentence into words.

    Every character that is not an ASCII letter is replaced by a space,
    and the result is split on whitespace, so the returned list holds the
    maximal runs of letters in their original order and case.
    '''
    return re.sub("[^a-zA-Z]", " ", raw).split()

# Replace each sentence string with its list of words.
# The input must still be strings: sentence_to_wordlist runs a regex over each entry.
sentences = [sentence_to_wordlist(sent) for sent in sentences]

#### apply pre-trained word2vec to the positive and negative word lists to find most similar words

In [7]:
# 1. (disabled) download the pre-trained word2vec archive
# NOTE(review): `data_util` is not imported in the cells shown, and the extract
# path below ('./data') differs from the 'model' directory read in step 2 —
# confirm both before re-enabling these lines.
# ##specify download path and extract path 
# download_path = "imf_w2v.zip"
# download_link = "https://www.dropbox.com/sh/6um97x52kweebfx/AACSxB0E9weItCbyQwUqvuWRa?dl=1"
# extract_path = './data'
# data_util.download_data(download_path,download_link,extract_path)

# 2. load the pre-trained imf w2v model from the relative path model/imf_160.w2v
data_path = os.path.join('model','imf_160.w2v')
imf_w2v = Word2Vec.load(data_path)

#### generate new negative list

In [8]:
# Number of nearest neighbours requested per seed word. Passed explicitly to
# most_similar so the table below never depends on the library default.
N_NEIGHBOURS = 10

# filter out those not in word2vec vocab
print('Original negative list length: {}'.format(len(neg_list)))
# `word in imf_w2v.wv` is the version-agnostic membership test; the `.vocab`
# attribute used previously no longer exists in gensim 4.
neg_list_org = [x for x in neg_list if x in imf_w2v.wv]
print('Thse in the word2vec vocab: {}'.format(len(neg_list_org)))
neg_list_df = pd.DataFrame(neg_list_org, columns=['original_word'])  # useful for merger later

# get augmented list: one (word, similarity) frame per seed word. Each frame is
# tagged with the neighbour's rank and the seed's position, so the bookkeeping
# stays correct even if a seed returns fewer than N_NEIGHBOURS neighbours.
neg_df = pd.concat(
    [
        pd.DataFrame(
            imf_w2v.wv.most_similar(seed, topn=N_NEIGHBOURS),
            columns=['word', 'similarity'],
        ).assign(
            similarity_rank=lambda frame: frame.index,
            original_word_rank=seed_rank,
        )
        for seed_rank, seed in enumerate(neg_list_org)
    ],
    axis=0,
)

# merge with original word (seed position -> row label of neg_list_df)
neg_df = neg_df.merge(neg_list_df, how='outer', left_on='original_word_rank', right_index=True)

Original negative list length: 295
Thse in the word2vec vocab: 288


#### Generate new positive table

In [9]:
# Number of nearest neighbours requested per seed word. Passed explicitly to
# most_similar so the table below never depends on the library default.
N_NEIGHBOURS = 10

print(len(pos_list))
# filter out those not in word2vec vocab; `word in imf_w2v.wv` is the
# version-agnostic membership test (`.vocab` no longer exists in gensim 4)
pos_list_int = [x for x in pos_list if x in imf_w2v.wv]
print(len(pos_list_int))

pos_list_df = pd.DataFrame(pos_list_int, columns=['original_word'])

# For every seed word: its nearest neighbours as [word, similarity] pairs.
pos_list_aug = [
    [[word, sim] for word, sim in imf_w2v.wv.most_similar(seed, topn=N_NEIGHBOURS)]
    for seed in pos_list_int
]

# One frame per seed word, tagged with the neighbour's rank and the seed's
# position, so the bookkeeping stays correct even if a seed returns fewer
# than N_NEIGHBOURS neighbours.
pos_df = pd.concat(
    [
        pd.DataFrame(neighbours, columns=['word', 'similarity']).assign(
            similarity_rank=lambda frame: frame.index,
            original_word_rank=seed_rank,
        )
        for seed_rank, neighbours in enumerate(pos_list_aug)
    ],
    axis=0,
)

# merge with original word (seed position -> row label of pos_list_df)
pos_df = pos_df.merge(pos_list_df, how='outer', left_on='original_word_rank', right_index=True)

96
95


#### filter and save based on: 1. similarity at or above the 75th percentile; 2. new word not in the original word lists (pos & neg)

In [10]:
# Every original lexicon word (positive first, then negative); used to drop
# neighbours that are already in the lexicon.
full_list = [*pos_list, *neg_list]

In [11]:
# Minimum similarity a neighbour must reach to be kept. The values are fixed
# numbers, not recomputed here.
# NOTE(review): per the section heading these are the 75th-percentile
# similarities of each table (e.g. as reported by .describe()) — confirm.
NEG_MIN_SIMILARITY = 0.623453
POS_MIN_SIMILARITY = 0.593445

# Keep sufficiently similar neighbours that are not already lexicon words.
# `.assign` on the filtered frame returns a new frame, which avoids setting a
# column on a slice (SettingWithCopyWarning).
neg_df = neg_df.loc[
    (neg_df.similarity >= NEG_MIN_SIMILARITY) & (~neg_df.word.isin(full_list))
].assign(original_word_label='negative')
print(neg_df.shape)

pos_df = pos_df.loc[
    (pos_df.similarity >= POS_MIN_SIMILARITY) & (~pos_df.word.isin(full_list))
].assign(original_word_label='positive')
print(pos_df.shape)

(348, 6)
(154, 6)


In [18]:
full_df = pd.concat([neg_df , pos_df], axis = 0)

full_df = full_df.groupby('word',as_index= False).agg({'similarity': 'max',
                                            'original_word_rank': 'count',
                                            'original_word': 'first',
                                            'similarity_rank':'mean',
                                            'original_word_label':'first'})

full_df.rename({'similarity': 'max_sim', 'original_word_rank': 'count_in_original_word','similarity_rank':'mean_rank',
               }, inplace = True)

In [19]:
writer = pd.ExcelWriter(path = 'aug_pos_neg_list.xlsx')
        
full_df.to_excel(writer, 'full_df')

writer.save()  

#### calculate sentiment score on the new list (which needs to be manually labeled from the previous results)

In [None]:
def get_sentiment_score(paragraph, pos_list, neg_list, negation_list, window=3):
    '''Return a bag-of-words sentiment score; only positive words are negated.

    The paragraph is tokenized with ``word_tokenize`` and lower-cased. The
    score is ``(n_pos - n_negation - n_neg) / n_total`` where

    * ``n_pos`` / ``n_neg`` are the numbers of tokens found in ``pos_list`` /
      ``neg_list`` (each token counted once, even if a list repeats a word),
    * ``n_negation`` is the number of (positive token, negation token) pairs
      whose positions differ by strictly less than ``window``,
    * ``n_total`` is the number of tokens.

    Parameters
    ----------
    paragraph : str
        Text to score.
    pos_list, neg_list, negation_list : iterable of str
        Lower-case positive, negative and negation words.
    window : int, default 3
        A negation token counts against a positive token when
        ``abs(position difference) < window`` (so at most 2 tokens away for
        the default of 3).

    Returns
    -------
    float
        The sentiment score; ``0.0`` when the paragraph has no tokens.
    '''
    tokens = [tok.lower() for tok in word_tokenize(paragraph)]
    n_total = len(tokens)
    if n_total == 0:
        # Nothing to score; avoids dividing by zero.
        return 0.0

    # Sets give constant-time membership tests and make duplicate entries in
    # the word lists harmless.
    pos_words = set(pos_list)
    neg_words = set(neg_list)
    negation_words = set(negation_list)

    pos_index = np.array(
        [i for i, tok in enumerate(tokens) if tok in pos_words], dtype=int)
    negation_index = np.array(
        [i for i, tok in enumerate(tokens) if tok in negation_words], dtype=int)

    n_pos = len(pos_index)
    n_neg = sum(tok in neg_words for tok in tokens)

    # Broadcasting builds the full positive-by-negation distance table, so
    # every positive token is compared with every negation token exactly once.
    distance = np.abs(pos_index[:, None] - negation_index[None, :])
    n_negation = int(np.sum(distance < window))

    return (n_pos - n_negation - n_neg) / n_total

### Generate prediction and predict_label using the above bag-of-words (BoW) approach

In [None]:
# Score every news sentence with the bag-of-words scorer
# (expects `full_df` to carry a `sentence` column).
full_df['prediciton'] = full_df.sentence.map(
    lambda text: get_sentiment_score(
        paragraph=text,
        pos_list=pos_list,
        neg_list=neg_list,
        negation_list=negation_list,
    )
)

# Binary label: 1 for a strictly positive score, 0 otherwise.
full_df['predict_label'] = full_df.prediciton.gt(0).astype('int64')

### Get accuracy for full sample and sub samples

In [None]:
# Accuracy of the predicted labels against `polarity` on the full sample.
metrics.accuracy_score(
    y_true=full_df['polarity'],
    y_pred=full_df['predict_label'],
)

In [None]:
# Accuracy on the first 3000 rows only.
# NOTE(review): what the 3000-row boundary separates is not visible here — confirm.
metrics.accuracy_score(
    y_true=full_df['polarity'].iloc[:3000],
    y_pred=full_df['predict_label'].iloc[:3000],
)

In [None]:
# Accuracy on the rows from position 3000 onwards.
# NOTE(review): what the 3000-row boundary separates is not visible here — confirm.
metrics.accuracy_score(
    y_true=full_df['polarity'].iloc[3000:],
    y_pred=full_df['predict_label'].iloc[3000:],
)

### Several Examples

In [None]:
# Five example paragraphs that the next cell scores with get_sentiment_score.
# The text is kept exactly as entered, including line breaks inside each paragraph.
# NOTE(review): the last example reads as if two passages were interleaved
# ("...would add to the / Faster reform progress could...") — confirm against the source document.
pred_sentences = [
  '''While the RMB in 2017 was broadly in line with
economic fundamentals and desirable policies, the current account surplus was moderately
stronger. This reflects structural distortions and policies that cause excessive savings, such as low
social spending. Addressing these distortions and the resulting external imbalance would benefit
both China and the global economy.''',
  '''Favorable domestic and external conditions reduced capital outflows and exchange
rate pressure. The RMB was broadly stable against the basket published by the China Foreign
Exchange Trade System (CFETS) in 2017, but with more fluctuation versus the dollar, and it has
appreciated by about 2 percent in real effective terms in the first half of 2018. The current account
surplus continued to decline but, reflecting distortions and policy gaps that encourage excessive
savings, the external position for 2017 is assessed as moderately stronger than the level consistent
with medium-term fundamentals and desirable policies, with the exchange rate broadly in line
(Appendix I).''',
    '''Large outflows and pressure on
the exchange rate could resume due to tighter
and more volatile global financial conditions,
especially a surging dollar. Investor sentiment
towards emerging markets has recently
weakened, and this could intensify, potentially
spreading to China.''',
  '''. Uncoordinated financial and local government regulatory action could have
unintended consequences that trigger disorderly repricing of corporate/LGFV credit risks, losses
for investors, and rollover risks for financial institutions''',
  '''But a lack of decisive reforms in deleveraging and rebalancing would add to the
Faster reform progress could pave the way for higher and
more sustainable GDP growth, already-high stock of vulnerabilities and worsen resource allocation, leading to more rapidly
diminishing returns over the medium term. This scenario also raises the probability of a disruptive
adjustment to Chinese demand which would result in a contractionary impulse to the global
economy, as well as spillovers through commodity prices and financial markets. '''
]

In [None]:
# Score each example paragraph with the original word lists.
list(map(
    lambda text: get_sentiment_score(
        text,
        pos_list=pos_list,
        neg_list=neg_list,
        negation_list=negation_list,
    ),
    pred_sentences,
))