## This file uses a pre-trained w2v model to find words similar to those in the positive/negative word lists

In [1]:
import pandas as pd
import numpy as np
import re
import os
import sys
import csv 

from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import metrics
import gensim
from gensim.models import Word2Vec

#### apply pre-trained word2vec to the positive and negative word lists to find most similar words

In [2]:
# 1. (Optional, currently disabled) download the word2vec model archive and extract it.
#    The commented-out lines rely on a `data_util` helper that is not among the
#    imports visible at the top of this file.
#    NOTE(review): the archive would be extracted to './data', but the model below
#    is loaded from the 'model' directory — confirm where the file is expected to live.
# ##specify download path and extract path 
# download_path = "imf_w2v.zip"
# download_link = "https://www.dropbox.com/sh/6um97x52kweebfx/AACSxB0E9weItCbyQwUqvuWRa?dl=1"
# extract_path = './data'
# data_util.download_data(download_path,download_link,extract_path)

# 2. Load the pre-trained IMF word2vec model from model/imf_160.w2v
#    (path is relative to the current working directory).
model_path = os.path.join('model','imf_160.w2v')
imf_w2v = Word2Vec.load(model_path)

#### load data: 1. lists with positive and negative words; 2. economic news dataset

In [3]:
# 1. Sentiment lexicon: collect the words flagged as positive and as negative.
lexicon_path = os.path.join('../Sentiment Analysis', 'pos_neg_list.csv')
word_dict = pd.read_csv(lexicon_path)
pos_list = word_dict.loc[word_dict['Positive'] == 1, 'Word'].tolist()
neg_list = word_dict.loc[word_dict['Negative'] == 1, 'Word'].tolist()

# Negation words, kept in their own list.
negation_list = [
    'not',
    'no',
    'nobody',
    'none',
    'never',
    'neither',
    'cannot',
]

# 2. Economic news dataset, reduced to the three columns used in this notebook.
news_path = os.path.join('../Sentiment Analysis', "economic_sentiment_data.csv")
full_df = pd.read_csv(news_path)
full_df = full_df[['sentence', 'sentiment', 'polarity']]

#### clean and tokenize sentences for word2vec training

In [4]:
def clean_text(raw):
    '''Clean a paragraph and break it into sentences.

    Steps, in order:
      1. replace every ``</br>`` tag with a full stop;
      2. collapse runs of dots into a single dot;
      3. remove numbers (optionally signed decimals, and plain digit runs);
      4. remove any digit that might remain;
      5. remove percent signs and hyphens;
      6. split the result with nltk's ``sent_tokenize``.

    raw: the paragraph text (str).
    Returns whatever ``sent_tokenize`` returns for the cleaned text
    (a list of sentence strings).
    '''
    raw = re.sub(r"</br>", ".", raw)
    raw = re.sub(r"[.]+", ".", raw)
    raw = re.sub(r"[-+]?\d*\.\d+|\d+", "", raw)
    # Raw string: "\d" in a normal string literal is an invalid escape sequence
    # and raises a SyntaxWarning on recent Python versions. The previous pattern
    # already strips every digit run, so this pass is only a safety net.
    raw = re.sub(r"\d", "", raw)
    raw = re.sub(r"[%-]", "", raw)
    return sent_tokenize(raw)

In [5]:
# Clean every text entry of the dataset; each entry becomes a list of sentences.
paragraphs = [clean_text(text) for text in full_df['sentence'].tolist()]

# Flatten the per-entry sentence lists into one list of sentences.
sentences = []
for para in paragraphs:
    sentences.extend(para)

In [6]:
def sentence_to_wordlist(raw):
    '''Split a sentence into its words, where a word is a maximal run of ASCII letters.

    Every other character (digits, punctuation, whitespace, accented letters)
    acts as a separator and is dropped. Returns a list of strings, empty when
    the sentence contains no ASCII letters.
    '''
    return re.findall(r"[a-zA-Z]+", raw)

# Turn each sentence string into its list of words.
sentences = [sentence_to_wordlist(sent) for sent in sentences]

#### generate new negative list

In [8]:
# Keep only the negative seed words that the word2vec model knows.
# Membership is tested on the KeyedVectors object itself: this works on both
# gensim 3.x and 4.x, whereas the `.vocab` attribute was removed in gensim 4.0.
print('Original negative list length: {}'.format(len(neg_list)))
neg_list_org = [word for word in neg_list if word in imf_w2v.wv]
print('Those in the word2vec vocab: {}'.format(len(neg_list_org)))
neg_list_df = pd.DataFrame(neg_list_org, columns=['original_word'])  # joined back in below

# Augmented list: one small frame per seed word holding the
# (neighbour, similarity) pairs returned by most_similar.
neg_frames = [
    pd.DataFrame([[neighbour, score] for neighbour, score in imf_w2v.wv.most_similar(seed)])
    for seed in neg_list_org
]
neg_df = pd.concat(neg_frames, axis=0)

# The index restarts at 0 inside every per-seed frame, so it records the
# position of each neighbour within its seed word's result list.
neg_df['similarity_rank'] = neg_df.index
neg_df.rename(columns={0: 'word', 1: 'similarity'}, inplace=True)

# Tag every row with the position of its seed word, then attach the seed word.
# The repeat counts come from the actual frame lengths rather than a
# hard-coded 10, so this stays correct if most_similar returns fewer rows.
neg_df['original_word_rank'] = np.repeat(
    np.arange(len(neg_frames)), [len(frame) for frame in neg_frames]
)
neg_df = neg_df.merge(neg_list_df, how='outer', left_on='original_word_rank', right_index=True)

Original negative list length: 295
Those in the word2vec vocab: 288


#### generate new positive table

In [None]:
print(len(pos_list))
# Keep only the positive seed words that the word2vec model knows.
# Membership is tested on the KeyedVectors object itself: this works on both
# gensim 3.x and 4.x, whereas the `.vocab` attribute was removed in gensim 4.0.
pos_list_int = [word for word in pos_list if word in imf_w2v.wv]
print(len(pos_list_int))

pos_list_df = pd.DataFrame(pos_list_int, columns=['original_word'])  # joined back in below

# For every seed word: the [neighbour, similarity] pairs returned by most_similar.
pos_list_aug = [
    [[neighbour, score] for neighbour, score in imf_w2v.wv.most_similar(seed)]
    for seed in pos_list_int
]

# One small frame per seed word, stacked into a single table.
pos_frames = [pd.DataFrame(pairs) for pairs in pos_list_aug]
pos_df = pd.concat(pos_frames, axis=0)

# The index restarts at 0 inside every per-seed frame, so it records the
# position of each neighbour within its seed word's result list.
pos_df['similarity_rank'] = pos_df.index
pos_df.rename(columns={0: 'word', 1: 'similarity'}, inplace=True)

# Tag every row with the position of its seed word, then attach the seed word.
# The repeat counts come from the actual frame lengths rather than a
# hard-coded 10, so this stays correct if most_similar returns fewer rows.
pos_df['original_word_rank'] = np.repeat(
    np.arange(len(pos_frames)), [len(frame) for frame in pos_frames]
)
pos_df = pos_df.merge(pos_list_df, how='outer', left_on='original_word_rank', right_index=True)

#### filter and save based on: 1. similarity at or above the 75th percentile; 2. new word not already in the original word lists (pos & neg)

In [None]:
# Combined lexicon: every positive word followed by every negative word.
full_list = list(pos_list)
full_list.extend(neg_list)

In [None]:
# Minimum similarity a candidate must reach to be kept, per list.
# NOTE(review): presumably the 75th percentile of each table's `similarity`
# column (e.g. read off `describe()`); the values are hard-coded and need to be
# re-derived if the model or the seed lists change.
NEG_SIM_THRESHOLD = 0.623453
POS_SIM_THRESHOLD = 0.593445

# Keep candidates that are similar enough and not already in the lexicon.
# `.copy()` turns the filtered slice into an independent frame, so adding the
# label column below does not trigger pandas' SettingWithCopyWarning.
neg_keep = (neg_df.similarity >= NEG_SIM_THRESHOLD) & ~neg_df.word.isin(full_list)
neg_df = neg_df[neg_keep].copy()
neg_df['original_word_label'] = 'negative'
print(neg_df.shape)

#pos_df.describe()
pos_keep = (pos_df.similarity >= POS_SIM_THRESHOLD) & ~pos_df.word.isin(full_list)
pos_df = pos_df[pos_keep].copy()
pos_df['original_word_label'] = 'positive'
print(pos_df.shape)

In [None]:
# Stack the negative and positive candidate tables.
# Note: this rebinds `full_df`, which earlier held the news dataset.
full_df = pd.concat([neg_df, pos_df], axis=0)

# One row per candidate word: best similarity, number of seed words that
# produced it, the first seed word and label seen, and its mean rank.
full_df = full_df.groupby('word', as_index=False).agg({
    'similarity': 'max',
    'original_word_rank': 'count',
    'original_word': 'first',
    'similarity_rank': 'mean',
    'original_word_label': 'first',
})

# `columns=` is required: a bare mapping passed to rename() is applied to the
# index labels, which left the column names unchanged.
full_df = full_df.rename(columns={
    'similarity': 'max_sim',
    'original_word_rank': 'count_in_original_word',
    'similarity_rank': 'mean_rank',
})

#### save results

In [None]:
# Write the aggregated table to sheet 'full_df' of aug_pos_neg_list.xlsx.
# The context manager saves and closes the workbook on exit; the explicit
# `writer.save()` call no longer exists in pandas 2.x. `sheet_name` is passed
# by keyword because positional use is deprecated.
with pd.ExcelWriter(path='aug_pos_neg_list.xlsx') as writer:
    full_df.to_excel(writer, sheet_name='full_df')