In [1]:
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import re

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics import euclidean_distances
from pyemd import emd

import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
import string

In [2]:
def check_value(w):
    """Return the embedding row index for token ``w``, or 0 when it is unknown.

    The lookup key is ``str(w.encode('utf-8'))``, i.e. the textual repr of the
    UTF-8 bytes (``"b'...'"`` on Python 3), searched in the module-level
    ``vocab_dict``.

    NOTE(review): 0 is also a valid index in ``vocab_dict`` (it is built with
    ``enumerate``), so an unknown token is indistinguishable from the word
    that owns row 0 -- confirm this fallback is intended.
    """
    return vocab_dict.get(str(w.encode('utf-8')), 0)

def lexicon_rate(lexicon, comment):
    """Earth Mover's Distance between the bags of words of two texts.

    Both texts are tokenised by a ``CountVectorizer`` fitted on just these two
    documents (using the module-level ``pattern``). Every vocabulary token is
    mapped to a row of the module-level matrix ``W`` through ``check_value``;
    the pairwise Euclidean distances between those rows, scaled by their
    maximum, form the ground-distance matrix handed to ``pyemd.emd``.

    Parameters
    ----------
    lexicon, comment :
        Documents accepted by ``CountVectorizer``. In this notebook the
        lexicon is a ``str`` and each comment is a UTF-8 ``bytes`` sentence.

    Returns
    -------
    float
        The EMD between the two normalised term-frequency histograms, or
        ``nan`` when either document yields no tokens (its histogram cannot
        be normalised).
    """
    docs = [lexicon, comment]
    vect = CountVectorizer(token_pattern=pattern, strip_accents=None).fit(docs)
    v_1, v_2 = vect.transform(docs)
    v_1 = v_1.toarray().ravel().astype(np.double)
    v_2 = v_2.toarray().ravel().astype(np.double)

    # A document without any token has an all-zero histogram; dividing by its
    # sum would inject NaNs into emd(), so report "no distance" explicitly.
    total_1, total_2 = v_1.sum(), v_2.sum()
    if total_1 == 0 or total_2 == 0:
        return float("nan")
    v_1 /= total_1
    v_2 /= total_2

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version offers.
    get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
    W_ = W[[check_value(w) for w in get_names()]]

    D_ = euclidean_distances(W_).astype(np.double)
    # Every token may resolve to the same embedding row, which makes all
    # distances zero; skip the scaling then instead of computing 0 / 0.
    d_max = D_.max()
    if d_max > 0:
        D_ /= d_max
    return emd(v_1, v_2, D_)

In [3]:
%%time
# Load the pre-trained embeddings in word2vec format; undecodable bytes in the
# file are ignored rather than raising.
wv = KeyedVectors.load_word2vec_format(
    'embedding/w2v-v1.wordvectors',
    unicode_errors="ignore",
)
# The following cell reads wv.vectors_norm -- presumably populated by this
# call (gensim 3.x API; TODO confirm against the installed gensim version).
wv.init_sims()

CPU times: user 2min 12s, sys: 4.08 s, total: 2min 16s
Wall time: 2min 16s


In [4]:
# Token pattern handed to CountVectorizer in lexicon_rate: runs of word
# characters and hyphens delimited by word boundaries.
pattern = "(?u)\\b[\\w-]+\\b"

# Take the matrix shape from the loaded vectors so the file is written and
# re-opened with the same shape, instead of hard-coding the dimension (300).
embed_shape = wv.vectors_norm.shape

# Dump the normalised vectors to disk as float64.
fp = np.memmap("data/embed.dat", dtype=np.double, mode='w+', shape=embed_shape)
fp[:] = wv.vectors_norm[:]
fp.flush()

# Write one vocabulary entry per line, ordered by embedding row index.
# On Python 3 printing bytes writes their repr ("b'...'"); check_value builds
# its lookup key the same way, so this format must not change on its own.
with open("data/embed.vocab", "w") as f:
    for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
        print(w.encode('utf-8'), file=f)

vocab_len = len(wv.vocab)
del fp

# Re-open the matrix read-only; rows are paged in from disk on access.
W = np.memmap("data/embed.dat", dtype=np.double, mode="r", shape=embed_shape)

# Map each vocabulary line to its line number, i.e. its row in W.
with open("data/embed.vocab") as f:
    vocab_list = [line.strip() for line in f]
vocab_dict = {w: k for k, w in enumerate(vocab_list)}

In [5]:
# For each source, walk the list of files in the given folder.
# Every entry has the form "<root>/<source>"; get_indicators_by_sources keys
# its result by the path component after the first '/'.
folders = [
    'Indicadores/brasil_escola',
    # 'Indicadores/clue_words_pt',
    'Indicadores/acrobata_das_letras',
    'Indicadores/mundo_educacao',
    'Indicadores/Subjetivity_lexicon',
    'Indicadores/PersuativeEssays_UKP',
    'Indicadores/2020_Jonathan'
]


def get_indicators_by_sources(folders):
    """Collect the indicator text of every source folder.

    Parameters
    ----------
    folders : iterable of str
        Paths of the form ``"<root>/<source>"``.

    Returns
    -------
    dict
        Maps the second '/'-separated component of each folder path to one
        string: all lines of all regular files in that folder, right-stripped
        and joined by single spaces.
    """
    indicators = dict()
    for folder in folders:
        lines = []
        # os.listdir() order is arbitrary; sort so the result is reproducible.
        for file_name in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, file_name)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # would otherwise make open() fail.
            if not os.path.isfile(file_path):
                continue
            with open(file_path) as f:
                lines.extend(line.rstrip() for line in f)
        indicators[folder.split('/')[1]] = ' '.join(lines)
    return indicators
    
def get_all_indicators():
    """Return the pre-merged indicator file as a single space-joined string.

    Each line of 'Indicadores/preprocessed_indicators_woclue.txt' has its
    trailing whitespace removed before the lines are joined.
    """
    stripped_lines = []
    with open('Indicadores/preprocessed_indicators_woclue.txt') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.rstrip())
    return ' '.join(stripped_lines)

# Choose ONE indicator source: the per-folder dict (commented out) or the
# single pre-merged file (a plain string). The next cell accepts either type.
# argumentacoes = get_indicators_by_sources(folders)
argumentacoes = get_all_indicators()

In [6]:
# Resolve the single indicator text used by the rest of the notebook.
if isinstance(argumentacoes, dict):
    # Per-source mode: use the last configured folder. The index used to be
    # hard-coded to 6, which is out of range for the 6-entry `folders` list
    # and raised IndexError whenever this branch ran.
    n_folder = len(folders) - 1
    argumentacao = argumentacoes[folders[n_folder].split('/')[1]]
else:
    # Merged mode: the value already is the indicator text.
    argumentacao = argumentacoes

In [7]:
# Corpus selection: exactly one of the variants below should be active.
# Each one produces `content`, a list holding one dict per CSV row.

# # correct essays:
# df = pd.read_csv("redações/jonathan_ann/redacoes_extraidas_com_nao_argumento.csv")
# content = list(df.T.to_dict().values())

## essays:
# df = pd.read_csv("redações/redacoes_extraidas.csv")
# df_not_arg = pd.read_csv("redações/Unlabeled/unlabeled.csv")
# all_df = pd.concat([df, df_not_arg], ignore_index=True)
# content = list(all_df.T.to_dict().values())

# nlx-group:
df = pd.read_csv("nlx-data-arguments/data-nlx-group.csv")
# Transposing turns every row into a column, so to_dict() yields one
# {column_name: value} dict per original row; keep just those dicts.
content = list(df.T.to_dict().values())

In [8]:
# Characters to strip from text: ASCII punctuation except '-' and '_' (kept so
# hyphenated / underscored tokens survive), plus a few typographic marks.
punctuation = string.punctuation \
        .replace('-', '') \
        .replace('_', '')
punctuation += "—«»"
# Escape the characters before wrapping them in a character class. Unescaped,
# the '\' from string.punctuation acted as an escape for the ']' that follows
# it, so the class happened to compile but never matched a backslash.
punctuation = r"[{}]".format(re.escape(punctuation))
# Collapses runs of spaces into a single space.
re_trim = re.compile(r' +', re.UNICODE)

def loadStopWordsPT(filename):
    """Read a stop-word file and return its lines as a list of strings.

    Every line is stripped of leading and trailing whitespace; blank lines
    are kept as empty strings. The file is closed before returning.
    """
    with open(filename) as handle:
        return [line.strip() for line in handle]

        
def clean_stopwords(text, stop_words_list):
    """Normalise ``text`` with ``clean`` and drop every stop word.

    Parameters
    ----------
    text : str
        Raw text; it is passed through ``clean`` and split on whitespace.
    stop_words_list : iterable of str
        Words to remove (compared against the cleaned tokens).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Build the set once: membership tests on a list are linear in its
    # length and this check runs for every token.
    stop_words = set(stop_words_list)
    kept = [word for word in clean(text).split() if word not in stop_words]
    return " ".join(kept)

def clean(text):
    """Lower-case ``text``, blank out punctuation and squeeze repeated spaces.

    Characters matched by the module-level ``punctuation`` pattern become a
    space, then ``re_trim`` collapses each run of spaces into one. Leading
    and trailing spaces are not removed.
    """
    lowered = text.lower()
    without_punctuation = re.sub(punctuation, " ", lowered)
    return re_trim.sub(' ', without_punctuation)

In [9]:
# Load the stop words and remove them (after clean()) from the indicator text.
raw_stop_words = loadStopWordsPT('data/stop_words_preprocessed.txt')
argumentacao = clean_stopwords(argumentacao, raw_stop_words)

# Replace each record's raw text with a list of UTF-8 encoded sentences.
# NOTE: this cell cannot be re-run on the same `content`: afterwards 'text'
# is a list, and clean() calls .lower() on whatever it receives.
for record in content:
    cleaned = clean_stopwords(record['text'], raw_stop_words)
    record['text'] = cleaned
    sent_text = nltk.sent_tokenize(cleaned)
    # Every cleaned text is expected to form at most one sentence.
    assert len(sent_text) <= 1
    record['text'] = [sentence.encode("utf-8") for sentence in sent_text]

In [10]:
# Peek at one pre-processed record ('text' is now a list of UTF-8 bytes).
content[1]

{'id': 710,
 'label': 'argumento',
 'text': [b'nesse caso nunca podemos fazer mal atacar matar diretamente crian\xc3\xa7a atrav\xc3\xa9s aborto bem salvando vida m\xc3\xa3e possa resultar']}

In [11]:
def restore_text(list_sentences):
    """Decode UTF-8 byte sentences and concatenate them into one string.

    No separator is inserted between consecutive sentences; an empty input
    gives an empty string.
    """
    return ''.join(chunk.decode("utf-8") for chunk in list_sentences)

In [12]:
def remove_dirty_sentences(list_sentences):
    """Keep only the sentences with more than two whitespace-separated words.

    The relative order of the surviving sentences is preserved and a new
    list is returned.
    """
    return [candidate for candidate in list_sentences if len(candidate.split()) > 2]

In [13]:
# Drop sentences that are too short, then drop records left without any.
for record in content:
    record['text'] = remove_dirty_sentences(record['text'])
content = [record for record in content if record['text']]

In [14]:
%%time

# One output row per record: [id, decoded text, label, mean lexicon_rate of
# the record's sentences against the indicator text].
lexicons_rates = []
for entry in content:
    sentence_list = entry['text']
    total = sum(lexicon_rate(argumentacao, sentence) for sentence in sentence_list)
    arg_avg = total / float(len(sentence_list))
    lexicons_rates.append(
        [entry['id'], restore_text(sentence_list), entry['label'], arg_avg]
    )

CPU times: user 1h 7min 17s, sys: 51min 15s, total: 1h 58min 32s
Wall time: 20min 39s


In [15]:
# Tabulate the per-record results. NOTE: this rebinds `df`, which earlier
# held the raw input CSV.
df = pd.DataFrame(lexicons_rates, columns=['id','text','label','wmd_rate'])

In [16]:
# Output paths used for the other corpora (kept for reference):
# df.to_csv(f"body/redacoes_corretas/wmd_rc_{folders[n_folder].split('/')[1]}.csv", index=False)
# df.to_csv(f"body/redacoes_corretas/wmd_rc_all_indicators_wocluewords.csv", index=False)
# Write the results table without the index column.
df.to_csv(
    "body/nlx/wmd_nlx_all_indicators_woclue.csv",
    index=False,
)

In [17]:
# Runs the external `spd-say` command through the shell -- presumably a spoken
# "finished" notification; the cell's value is the command's exit status.
os.system('spd-say "cabou"')

0

In [79]:
# Scratch check of the built-in wv.wmdistance(): two identical single-token
# documents (the recorded output is 0.0).
# wv.wmdistance()
wv.wmdistance(['a'], ['a'])

0.0