## Pre-processing the dataset to feed the Keras neural network that classifies questions (1) and answers (0)

In [None]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
import math
import random

### Opening movie lines

In [None]:
# Load the movie lines as a headerless, tab-separated table. quoting=3 is
# csv.QUOTE_NONE, so quote characters in the text are kept as ordinary characters.
messages = pd.read_csv('./chatdata/movie_lines_normalized.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [None]:
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg']

In [None]:
messages.head(10)

### Cleaning the msg_line of the conversations

In [None]:
def remove_char(txt):
    """Return ``txt`` with every character other than the digits 0-9 removed."""
    non_digit = re.compile('[^0-9]')
    return non_digit.sub('', txt)

In [None]:
#leaving just the number of the index, so L872 changes to 872
messages['msg_line_clean'] = [remove_char(msg) for msg in messages['msg_line']]

In [None]:
#change the column type to number
messages['msg_line_clean'] = pd.to_numeric(messages['msg_line_clean'])

In [None]:
messages = messages.sort_values(by=['msg_line_clean'])

In [None]:
#set the column as the index
messages = messages.set_index('msg_line_clean')

In [None]:
messages.head(10)

### Removing entities

In [None]:
entities = pd.read_csv('./chatdata/entity_list_complete.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [None]:
entities.columns = ['ent', 'type']

In [None]:
entities['ent'] = entities['ent'].str.lower()

In [None]:
entities.head()

In [None]:
entities['ent_len'] = [len(e) for e in entities['ent']]

In [None]:
s = entities['ent_len'].sort_values(ascending=False).index

In [None]:
entities = entities.reindex(s)

In [None]:
entities = entities.reset_index(drop=True)

In [None]:
entities.head()

In [None]:
data = messages['msg']

In [None]:
# entity types whose 'ent' strings are collected
ent_list = ['PERSON', 'ORG', 'NORP', 'FAC', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE']

# Keep the 'ent' value of every row whose 'type' is in ent_list, in row order.
# A boolean mask replaces the index-counting loop, so the selection no longer
# depends on the frame having a 0..n-1 index.
ent = entities.loc[entities['type'].isin(ent_list), 'ent'].tolist()

In [None]:
ent = list(set(ent))
print(len(ent))
print(ent)

In [None]:
# Map each entity string to its character length.
# NOTE(review): the name `dict` shadows the builtin; it is kept here because
# other cells of this notebook (including remove_entity) read this global by
# that name. Renaming it should be done across all of them at once.
dict = {n: len(n) for n in ent}

In [None]:
# rebuild the mapping with its items ordered by value, largest first (reverse=True)
dict = {k: v for k, v in sorted(dict.items(), key=lambda item: item[1], reverse=True)}

In [None]:
dict

In [None]:
def remove_entity(corpus):
    """Drop every token of ``corpus`` that is a key of the global entity mapping.

    The text is split on single spaces, tokens that are keys of the
    module-level ``dict`` mapping are discarded, and the remaining tokens are
    joined back together with single spaces.

    NOTE(review): tokens are compared as-is, so a key containing a space or
    differing in letter case from the token never matches -- confirm that this
    is the intended behavior.
    """
    # Test membership directly against the mapping (a hash lookup) instead of
    # building a fresh list of all keys for every single token.
    kept = [token for token in corpus.split(' ') if token not in dict]
    return ' '.join(kept)

In [None]:
%%time
messages['msg_pre_processed'] = [remove_entity(m) for m in list(data)]

### Opening conversation sequence

In [None]:
# NOTE(review): no visible cell assigns conv_seq before this line -- the cell
# that loads the conversation-sequence table appears to be missing and must be
# restored for this notebook to run top to bottom.
conv_seq.columns = ['user1_id', 'user2_id', 'movie_id', 'sequence']

In [None]:
conv_seq.head(10)

### Build conversation sequence

In [None]:
def split_conversation(txt):
    """Break ``txt`` into a list of pieces, cutting at every single space."""
    return txt.split(' ')

In [None]:
def seq_to_list(seq):
    """Apply ``remove_char`` to each element of ``seq`` and return the results as a list."""
    return list(map(remove_char, seq))

In [None]:
#initializing the msg_2 column
messages['msg_2'] = '-'

In [None]:
def link_conversations(seq_list, df, filter1, filter2):
    """Store, for each message of a conversation, the message that follows it.

    Args:
        seq_list: Ordered message ids of one conversation. Each item must be
            convertible with ``int`` and is used as a row label of ``df``.
        df: DataFrame that is modified in place.
        filter1: Name of the column the following message is read from.
        filter2: Name of the column the following message is written to.

    The last id has no successor, so its row is left untouched. Empty and
    single-element sequences change nothing.
    """
    # Walk consecutive (current, following) pairs instead of keeping a manual
    # counter and bounds check.
    for current_id, following_id in zip(seq_list, seq_list[1:]):
        df.at[int(current_id), filter2] = df.loc[int(following_id), filter1]

In [None]:
#link each message with its answer
for conv in conv_seq['sequence']:
    #split each sequence by space
    seq = split_conversation(conv)

    #strip the non-digit characters (such as the leading L) from every piece,
    #reusing the seq_to_list helper instead of repeating its body here
    txt_alt = seq_to_list(seq)

    #use the conversation sequence to build the target answer for each message
    link_conversations(txt_alt, messages, 'msg', 'msg_2')

In [None]:
messages.head(30)

## Pre-processing the msg

In [None]:
# NOTE(review): this reads the raw 'msg' column, and the processed result is
# later written to 'msg_pre_processed', replacing the entity-stripped text that
# the remove_entity cell stored there -- confirm the entity removal is meant
# to be discarded.
data = messages['msg']

In [None]:
lemmatizer = WordNetLemmatizer()


def pre_processing_text(corpus):
    """Normalize a message into a lowercase, space-joined string of lemmatized tokens.

    Steps, in order: strip ``<...>`` spans, delete every character that is not
    a letter a-z/A-Z, a digit 0-9 or whitespace, collapse runs of spaces,
    lowercase, tokenize, lemmatize each token with the module-level
    ``lemmatizer``, and discard tokens found in ``string.punctuation``.
    """
    text = str(corpus)

    # drop html-like tags
    text = re.sub(r'<.*?>', '', text)

    # keep only letters, digits and whitespace
    text = re.sub(r'[^a-z A-Z 0-9 \s]', '', text)

    # squeeze consecutive spaces into a single one
    text = re.sub(r' +', ' ', text)

    # lowercase, then tokenize
    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", text.lower())

    # lemmatization
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]

    # remove punctuation tokens
    kept = [lemma for lemma in lemmas if lemma not in string.punctuation]

    # stop-word removal is deliberately not applied: the original author noted
    # that it made the model worse
    return ' '.join(kept)

In [None]:
%%time
data_pre_processed = [pre_processing_text(str(m)) for m in data]
data_pre_processed

In [None]:
messages['msg_pre_processed'] = data_pre_processed

### Checking and removing duplicated messages in msg (not used)

In [None]:
data = messages['msg_pre_processed']

### Removing NaN msg values originating from empty ('') messages

In [None]:
# filling the NaN messages with a placeholder string - not necessary
#messages = messages.fillna('UNKNOWN')
# drop every row that has a missing value in any column (dropna defaults)
messages = messages.dropna()

### Removing apostrophes (need for embedding) (not used)

### Filling '-' messages with a generic one

In [None]:
def generic_answer(txt):
    """Return ``txt`` unchanged unless it equals the placeholder ``'-'``.

    For the placeholder, one of four canned replies is picked with
    ``random.choice``.
    """
    fallback_replies = [
        'talk more about it',
        'can you explain it better?',
        'I need to think more about it',
        'maybe...',
    ]
    return random.choice(fallback_replies) if txt == '-' else txt

In [None]:
# set a generic answer on the messages that have no answer (msg_2 is still '-')
messages['msg_2'] = [generic_answer(msg) for msg in messages['msg_2']]

### Tagging the msg with classes

In [None]:
def define_target(corpus):
    """Label a message: 1 when it contains a question mark, otherwise 0."""
    return 1 if '?' in corpus else 0

In [None]:
data = messages['msg']

In [None]:
messages['target'] = [define_target(m) for m in data]

In [None]:
messages['target'] = messages['target'].astype(int)

### Save data

In [None]:
messages

In [None]:
# write the table as tab-separated text, without the index and without a header row
messages.to_csv('./chatdata/movie_lines_pre_processed_keras.tsv', index=False, sep='\t', header=False)