## Pre-processamento do dataset para alimentar a rede neural keras para classificação de perguntas (1) e respostas (0)

In [1]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
import math
import random

[nltk_data] Downloading package wordnet to /home/douglas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Opening movie reviews

In [2]:
messages = pd.read_csv('./chatdata/movie_lines_normalized.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [3]:
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg']

In [4]:
messages.head(10)

Unnamed: 0,msg_line,user_id,movie_id,msg
0,L1045,u0,m0,They do not!
1,L1044,u2,m0,They do to!
2,L985,u0,m0,I hope so.
3,L984,u2,m0,She okay?
4,L925,u0,m0,Let's go.
5,L924,u2,m0,Wow
6,L872,u0,m0,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,No
8,L870,u0,m0,"""""""I'm kidding. You know how sometimes you jus..."
9,L869,u0,m0,Like my fear of wearing pastels?


### Cleaning the msg_line of the conversations

In [5]:
#remove charactes
def remove_char(txt):
    return re.sub('[^0-9]','', txt)

In [6]:
#leaving just the number of the index, so L872 changes to 872
messages['msg_line_clean'] = [remove_char(msg) for msg in messages['msg_line']]

In [7]:
#change the column type to number
messages['msg_line_clean'] = pd.to_numeric(messages['msg_line_clean'])

In [8]:
messages = messages.sort_values(by=['msg_line_clean'])

In [9]:
#set the column as the index
messages = messages.set_index('msg_line_clean')

In [10]:
messages.head(10)

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49,L49,u0,m0,Did you change your hair?
50,L50,u3,m0,No.
51,L51,u0,m0,You might wanna think about it
59,L59,u9,m0,I missed you.
60,L60,u8,m0,It says here you exposed yourself to a group o...
61,L61,u9,m0,It was a bratwurst. I was eating lunch.
62,L62,u8,m0,With the teeth of your zipper?
63,L63,u7,m0,You the new guy?
64,L64,u2,m0,So they tell me...
65,L65,u7,m0,C'mon. I'm supposed to give you the tour.


### Removing entities

In [11]:
entities = pd.read_csv('./chatdata/entity_list_complete.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [12]:
entities.columns = ['ent', 'type']

In [13]:
entities['ent'] = entities['ent'].str.lower()

In [14]:
entities.head()

Unnamed: 0,ent,type
0,kinda,ORG
1,the dallas times herald,ORG
2,queen louisa,PERSON
3,a.m,GPE
4,cousin hop,PERSON


In [15]:
entities['ent_len'] = [len(e) for e in entities['ent']]

In [16]:
s = entities['ent_len'].sort_values(ascending=False).index

In [17]:
entities = entities.reindex(s)

In [18]:
entities = entities.reset_index(drop=True)

In [19]:
entities.head()

Unnamed: 0,ent,type,ent_len
0,"""""""how can the bolshevik cause gain respect am...",WORK_OF_ART,237
1,"""""""the premier wishes to inform the government...",WORK_OF_ART,192
2,""""""" come tuesday twelve a.m. bingo these like-...",WORK_OF_ART,182
3,"""""""the suggestion of the president regarding t...",WORK_OF_ART,155
4,"""""""the management of boyd's takes pleasure in ...",WORK_OF_ART,146


In [20]:
data = messages['msg']

In [21]:
ent_list =  ['PERSON', 'ORG', 'NORP', 'FAC', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE']
#ent_list =  ['LANGUAGE']

ent = list()
for i in range(len(entities.index)):
    if entities['type'][i] in ent_list:
        ent.append(entities['ent'][i])

In [22]:
ent = list(set(ent))
print(len(ent))
print(ent)

27848


In [23]:
dict = {}
for n in ent:
    dict[n] = len(n)

In [24]:
#sort dict by biggest values
dict = {k: v for k, v in sorted(dict.items(), key=lambda item: item[1], reverse=True)}

In [25]:
dict

{'"""how can the bolshevik cause gain respect among the moslems if your three representatives buljanoff iranoff and kopalski get so drunk that they throw a carpet out of their hotel window and complain to the management that it didn\'t fly"': 237,
 '"""the premier wishes to inform the government of the united states that it will be impossible for him to attend the meeting suggested by the president unless the meeting is held in moscow."""': 192,
 '""" come tuesday twelve a.m. bingo these like-minded deviates log on and start yakking it up: explicit sex crime gossip who did what to whom who wants to do what when why and how."""': 182,
 '"""the suggestion of the president regarding the possibility of a meeting in moscow would be unacceptable to her majesty\'s government at the present time."': 155,
 '"""the management of boyd\'s takes pleasure in requesting the company of mr. richard starkey that\'s you in their recently refinished gaming rooms."': 146,
 '"""well jim i says, it makes me 

In [26]:
def remove_entity(corpus):
    corpus = corpus.split(' ')
    corpus = [c for c in corpus if c not in list(dict.keys())]
    return ' '.join(corpus)    

In [None]:
%%time
messages['msg_pre_processed'] = [remove_entity(m) for m in list(data)]

### Opening conversation sequence

In [None]:
conv_seq.columns = ['user1_id', 'user2_id', 'movie_id', 'sequence']

In [None]:
conv_seq.head(10)

### Build conversation sequence

In [None]:
def split_conversation(txt):
    txt_alt = txt.split(' ')
    return txt_alt

In [None]:
def seq_to_list(seq):
    seq_list = [remove_char(s) for s in seq]
    return seq_list

In [None]:
#initializing the msg_2 column
messages['msg_2'] = '-'

In [None]:
def link_conversations(seq_list, df, filter1, filter2):
    i = 0
    while i in range(len(seq_list)):
        if i+1 < len(seq_list):
            next_msg = df.loc[int(seq_list[i+1]), filter1]
            df.at[int(seq_list[i]), filter2] = next_msg
        i+=1

In [None]:
#link each message with its answer
for conv in conv_seq['sequence']:
    #split each sequence by space
    seq = split_conversation(conv)

    #remove the char L from the sequences
    txt_alt = [remove_char(s) for s in seq]

    #use the conversation sequence to build the target answer for each message
    link_conversations(txt_alt, messages, 'msg', 'msg_2')

In [None]:
messages.head(30)

## Pre processing the msg

In [None]:
data = messages['msg']

In [None]:
lemmatizer = WordNetLemmatizer()
def pre_processing_text(corpus):
    #remove html tags
    corpus = re.sub(r'<.*?>', '', str(corpus))
    
    #remove non-alphanumeric characters
    corpus = re.sub(r'[^a-z A-Z 0-9 \s]', '', str(corpus))
    
    #remove duplicated spaces
    corpus = re.sub(r' +', ' ', str(corpus))
    
    #capitalization
    corpus = corpus.lower()
    
    #tokenization
    corpus = re.findall(r"\w+(?:'\w+)?|[^\w\s]", corpus)
    
    #lammatization
    corpus = [lemmatizer.lemmatize(c) for c in corpus]
    
    #remove punctuation
    corpus = [t for t in corpus if t not in string.punctuation]
    
    #remove stopwords
    #it makes the model worst
    #stopwords_ = stopwords.words("english")
    #corpus = [t for t in corpus if t not in stopwords_]
    
    corpus = ' '.join(corpus)

    return corpus

In [None]:
%%time
data_pre_processed = [pre_processing_text(str(m)) for m in data]
data_pre_processed

In [None]:
messages['msg_pre_processed'] = data_pre_processed

### Checking and removing duplicated messages in msg (not used)

In [None]:
data = messages['msg_pre_processed']

### Removing nan msg origined by '' messages

In [None]:
#filling the nan messages with a string- not necessary
#messages = messages.fillna('UNKNOWN')
messages = messages.dropna()

### Removing apostrophes (need for embedding) (not used)

### Filling '-' messages with a generic one

In [None]:
#return generic answer
def generic_answer(txt):
  asw_list = ['talk more about it',
              'can you explain it better?',
              'I need to think more about it',
              'maybe...'
              ]
  if txt == '-':
    return random.choice(asw_list)
  return txt

In [None]:
#seting a generic answer to the messages without answer
messages['msg_2'] = [generic_answer(msg) for msg in messages['msg_2']]

### Tagging the msg with classes

In [None]:
def define_target(corpus):
    
    if '?' in corpus:
        return 1
    else:
        return 0

In [None]:
data = messages['msg']

In [None]:
messages['target'] = [define_target(m) for m in data]

In [None]:
messages['target'] = messages['target'].astype(int)

### Save data

In [None]:
messages

In [None]:
messages.to_csv('./chatdata/movie_lines_pre_processed_keras.tsv', index=False, sep='\t', header=False)