## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre sobre este tema

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
> -https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [37]:
import string
import nltk
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
import math
import random

[nltk_data] Downloading package wordnet to /home/douglas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Opening movie reviews

In [38]:
messages = pd.read_csv('./chatdata/movie_lines_normalized.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [39]:
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg']

In [40]:
messages.head(10)

Unnamed: 0,msg_line,user_id,movie_id,msg
0,L1045,u0,m0,They do not!
1,L1044,u2,m0,They do to!
2,L985,u0,m0,I hope so.
3,L984,u2,m0,She okay?
4,L925,u0,m0,Let's go.
5,L924,u2,m0,Wow
6,L872,u0,m0,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,No
8,L870,u0,m0,"""""""I'm kidding. You know how sometimes you jus..."
9,L869,u0,m0,Like my fear of wearing pastels?


In [41]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   msg_line  304713 non-null  object
 1   user_id   304713 non-null  object
 2   movie_id  304713 non-null  object
 3   msg       304713 non-null  object
dtypes: object(4)
memory usage: 9.3+ MB


In [42]:
messages.describe()

Unnamed: 0,msg_line,user_id,movie_id,msg
count,304713,304713,304713,304713
unique,304713,9035,659,265277
top,L189750,u4525,m289,What?
freq,1,537,1530,1679


### Cleaning the msg_line of the conversations

In [43]:
#remove charactes
def remove_char(txt):
    return re.sub('[^0-9]','', txt)

In [44]:
#leaving just the number of the index, so L872 changes to 872
messages['msg_line_clean'] = [remove_char(msg) for msg in messages['msg_line']]

In [45]:
#change the column type to number
messages['msg_line_clean'] = pd.to_numeric(messages['msg_line_clean'])

In [46]:
messages = messages.sort_values(by=['msg_line_clean'])

In [47]:
#set the column as the index
messages = messages.set_index('msg_line_clean')

In [48]:
messages.head(10)

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49,L49,u0,m0,Did you change your hair?
50,L50,u3,m0,No.
51,L51,u0,m0,You might wanna think about it
59,L59,u9,m0,I missed you.
60,L60,u8,m0,It says here you exposed yourself to a group o...
61,L61,u9,m0,It was a bratwurst. I was eating lunch.
62,L62,u8,m0,With the teeth of your zipper?
63,L63,u7,m0,You the new guy?
64,L64,u2,m0,So they tell me...
65,L65,u7,m0,C'mon. I'm supposed to give you the tour.


### Opening conversation sequence

In [49]:
#read the file with the conversation sequence
conv_seq = pd.read_csv('./chatdata/movie_conversations.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [50]:
conv_seq.columns = ['user1_id', 'user2_id', 'movie_id', 'sequence']

In [51]:
conv_seq.head(10)

Unnamed: 0,user1_id,user2_id,movie_id,sequence
0,u0,u2,m0,['L194' 'L195' 'L196' 'L197']
1,u0,u2,m0,['L198' 'L199']
2,u0,u2,m0,['L200' 'L201' 'L202' 'L203']
3,u0,u2,m0,['L204' 'L205' 'L206']
4,u0,u2,m0,['L207' 'L208']
5,u0,u2,m0,['L271' 'L272' 'L273' 'L274' 'L275']
6,u0,u2,m0,['L276' 'L277']
7,u0,u2,m0,['L280' 'L281']
8,u0,u2,m0,['L363' 'L364']
9,u0,u2,m0,['L365' 'L366']


In [52]:
conv_seq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83097 entries, 0 to 83096
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user1_id  83097 non-null  object
 1   user2_id  83097 non-null  object
 2   movie_id  83097 non-null  object
 3   sequence  83097 non-null  object
dtypes: object(4)
memory usage: 2.5+ MB


In [53]:
conv_seq.describe()

Unnamed: 0,user1_id,user2_id,movie_id,sequence
count,83097,83097,83097,83097
unique,5420,5608,617,83097
top,u4331,u1475,m289,['L488577' 'L488578' 'L488579']
freq,193,187,338,1


### Build conversation sequence

In [54]:
def split_conversation(txt):
    txt_alt = txt.split(' ')
    return txt_alt

In [55]:
def seq_to_list(seq):
    seq_list = [remove_char(s) for s in seq]
    return seq_list

In [56]:
#initializing the msg_2 column
messages['msg_2'] = '-'

In [57]:
def link_conversations(seq_list, df, filter1, filter2):
    i = 0
    while i in range(len(seq_list)):
        if i+1 < len(seq_list):
            next_msg = df.loc[int(seq_list[i+1]), filter1]
            df.at[int(seq_list[i]), filter2] = next_msg
        i+=1

In [58]:
#link each message with its answer
for conv in conv_seq['sequence']:
    #split each sequence by space
    seq = split_conversation(conv)

    #remove the char L from the sequences
    txt_alt = [remove_char(s) for s in seq]

    #use the conversation sequence to build the target answer for each message
    link_conversations(txt_alt, messages, 'msg', 'msg_2')

In [59]:
messages.head(30)

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg,msg_2
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
49,L49,u0,m0,Did you change your hair?,No.
50,L50,u3,m0,No.,You might wanna think about it
51,L51,u0,m0,You might wanna think about it,-
59,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...
60,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.
61,L61,u9,m0,It was a bratwurst. I was eating lunch.,With the teeth of your zipper?
62,L62,u8,m0,With the teeth of your zipper?,-
63,L63,u7,m0,You the new guy?,So they tell me...
64,L64,u2,m0,So they tell me...,C'mon. I'm supposed to give you the tour.
65,L65,u7,m0,C'mon. I'm supposed to give you the tour.,-


## Pre processing the msg

In [60]:
data = messages['msg']

In [61]:
lemmatizer = WordNetLemmatizer()
def pre_processing_text(corpus):   
    #remove duplicated spaces
    corpus = re.sub(r' +', ' ', corpus)
    
    #capitalization
    corpus = corpus.lower()
    
    #tokenization
    corpus = re.findall(r"\w+(?:'\w+)?|[^\w\s]", corpus)
    
    #lammatization
    corpus = [lemmatizer.lemmatize(c) for c in corpus]
    
    #remove punctuation
    corpus = [t for t in corpus if t not in string.punctuation]
    
    #remove stopwords
    #it makes the model worst
    #stopwords_ = stopwords.words("english")
    #corpus = [t for t in corpus if t not in stopwords_]
    
    corpus = ' '.join(corpus)

    return corpus

In [62]:
%%time
data_pre_processed = [pre_processing_text(m) for m in data]
data_pre_processed

CPU times: user 20.7 s, sys: 906 ms, total: 21.6 s
Wall time: 21.8 s


['did you change your hair',
 'no',
 'you might wanna think about it',
 'i missed you',
 'it say here you exposed yourself to a group of freshman girl',
 'it wa a bratwurst i wa eating lunch',
 'with the teeth of your zipper',
 'you the new guy',
 'so they tell me',
 "c'mon i'm supposed to give you the tour",
 'so which dakota you from',
 "north actually how'd you",
 'i wa kidding people actually live there',
 "yeah a couple we're outnumbered by the cow though",
 'how many people were in your old school',
 'thirty two',
 'get out',
 'how many people go here',
 'couple thousand most of them evil',
 "that i'm used to",
 'yeah but these guy have never seen a horse they just jack off to clint eastwood',
 'that girl i',
 'you burn you pine you perish',
 'who is she',
 "bianca stratford sophomore don't even think about it",
 'why not',
 "i could start with your haircut but it doesn't matter she's not allowed to date until her older sister doe and that's an impossibility",
 "katarina stratfor

In [63]:
messages['msg_pre_processed'] = data_pre_processed

### Checking for duplicated messages in msg

In [64]:
data = messages['msg_pre_processed']

In [65]:
dict = {}
for n in data:
    if n in dict:
        dict[n] = dict[n] + 1
    else:
        dict[n] = 1

In [66]:
#sort dict by biggest values
dict = {k: v for k, v in sorted(dict.items(), key=lambda item: item[1], reverse=True)}

In [67]:
dict

{'what': 1985,
 'yes': 1762,
 'no': 1692,
 'yeah': 1346,
 'why': 588,
 "i don't know": 452,
 'okay': 421,
 'oh': 347,
 'thank you': 322,
 'unknown': 310,
 'sure': 307,
 'hello': 291,
 'who': 286,
 'yes sir': 277,
 'why not': 265,
 'i know': 249,
 'uh huh': 247,
 'really': 243,
 'what is it': 239,
 'what do you mean': 237,
 'huh': 237,
 'hi': 234,
 'right': 231,
 'thanks': 215,
 "what's that": 198,
 'well': 193,
 "i'm sorry": 189,
 'what are you doing': 185,
 'nothing': 184,
 "that's right": 180,
 'where': 167,
 'what happened': 167,
 'what are you talking about': 163,
 'of course': 161,
 'hey': 151,
 'how': 139,
 'excuse me': 135,
 'good': 135,
 'so': 131,
 'sorry': 129,
 'shut up': 118,
 'oh yeah': 113,
 'fine': 112,
 'what do you want': 111,
 'shit': 105,
 "what's wrong": 99,
 'jesus': 99,
 'where are you going': 99,
 'come on': 97,
 'please': 97,
 'all right': 92,
 'when': 91,
 'what do you think': 89,
 'who are you': 89,
 'no sir': 87,
 'what are you doing here': 84,
 'i see': 84,


In [68]:
#example of duplcated msg
messages[messages['msg_pre_processed'] == 'did you change your hair']

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49,L49,u0,m0,Did you change your hair?,No.,did you change your hair
151439,L151439,u4736,m314,Did you change your hair?,A little.,did you change your hair


In [69]:
#get the repeated messages
d_list = list()
for k in dict:
    if dict[k] > 1:
        d_list.append(k)

In [70]:
d_list

['what',
 'yes',
 'no',
 'yeah',
 'why',
 "i don't know",
 'okay',
 'oh',
 'thank you',
 'unknown',
 'sure',
 'hello',
 'who',
 'yes sir',
 'why not',
 'i know',
 'uh huh',
 'really',
 'what is it',
 'what do you mean',
 'huh',
 'hi',
 'right',
 'thanks',
 "what's that",
 'well',
 "i'm sorry",
 'what are you doing',
 'nothing',
 "that's right",
 'where',
 'what happened',
 'what are you talking about',
 'of course',
 'hey',
 'how',
 'excuse me',
 'good',
 'so',
 'sorry',
 'shut up',
 'oh yeah',
 'fine',
 'what do you want',
 'shit',
 "what's wrong",
 'jesus',
 'where are you going',
 'come on',
 'please',
 'all right',
 'when',
 'what do you think',
 'who are you',
 'no sir',
 'what are you doing here',
 'i see',
 'sir',
 'what for',
 'and',
 'maybe',
 "i don't think so",
 'you okay',
 "i can't",
 'me',
 "what's this",
 'like what',
 "what's the matter",
 "i don't understand",
 'great',
 'ok',
 'but',
 'oh my god',
 'you',
 "what's going on",
 'oh no',
 'dad',
 'about what',
 'me too',

In [71]:
#messages = messages.drop_duplicates(subset=['msg_pre_processed'])

In [72]:
#example of duplcated msg
messages[messages['msg_pre_processed'] == 'did you change your hair']

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49,L49,u0,m0,Did you change your hair?,No.,did you change your hair
151439,L151439,u4736,m314,Did you change your hair?,A little.,did you change your hair


In [73]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304713 entries, 49 to 666576
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   msg_line           304713 non-null  object
 1   user_id            304713 non-null  object
 2   movie_id           304713 non-null  object
 3   msg                304713 non-null  object
 4   msg_2              304713 non-null  object
 5   msg_pre_processed  304713 non-null  object
dtypes: object(6)
memory usage: 26.3+ MB


In [74]:
messages.describe()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed
count,304713,304713,304713,304713,304713,304713
unique,304713,9035,659,265277,192878,257095
top,L189750,u4525,m289,What?,-,what
freq,1,537,1530,1679,83097,1985


### Removing nan msg origined by '' messages

In [75]:
#filling the nan messages with a string
#messages = messages.fillna('UNKNOWN')

### Removing apostrophes (need for embedding)

In [76]:
#The quality of the model became worst
#messages['msg_pre_processed'] = [ word.replace("\'","") for word in messages['msg_pre_processed']]

### Filling '-' messages with a generic one

In [77]:
#return generic answer
def generic_answer(txt):
  asw_list = ['talk more about it',
              'can you explain it better?',
              'I need to think more about it',
              'maybe...'
              ]
  if txt == '-':
    return random.choice(asw_list)
  return txt

In [78]:
#seting a generic answer to the messages without answer
messages['msg_2'] = [generic_answer(msg) for msg in messages['msg_2']]

In [79]:
messages.head(30)

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
49,L49,u0,m0,Did you change your hair?,No.,did you change your hair
50,L50,u3,m0,No.,You might wanna think about it,no
51,L51,u0,m0,You might wanna think about it,talk more about it,you might wanna think about it
59,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you
60,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...
61,L61,u9,m0,It was a bratwurst. I was eating lunch.,With the teeth of your zipper?,it wa a bratwurst i wa eating lunch
62,L62,u8,m0,With the teeth of your zipper?,I need to think more about it,with the teeth of your zipper
63,L63,u7,m0,You the new guy?,So they tell me...,you the new guy
64,L64,u2,m0,So they tell me...,C'mon. I'm supposed to give you the tour.,so they tell me
65,L65,u7,m0,C'mon. I'm supposed to give you the tour.,maybe...,c'mon i'm supposed to give you the tour


### Tagging the msg with classes

In [80]:
def define_target(corpus):
    
    if '?' in corpus:
        return 1
    else:
        return 0

In [81]:
data = messages['msg']

In [82]:
messages['target'] = [define_target(m) for m in data]

In [83]:
messages['target'] = messages['target'].astype(int)

In [84]:
messages.head(20)

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed,target
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
49,L49,u0,m0,Did you change your hair?,No.,did you change your hair,1
50,L50,u3,m0,No.,You might wanna think about it,no,0
51,L51,u0,m0,You might wanna think about it,talk more about it,you might wanna think about it,0
59,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you,0
60,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...,0
61,L61,u9,m0,It was a bratwurst. I was eating lunch.,With the teeth of your zipper?,it wa a bratwurst i wa eating lunch,0
62,L62,u8,m0,With the teeth of your zipper?,I need to think more about it,with the teeth of your zipper,1
63,L63,u7,m0,You the new guy?,So they tell me...,you the new guy,1
64,L64,u2,m0,So they tell me...,C'mon. I'm supposed to give you the tour.,so they tell me,0
65,L65,u7,m0,C'mon. I'm supposed to give you the tour.,maybe...,c'mon i'm supposed to give you the tour,0


### Save data

In [86]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304713 entries, 49 to 666576
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   msg_line           304713 non-null  object
 1   user_id            304713 non-null  object
 2   movie_id           304713 non-null  object
 3   msg                304713 non-null  object
 4   msg_2              304713 non-null  object
 5   msg_pre_processed  304713 non-null  object
 6   target             304713 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 28.6+ MB


In [89]:
messages

Unnamed: 0_level_0,msg_line,user_id,movie_id,msg,msg_2,msg_pre_processed,target
msg_line_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
49,L49,u0,m0,Did you change your hair?,No.,did you change your hair,1
50,L50,u3,m0,No.,You might wanna think about it,no,0
51,L51,u0,m0,You might wanna think about it,talk more about it,you might wanna think about it,0
59,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you,0
60,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...,0
...,...,...,...,...,...,...,...
666522,L666522,u9034,m616,So far only their scouts. But we have had repo...,maybe...,so far only their scout but we have had report...,0
666546,L666546,u9027,m616,Splendid site Crealock splendil I want to esta...,Certainly Sin,splendid site crealock splendil i want to esta...,0
666547,L666547,u9029,m616,Certainly Sin,talk more about it,certainly sin,0
666575,L666575,u9028,m616,Choose your targets men. That's right Watch th...,Keep steady. You're the best shots of the Twen...,choose your target men that's right watch thos...,0


In [85]:
messages.to_csv('./chatdata/movie_lines_pre_processed.tsv', index=False, sep='\t', header=True)