## A intenção do projeto é criar um chatbot baseado em reviews de filmes para que se possa fazer perguntas e manter uma conversa livre

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
> -https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [81]:
import pandas as pd
import re
import numpy as np

In [82]:
#setting parameters for data visualization
np.set_printoptions(threshold=None, precision=2)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('precision', 2)

### Opening movie reviews

In [83]:
messages = pd.read_csv('./chatdata/movie_lines.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [84]:
messages.columns = ['msg_line', 'user_id', 'movie_id', 'user_name', 'msg']

In [85]:
messages.head(10)

Unnamed: 0,msg_line,user_id,movie_id,user_name,msg
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.
5,L924,u2,m0,CAMERON,Wow
6,L872,u0,m0,BIANCA,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,CAMERON,No
8,"""L870",u0,m0,BIANCA,I'm kidding. You know how sometimes you just ...
9,L869,u0,m0,BIANCA,Like my fear of wearing pastels?


In [86]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   msg_line   304713 non-null  object
 1   user_id    304713 non-null  object
 2   movie_id   304713 non-null  object
 3   user_name  304670 non-null  object
 4   msg        303815 non-null  object
dtypes: object(5)
memory usage: 11.6+ MB


In [87]:
messages.describe()

Unnamed: 0,msg_line,user_id,movie_id,user_name,msg
count,304713,304713,304713,304670,303815
unique,304713,9035,659,5929,265093
top,L362967,u4525,m289,JACK,What?
freq,1,537,1530,3032,1679


## Pre-processing data

### msg_line

In [88]:
messages['msg_line'].size

304713

In [89]:
#removing " (quotes)
messages['msg_line'] = [line.replace('\"','') for line in messages['msg_line']]

### user_id

In [90]:
#nothing to do

### user_name

In [91]:
data = messages['user_name']

In [92]:
#There are some user_name concatenaed with messages. For these, the value will be copied to msg column
incorrect_user_names = set([n for n in data if not(str(n).isupper())])
incorrect_user_names

{"A.T.L I'm not sure.  We have no launch protocol; the entry of the passenger is supposed to initiate activation.",
 "A.T.L There is no abort procedure -- we don't know how we turned the damn thing on let alone how to turn it off.",
 "A.T.L There's no plug to pull.",
 'A.T.L We have benzel activation repeat we have benzel activation.  Control to Arroway you okay in there?  Repeat Control to Arroway come back.',
 "A.T.L We've lost contact.",
 "BUSTER You're* a doctor Homer--you don't smell like ether.",
 'C.O ""We\'ll all have lunch.""  Good idea. Oh and let\'s be sure to invite this sociologist too -- just in case we want to have a FUCKING BRIDGE GAME AFTERWARDS!"',
 "C.O -- nothing I can do about it unless you're suggesting I infringe on their civil liberties -- which I'd happily do if you'll just trim a little fat off the Constitution.",
 "C.O All right lieutenant give me a name and specifics I'll have the X.O. file an action first thing in the morning.  A name?",
 "C.O And if you ju

In [93]:
#copy incorrec_user_name to msg column
for i in range(len(data)):
    if data[i] in incorrect_user_names:
        messages['msg'][i] = data[i]

In [94]:
#checking if the data is copied
messages[messages['user_name'] == 'C.O Because I was civil now you\'re complaining.']

Unnamed: 0,msg_line,user_id,movie_id,user_name,msg
31938,L241782,u1008,m66,C.O Because I was civil now you're complaining.,C.O Because I was civil now you're complaining.


In [95]:
#the user name will not be used in this database as there is the user id
messages = messages.drop(columns=['user_name'])

In [96]:
messages.describe()

Unnamed: 0,msg_line,user_id,movie_id,msg
count,304713,304713,304713,304405
unique,304713,9035,659,265679
top,L362967,u4525,m289,What?
freq,1,537,1530,1679


In [97]:
#saving data to file because msg column depends on it
messages.to_csv('./chatdata/movie_lines_normalized.tsv', index=False, sep='\t', header=False)

### msg

In [98]:
messages = pd.read_csv('./chatdata/movie_lines_normalized.tsv', header = None, delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [99]:
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg']

In [100]:
messages.head(10)

Unnamed: 0,msg_line,user_id,movie_id,msg
0,L1045,u0,m0,They do not!
1,L1044,u2,m0,They do to!
2,L985,u0,m0,I hope so.
3,L984,u2,m0,She okay?
4,L925,u0,m0,Let's go.
5,L924,u2,m0,Wow
6,L872,u0,m0,Okay -- you're gonna need to learn how to lie.
7,L871,u2,m0,No
8,L870,u0,m0,"""I'm kidding. You know how sometimes you just..."
9,L869,u0,m0,Like my fear of wearing pastels?


In [101]:
data = messages['msg']

In [102]:
#remove the name of the users

In [106]:
#replace nan msg with UNKNOWN
messages = messages.fillna('UNKNOWN')
data = messages['msg']

In [107]:
#replace '' messages
messages['msg'] = [re.sub(r' +', ' ', m) for m in data]
messages = messages.replace(' ', 'NO ANSWER')

In [108]:
messages[messages['msg'] == 'nan']

Unnamed: 0,msg_line,user_id,movie_id,msg


In [109]:
messages[messages['msg'].isna()]

Unnamed: 0,msg_line,user_id,movie_id,msg


In [25]:
#check non-english words

## Save to file

In [110]:
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304713 entries, 0 to 304712
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   msg_line  304713 non-null  object
 1   user_id   304713 non-null  object
 2   movie_id  304713 non-null  object
 3   msg       304713 non-null  object
dtypes: object(4)
memory usage: 9.3+ MB


In [111]:
messages.describe()

Unnamed: 0,msg_line,user_id,movie_id,msg
count,304713,304713,304713,304713
unique,304713,9035,659,265277
top,L362967,u4525,m289,What?
freq,1,537,1530,1679


In [112]:
messages.to_csv('./chatdata/movie_lines_normalized.tsv', index=False, sep='\t', header=False)