## O objetivo do projeto é criar um chatbot baseado em diálogos de filmes, para que se possa fazer perguntas e manter uma conversa livre

- link do banco de dados https://www.kaggle.com/Cornell-University/movie-dialog-corpus?select=movie_lines.tsv
- referências
>- https://shanebarker.com/blog/deep-learning-chatbot/
>- https://towardsdatascience.com/how-to-create-a-chatbot-with-python-deep-learning-in-less-than-an-hour-56a063bdfc44

In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Display settings for data visualisation: two decimal places for NumPy and
# pandas, and up to 500 rows / 500 columns shown for DataFrames.
np.set_printoptions(threshold=None, precision=2)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# Use the fully qualified key: option names are matched as patterns, and the
# bare 'precision' also matches 'styler.format.precision' in newer pandas,
# which makes set_option raise OptionError.
pd.set_option('display.precision', 2)

### Opening movie characters metadata

In [3]:
# Load the tab-separated character metadata. The file has no header row and
# is read with quoting disabled (quoting=3 is csv.QUOTE_NONE).
messages = pd.read_csv(
    './chatdata/movie_characters_metadata.tsv',
    sep="\t",
    header=None,
    quoting=3,
    encoding='ISO-8859-2',
)

In [4]:
# Give the six positional columns readable names
messages.columns = [
    'user_id',
    'user_name',
    'movie_id',
    'movie_name',
    'tag_1',
    'tag_2',
]

In [5]:
# Preview the first ten rows
messages.head(n=10)

Unnamed: 0,user_id,user_name,movie_id,movie_name,tag_1,tag_2
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6
5,u5,KAT,m0,10 things i hate about you,f,2
6,u6,MANDELLA,m0,10 things i hate about you,f,7
7,u7,MICHAEL,m0,10 things i hate about you,m,5
8,u8,MISS PERKY,m0,10 things i hate about you,?,?
9,u9,PATRICK,m0,10 things i hate about you,m,1


In [6]:
# Summarise the column dtypes and non-null counts
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9035 entries, 0 to 9034
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     9035 non-null   object
 1   user_name   9035 non-null   object
 2   movie_id    9035 non-null   object
 3   movie_name  9035 non-null   object
 4   tag_1       9035 non-null   object
 5   tag_2       9035 non-null   object
dtypes: object(6)
memory usage: 423.6+ KB


In [7]:
# Per-column count, number of unique values, most frequent value and its frequency
messages.describe()

Unnamed: 0,user_id,user_name,movie_id,movie_name,tag_1,tag_2
count,9035,9035,9035,9035,9035,9035
unique,9035,5357,617,617,5,58
top,u6123,MAN,m289,casino,?,?
freq,1,44,44,44,6020,6339


## Pre-processing data

### user_id

In [8]:
# Working reference to the user_id column; the following cells read it through `data`
data = messages['user_id']

In [9]:
# Remove every double-quote character from the user ids
def _strip_quotes(raw_id):
    """Return raw_id with all double-quote characters removed."""
    return raw_id.replace('\"', '')

messages['user_id'] = list(map(_strip_quotes, data))
# Re-bind `data` to the cleaned column
data = messages['user_id']

In [10]:
# Collect the text that precedes the first 'u' in each id. A result of {''}
# means no id has any characters before its first 'u'.
data_clean = {user_id.split('u')[0] for user_id in data}
data_clean

{''}

### user_name

In [9]:
# Working reference to the user_name column; the following cells read it through `data`
data = messages['user_name']

In [10]:
# Collapse each run of consecutive spaces in a name into a single space
space_run = re.compile(r" +")
messages['user_name'] = [space_run.sub(" ", name) for name in data]
# Re-bind `data` to the cleaned column
data = messages['user_name']

In [11]:
# Replace every user_name that is exactly 'MAN' with 'MAN' + a random suffix,
# so the many characters called MAN get different nicknames.
import random

# A dedicated generator with a fixed seed: a fresh run of the notebook now
# produces the same nicknames every time (the module-level random.random()
# was never seeded).
man_rng = random.Random(42)

# Series.map returns a new Series with the same index, so nothing is written
# element-by-element into a column of `messages` and the code no longer
# assumes integer labels 0..n-1.
data = data.map(
    lambda name: 'MAN' + str(man_rng.random()) if name == 'MAN' else name
)

messages['user_name'] = data

In [12]:
# Inspect the row whose user_id is 'u610'
is_u610 = messages['user_id'] == 'u610'
messages.loc[is_u610]

Unnamed: 0,user_id,user_name,movie_id,movie_name,tag_1,tag_2
610,u610,MAN0.5417786984991223,m38,bottle rocket,?,?


In [15]:
# Count how many times each user_name occurs.
# NOTE(review): the name `dict` shadows the builtin type; it is kept because
# a later cell reads this variable under that name.
dict = {}
for name in data:
    dict[name] = dict.get(name, 0) + 1

In [26]:
# Append a random suffix to the user names so that they become distinct.
# random.random() takes no arguments, so the original call
# `random.random(sed=42)` raised TypeError. A generator seeded with 42 is
# used instead, presumably matching the reproducibility that call aimed for.
suffix_rng = random.Random(42)
for i in range(len(data)):
    # Direct membership test on the mapping (no list rebuilt per iteration).
    # NOTE(review): `dict` was built from these same names, so when the cells
    # run in order this is true for every row and every name gets a suffix.
    # Confirm whether only names with a count > 1 were meant.
    if data[i] in dict:
        data[i] = data[i] + str(suffix_rng.random())

In [27]:
# Replace each run of characters other than letters and digits with one space
non_alnum_run = re.compile('[^A-Za-z0-9]+')
messages['user_name'] = [non_alnum_run.sub(' ', name) for name in data]
# Re-bind `data` to the cleaned column
data = messages['user_name']

In [28]:
# Check that all user names are unique: the `unique` count for user_name
# should equal its `count`
messages.describe()

Unnamed: 0,user_id,user_name,movie_id,movie_name,tag_1,tag_2
count,9035,9035,9035,9035,9035,9035
unique,9035,9035,617,617,5,58
top,u6123,GEORGE0 09089621489912036,m289,casino,?,?
freq,1,1,44,44,6020,6339


In [96]:
# TODO: a user database is still needed

## Save to file

In [None]:
# TODO: save the processed data to a file