In [1]:
from dateutil.parser import parse
from IPython.display import display
import numpy as np
import pandas as pd
import datetime
import nltk
import re
import string

nltk.download('stopwords')

EPOCH = datetime.datetime(1970, 1, 1)
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/gdsv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
reviews = pd.read_csv('./scraping/data/reviews.csv')
details = pd.read_csv('./scraping/data/details.csv')

# Etapa 2 — Pré-processamento de dados
## 2.1 Ajuste de tipos de atributos

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean sodales varius bibendum. Aliquam erat volutpat. Integer pretium nisi sit amet dolor vestibulum dapibus. Donec ipsum mi, sollicitudin eget leo eget, sollicitudin consectetur sapien. Cras ligula purus, mollis at elementum sit amet, ullamcorper eu sem. Integer vitae magna vehicula dolor consequat consectetur. Nulla nec arcu ac nibh luctus sagittis.

Nulla tincidunt risus in est fringilla, ut venenatis neque rhoncus. Praesent euismod et urna tempus accumsan. Mauris ultricies dolor vel lacus pretium auctor. Sed sed placerat justo. Nullam viverra neque interdum metus aliquet, non euismod neque interdum. Quisque interdum egestas magna, vestibulum feugiat nisl accumsan et. Nam finibus pulvinar hendrerit. Sed nec pulvinar ligula.

In [3]:
print(reviews.dtypes)

movie           object
reviewer        object
metascore       object
review_score     int64
review_text     object
review_date     object
details         object
dtype: object


Duis auctor molestie aliquet. Integer nibh dolor, tristique a varius at, vehicula eu odio. Vestibulum nulla nisl, maximus euismod felis quis, mollis iaculis odio. Nulla vitae ipsum at nulla eleifend bibendum. Donec semper, ligula nec scelerisque facilisis, magna lacus auctor ex, in lacinia velit augue ac turpis. Proin vestibulum ornare mauris, ac accumsan justo. Praesent quis mi commodo dolor pharetra commodo id fermentum eros. Ut egestas arcu vitae consequat pellentesque. Pellentesque sagittis felis sed enim fermentum, non fringilla neque tincidunt. Nam tempor, sem et tincidunt dictum, nulla sapien auctor ex, eu rhoncus nisl nisl laoreet nunc. Maecenas hendrerit id felis eu mollis. Pellentesque nec pellentesque eros, a euismod mauris. Maecenas varius massa eu felis dignissim ornare.

In [4]:
def metascore_parser(score):
    if score != 'tbd':
        return int(score)
    return np.nan

def date_parser(date):
    global EPOCH
    if date is np.nan or date == 'TBA':
        return EPOCH
    return parse(date)

def tokenize(text):
    global STOPWORDS
    tokens = []
    for word in text.lower().split():
        if word not in STOPWORDS:
            exp = re.compile('[{}]+'.format(string.punctuation))
            tokens.append(re.sub(exp, '', word))
    return tokens

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean sodales varius bibendum. Aliquam erat volutpat. Integer pretium nisi sit amet dolor vestibulum dapibus. Donec ipsum mi, sollicitudin eget leo eget, sollicitudin consectetur sapien. Cras ligula purus, mollis at elementum sit amet, ullamcorper eu sem. Integer vitae magna vehicula dolor consequat consectetur. Nulla nec arcu ac nibh luctus sagittis.

In [5]:
reviews.review_text = reviews['review_text'].apply(tokenize)
reviews.metascore = reviews['metascore'].apply(metascore_parser)
reviews.review_date = reviews['review_date'].apply(date_parser)
reviews.reviewer = reviews['reviewer'].astype('category')
reviews = reviews.drop('details', axis=1)

display(reviews.head())
display(reviews.dtypes)

Unnamed: 0,movie,reviewer,metascore,review_score,review_text,review_date
0,The Grandmaster,Manohla Dargis,73.0,100,"[grandmaster, is, persuasive, triumph, style, ...",2013-08-22
1,Of Gods and Men,A.O. Scott,86.0,100,"[gods, men, supple, suspenseful, appropriately...",2011-02-24
2,Kubo and the Two Strings,Glenn Kenny,84.0,100,"[action, gorgeously, fluid, idiosyncratic, 3d,...",2016-08-18
3,Jane,Ben Kenigsberg,87.0,100,"[jane, delight, familiar, ms, goodall, provide...",2017-10-19
4,Bird People,A.O. Scott,70.0,100,"[plot, twists, ms, ferran, here, transform, —,...",2014-09-11


movie                   object
reviewer              category
metascore              float64
review_score             int64
review_text             object
review_date     datetime64[ns]
dtype: object

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean sodales varius bibendum. Aliquam erat volutpat. Integer pretium nisi sit amet dolor vestibulum dapibus. Donec ipsum mi, sollicitudin eget leo eget, sollicitudin consectetur sapien. Cras ligula purus, mollis at elementum sit amet, ullamcorper eu sem. Integer vitae magna vehicula dolor consequat consectetur. Nulla nec arcu ac nibh luctus sagittis.

Nulla tincidunt risus in est fringilla, ut venenatis neque rhoncus. Praesent euismod et urna tempus accumsan. Mauris ultricies dolor vel lacus pretium auctor. Sed sed placerat justo. Nullam viverra neque interdum metus aliquet, non euismod neque interdum. Quisque interdum egestas magna, vestibulum feugiat nisl accumsan et. Nam finibus pulvinar hendrerit. Sed nec pulvinar ligula.

In [6]:
print(details.dtypes)

movie           object
release_date    object
acclaim_rate    object
user_score      object
director        object
genres          object
languages       object
dtype: object


Duis auctor molestie aliquet. Integer nibh dolor, tristique a varius at, vehicula eu odio. Vestibulum nulla nisl, maximus euismod felis quis, mollis iaculis odio. Nulla vitae ipsum at nulla eleifend bibendum. Donec semper, ligula nec scelerisque facilisis, magna lacus auctor ex, in lacinia velit augue ac turpis. Proin vestibulum ornare mauris, ac accumsan justo. Praesent quis mi commodo dolor pharetra commodo id fermentum eros. Ut egestas arcu vitae consequat pellentesque. Pellentesque sagittis felis sed enim fermentum, non fringilla neque tincidunt. Nam tempor, sem et tincidunt dictum, nulla sapien auctor ex, eu rhoncus nisl nisl laoreet nunc. Maecenas hendrerit id felis eu mollis. Pellentesque nec pellentesque eros, a euismod mauris. Maecenas varius massa eu felis dignissim ornare.

In [7]:
def userscore_parser(score):
    if score != 'tbd':
        return float(score)*10
    return np.nan

In [8]:
details.acclaim_rate = details['acclaim_rate'].astype('category')
details.director = details['director'].astype('category')
details.genres = details['genres'].astype('category')
details.languages = details['languages'].astype('category')
details.user_score = details['user_score'].apply(userscore_parser)
details.release_date = details['release_date'].apply(date_parser)

display(details.head())
display(details.dtypes)

Unnamed: 0,movie,release_date,acclaim_rate,user_score,director,genres,languages
0,The Grandmaster,2013-08-23,Generally favorable reviews,67.0,Kar Wai Wong,"['Action', 'Biography', 'Drama']","['Mandarin', 'Japanese', 'Cantonese']"
1,Of Gods and Men,2011-02-25,Generally favorable reviews,70.0,Xavier Beauvois,"['Drama', 'History']","['French', 'Arabic']"
2,Kubo and the Two Strings,2016-08-19,Universal acclaim,81.0,Travis Knight,"['Adventure', 'Fantasy', 'Animation', 'Family']",['English']
3,Jane,2017-10-20,No score yet,,Brett Morgen,"['Biography', 'Documentary']",['English']
4,Bird People,2014-09-12,Generally favorable reviews,70.0,Pascale Ferran,"['Drama', 'Fantasy', 'Romance']","['English', 'French', 'Japanese']"


movie                   object
release_date    datetime64[ns]
acclaim_rate          category
user_score             float64
director              category
genres                category
languages             category
dtype: object

## 2.2 Tratamento de dados ausentes

## 2.3 Discretização

## 2.4 Integração de dados

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean sodales varius bibendum. Aliquam erat volutpat. Integer pretium nisi sit amet dolor vestibulum dapibus. Donec ipsum mi, sollicitudin eget leo eget, sollicitudin consectetur sapien. Cras ligula purus, mollis at elementum sit amet, ullamcorper eu sem. Integer vitae magna vehicula dolor consequat consectetur. Nulla nec arcu ac nibh luctus sagittis.

In [9]:
display(details[details.duplicated()])
display(details[details.movie=='I Love You, Daddy'])

Unnamed: 0,movie,release_date,acclaim_rate,user_score,director,genres,languages
10526,"I Love You, Daddy",1970-01-01,No score yet,,Louis C.K.,"['Drama', 'Comedy']",['English']


Unnamed: 0,movie,release_date,acclaim_rate,user_score,director,genres,languages
2779,"I Love You, Daddy",1970-01-01,No score yet,,Louis C.K.,"['Drama', 'Comedy']",['English']
10526,"I Love You, Daddy",1970-01-01,No score yet,,Louis C.K.,"['Drama', 'Comedy']",['English']


Duis auctor molestie aliquet. Integer nibh dolor, tristique a varius at, vehicula eu odio. Vestibulum nulla nisl, maximus euismod felis quis, mollis iaculis odio. Nulla vitae ipsum at nulla eleifend bibendum. Donec semper, ligula nec scelerisque facilisis, magna lacus auctor ex, in lacinia velit augue ac turpis. Proin vestibulum ornare mauris, ac accumsan justo. Praesent quis mi commodo dolor pharetra commodo id fermentum eros. Ut egestas arcu vitae consequat pellentesque. Pellentesque sagittis felis sed enim fermentum, non fringilla neque tincidunt. Nam tempor, sem et tincidunt dictum, nulla sapien auctor ex, eu rhoncus nisl nisl laoreet nunc. Maecenas hendrerit id felis eu mollis. Pellentesque nec pellentesque eros, a euismod mauris. Maecenas varius massa eu felis dignissim ornare.

In [10]:
display(reviews[reviews.movie=='I Love You, Daddy'])

Unnamed: 0,movie,reviewer,metascore,review_score,review_text,review_date
2779,"I Love You, Daddy",Manohla Dargis,56.0,80,"[“zama”, love, you, daddy, two, best, movies, ...",1970-01-01
10526,"I Love You, Daddy",Manohla Dargis,56.0,40,"[watched, love, you, daddy, second, time, joke...",1970-01-01


Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aenean sodales varius bibendum. Aliquam erat volutpat. Integer pretium nisi sit amet dolor vestibulum dapibus. Donec ipsum mi, sollicitudin eget leo eget, sollicitudin consectetur sapien. Cras ligula purus, mollis at elementum sit amet, ullamcorper eu sem. Integer vitae magna vehicula dolor consequat consectetur. Nulla nec arcu ac nibh luctus sagittis.

Nulla tincidunt risus in est fringilla, ut venenatis neque rhoncus. Praesent euismod et urna tempus accumsan. Mauris ultricies dolor vel lacus pretium auctor. Sed sed placerat justo. Nullam viverra neque interdum metus aliquet, non euismod neque interdum. Quisque interdum egestas magna, vestibulum feugiat nisl accumsan et. Nam finibus pulvinar hendrerit. Sed nec pulvinar ligula.

In [11]:
movie_reviews = pd.merge(reviews, details, on='movie', left_index=True, right_index=True)
display(movie_reviews.head())
display(movie_reviews.shape)

Unnamed: 0,movie,reviewer,metascore,review_score,review_text,review_date,release_date,acclaim_rate,user_score,director,genres,languages
0,The Grandmaster,Manohla Dargis,73.0,100,"[grandmaster, is, persuasive, triumph, style, ...",2013-08-22,2013-08-23,Generally favorable reviews,67.0,Kar Wai Wong,"['Action', 'Biography', 'Drama']","['Mandarin', 'Japanese', 'Cantonese']"
1,Of Gods and Men,A.O. Scott,86.0,100,"[gods, men, supple, suspenseful, appropriately...",2011-02-24,2011-02-25,Generally favorable reviews,70.0,Xavier Beauvois,"['Drama', 'History']","['French', 'Arabic']"
2,Kubo and the Two Strings,Glenn Kenny,84.0,100,"[action, gorgeously, fluid, idiosyncratic, 3d,...",2016-08-18,2016-08-19,Universal acclaim,81.0,Travis Knight,"['Adventure', 'Fantasy', 'Animation', 'Family']",['English']
3,Jane,Ben Kenigsberg,87.0,100,"[jane, delight, familiar, ms, goodall, provide...",2017-10-19,2017-10-20,No score yet,,Brett Morgen,"['Biography', 'Documentary']",['English']
4,Bird People,A.O. Scott,70.0,100,"[plot, twists, ms, ferran, here, transform, —,...",2014-09-11,2014-09-12,Generally favorable reviews,70.0,Pascale Ferran,"['Drama', 'Fantasy', 'Romance']","['English', 'French', 'Japanese']"


(12411, 12)