In [None]:
import ast
import numpy as np
from scipy.stats import iqr
from pprint import pprint
from datetime import datetime
import re
import string

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '{:⎯>13.3f}'.format(x))

# plotting
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter

# fluff
from jupyterthemes import jtplot
jtplot.style()
from kindlib.fluff import *

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Load the pickled movies metadata frame from the local data directory.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
mdf = pd.read_pickle('data/movies_metadata.pkl')

# Quick look at the available columns and the frame's dimensions.
pprint(list(mdf.columns))
mdf.shape

### <center>Caracterização</center>

In [None]:
# Per-column summary statistics; Colstats, p and display are not imported by name
# in this file (presumably provided by the kindlib.fluff star import / IPython).
colstats = Colstats(mdf)

print('Total number of entries:', len(mdf))
colstat = []  # not used in this cell; kept so the cell still defines the same names
for idx, row in colstats.iterrows():
    # One-row frame: index = feature name, columns = that feature's statistics.
    # The name is read from the row itself rather than via colstats['col'].iloc[idx],
    # which mixed an index label with a positional lookup.
    df = row.drop('col').to_frame(name=row['col']).T
    df.index.name = 'Feature'
    # Scalar lookup: int() on a one-element Series is deprecated in recent pandas.
    if int(row['nunique']) <= 10:
        # Few distinct values: show them next to the statistics.
        df2 = pd.DataFrame(mdf[row['col']].unique(), columns=['values'])
        p(df, df2)
    else:
        display(df)
    print('')

In [None]:
# Column names of the loaded frame, for reference when assigning a type to each column.
mdf.columns

In [None]:
# Intended type of every metadata column, grouped by kind.
typecol = dict(
    json=[
        'belongs_to_collection',
        'genres',
        'production_companies',
        'production_countries',
        'spoken_languages',
    ],
    string=[
        'imdb_id',
        'original_language',
        'original_title',
        'overview',
        'status',
        'tagline',
        'title',
    ],
    url=['homepage'],
    datetime=['release_date'],
    int=[
        'budget',
        'id',
        'revenue',
        'runtime',
        'vote_count',
    ],
    float=[
        'popularity',
        'vote_average',
    ],
    bool=[
        'adult',
        'video',
    ],
    other=['poster_path'],
)

# Columns that are not used by the analysis and get dropped.
unused_cols = [
    'homepage',
    'original_language',
    'overview',
    'poster_path',
    'tagline',
    'video',
]

# Reverse lookup (column -> kind) restricted to the columns that are kept.
coltype = dict(
    (column, kind)
    for kind, columns in typecol.items()
    for column in columns
    if column not in unused_cols
)

# Keep only the columns that have a declared type and are not flagged as unused.
mdf = mdf.drop(columns=[c for c in mdf.columns if c not in coltype])

# Datetime columns: parse, and keep only the rows whose value could be parsed.
for c in typecol['datetime']:
    # pd.to_datetime already yields a datetime64 Series, so no further cast is needed
    # (a unit-less astype(np.datetime64) is rejected by pandas 2, and
    # infer_datetime_format is deprecated there).
    timedf = pd.to_datetime(
        clean(mdf[c].dropna(), wsep='-', asep=' '), errors='coerce').dropna()
    # timedf.index holds index *labels*, so select with .loc; positional .iloc only
    # matched by accident while the frame still had a pristine 0..n-1 index.
    mdf = mdf.loc[timedf.index].assign(**{c: timedf})
del timedf

# Numeric columns: coerce, drop the rows that failed to parse, then fix the dtype.
# All int columns are processed before the float columns, as before.
for kind in ('int', 'float'):
    for c in typecol[kind]:
        mdf = (
            mdf.assign(**{c: pd.to_numeric(mdf[c], errors='coerce')})
            .dropna(subset=[c])
            .astype({c: kind})
        )

# Boolean columns: convert each column from *its own* values. The previous loop
# always read 'adult', which re-created the dropped 'video' column from 'adult'
# data, and it relied on DataFrame.append (removed in pandas 2.0).
bool_values = {'True': True, 'False': False, True: True, False: False}
for c in typecol['bool']:
    if c not in mdf.columns:
        continue  # column was dropped above as unused
    # Rows with an unrecognised value are dropped, consistent with the numeric columns.
    mdf = (
        mdf.assign(**{c: mdf[c].map(bool_values)})
        .dropna(subset=[c])
        .astype({c: bool})
    )

In [None]:
# Check the dtype of every column after the type coercion.
mdf.dtypes

Unreliable columns: <br><br>

| Dropped columns       | Reason                       | Alternative                              |
| --------------------- |:----------------------------:| ----------------------------------------:|
| belongs_to_collection | too many nans                | &nbsp;&nbsp;convert to bool              |
| homepage              | too many nans                | &nbsp;&nbsp;convert to bool              |
| runtime               | estimation pollutes analysis | &nbsp;&nbsp;use mean or median of others |

<br>Let's drop them:

In [None]:
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
mdf = mdf.drop(columns=['belongs_to_collection', 'runtime'], errors='ignore')  # homepage was already dropped

In [None]:
# removing invalid dupes
print('Shape before removing dupes:', mdf.shape)
# duplicated() treats missing values as equal to each other, so rows that merely
# lack an imdb_id must not be flagged as copies of one another.
dupe_mask = mdf['id'].duplicated() | (mdf['imdb_id'].notna() & mdf['imdb_id'].duplicated())
# Boolean filtering removes exactly the flagged rows (dropping by index label would
# also remove any other row sharing that label).
mdf = mdf[~dupe_mask]
print('Shape after removing dupes:', mdf.shape)

# removing entries with zero budget or zero revenue
has_budget = mdf['budget'] != 0
has_revenue = mdf['revenue'] != 0
print('Number of movies with budget info:', int(has_budget.sum()))
print('Number of movies with revenue info:', int(has_revenue.sum()))
print('Number of movies with budget and revenue info:', int((has_budget & has_revenue).sum()))

# Explicit copy so that later cells can modify moneydf without touching mdf.
moneydf = mdf[has_budget & has_revenue].copy()

In [None]:

# Per-column summary statistics for the movies that have budget and revenue data.
colstats = Colstats(moneydf)
for idx, row in colstats.iterrows():
    # One-row frame: index = feature name, columns = that feature's statistics.
    # The name is read from the row itself rather than via colstats['col'].iloc[idx],
    # which mixed an index label with a positional lookup.
    df = row.drop('col').to_frame(name=row['col']).T
    df.index.name = 'Feature'
    # Scalar lookup: int() on a one-element Series is deprecated in recent pandas.
    if int(row['nunique']) <= 10:
        # Few distinct values: show them next to the statistics.
        df2 = pd.DataFrame(moneydf[row['col']].unique(), columns=['values'])
        p(df, df2)
    else:
        display(df)
    print('')
# Numeric summary of the main quantitative features.
moneydf[['budget', 'popularity', 'revenue', 'vote_average']].describe().round(3)

<br><br>Now that our base is looking good for the basic types, let's try some plotting:

In [None]:
# Kernel density estimate of each numeric feature, one figure per feature and one
# curve per bandwidth rule, zoomed to the Tukey fences (quartiles -/+ 1.5 * IQR).
bw_methods = ['scott', 'silverman'] #, 0.1, 0.25, 0.5, 0.75, 1.0]
for c in ['budget', 'popularity', 'revenue', 'vote_average']:
    values = moneydf[c]
    # The fences do not depend on the bandwidth, so compute them once per feature.
    spread = iqr(values.values) * 1.5
    lower = values.quantile(0.25) - spread
    upper = values.quantile(0.75) + spread

    fig, ax = plt.subplots()
    for bw in bw_methods:
        # Draw on the figure's own axes instead of relying on the implicit current axes.
        values.plot.kde(bw_method=bw, ax=ax)
    ax.set_xlim(left=lower, right=upper)
    if upper > 0:
        # Use one common power of ten for the x tick labels; log10 is undefined for a
        # non-positive bound, in which case the default tick format is kept.
        max_exp = int(np.floor(np.log10(upper)))
        ax.ticklabel_format(axis='x', style='sci', scilimits=(max_exp, max_exp))
    ax.set_title(c, fontdict={'y': 0})
    ax.legend(bw_methods)
    plt.show()
    plt.close(fig)
pass

<hr>

Genres

In [None]:
# possibly interesting data
genre_data = {}

print('Total entries so far:', len(moneydf))

# ignoring possibility of genre id collision
# Each 'genres' cell is the string repr of a list of dicts. A plain loop replaces the
# previous DataFrame.apply call, which was used only for its side effects.
for raw_genres in moneydf['genres']:
    for g in ast.literal_eval(raw_genres):
        if not g:
            continue  # an empty dict carries no genre (unpacking it used to raise)
        # Positional unpacking, as before: the first value becomes the key and the
        # second the value (presumably id and name — confirm against the data).
        genre_key, genre_value = g.values()
        genre_data[genre_key] = genre_value

print('')
print('Number of distinct genres:', len(genre_data))
pprint(genre_data)

<hr>

### <center>Testes de Hipótese</center>

#### I. H<sub>0</sub>: O revenue de filmes do mesmo gênero não é influenciado pela data de lançamento.

In [None]:
# relevant info for H0
gdf = moneydf[['budget', 'revenue', 'genres', 'release_date']]

# one indicator column per genre name
genres = list(genre_data.values())

def set_genres(row:pd.Series, ginfo):
    """Return a 0/1 Series over ``row.index`` marking the genres listed in ``ginfo``.

    Parameters
    ----------
    row : pd.Series
        Only its index is used; it provides the labels of the result.
    ginfo : str
        String repr of a list of dicts, each with a 'name' key.

    Returns
    -------
    pd.Series
        Integer Series that is 1 at every label named in ``ginfo`` and 0 elsewhere.
    """
    genre_series = pd.Series(0, index=row.index, dtype=int)
    for genre_info in ast.literal_eval(ginfo):
        genre_series[genre_info['name']] = 1
    return genre_series

# Build the indicator block in one pass instead of zero-initialising every column
# through a lambda-based assign loop and then overwriting it with a row-wise apply.
blank_row = pd.Series(0, index=genres, dtype=int)
genre_flags = pd.DataFrame(
    [set_genres(blank_row, raw_genres) for raw_genres in gdf['genres']],
    index=gdf.index,
    columns=genres,
)

# updating genre df (index-aligned join; assumes the row index is unique)
gdf = (
    gdf.drop(columns=['genres'])
    .join(genre_flags)
    .assign(year=lambda frame: frame['release_date'].dt.year)
)
gdf.head(5)

In [None]:
# value correction -> real value = nominal value / gdp deflator
# Yearly deflator: average of the GDPCTPI observations within each calendar year.
price_deflator_df = pd.read_csv('data/GDPCTPI.csv').sort_values(by=['DATE'])
# pd.to_datetime replaces the unit-less astype(np.datetime64), which pandas 2 rejects.
price_deflator_df['DATE'] = pd.to_datetime(price_deflator_df['DATE'])
price_deflator_df = price_deflator_df.set_index(keys=['DATE'], verify_integrity=True)
price_deflator_df['y'] = price_deflator_df.index.year
price_deflator_df = pd.DataFrame(price_deflator_df.groupby(by=['y'])['GDPCTPI'].mean())
# every year between the first and the last one must be present
assert len(price_deflator_df) == max(price_deflator_df.index) - min(price_deflator_df.index) + 1

# Keep only the movies released in a year covered by the deflator series; years
# beyond its last entry used to raise KeyError in the lookup below.
first_year, last_year = min(price_deflator_df.index), max(price_deflator_df.index)
gdf = gdf[gdf['year'].between(first_year, last_year)].copy()
# Map each year to its scalar deflator (the previous apply + .loc on a one-column
# DataFrame produced a frame per row rather than a number).
gdf['gdpctpi'] = gdf['year'].map(price_deflator_df['GDPCTPI'])
# NOTE(review): if GDPCTPI is an index with base 100, dividing by it leaves the adjusted
# values a factor of 100 smaller than base-year currency — confirm the intended scale.
gdf['budget_adjusted'] = gdf['budget'] / gdf['gdpctpi']
gdf['revenue_adjusted'] = gdf['revenue'] / gdf['gdpctpi']
gdf['profit_2012'] = (gdf['revenue_adjusted'] - gdf['budget_adjusted']).round(2)

In [None]:
# Reorder the frame: monetary / date columns first, then the genre indicators.
value_cols = [
    'budget', 'budget_adjusted', 'revenue', 'revenue_adjusted', 'year',
    'profit_2012', 'gdpctpi', 'release_date',
]
genre_cols = [
    'Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Drama',
    'Romance', 'Action', 'Crime', 'Thriller', 'History', 'Science Fiction',
    'Mystery', 'Horror', 'War', 'Foreign', 'Documentary', 'Western',
    'Music', 'TV Movie',
]
gdf = gdf.loc[:, value_cols + genre_cols]

In [None]:
# Inspect the reordered frame.
gdf

### Let's see the big winners and big losers!

In [None]:
# Five largest adjusted profits: ascending sort, so they are the last five rows.
winners = gdf.sort_values(by=['profit_2012'])[-5:]
p(
    mdf.loc[winners.index][['title']],
    winners[['profit_2012']],
)

# Five smallest adjusted profits: descending sort, so they are the last five rows.
losers = gdf.sort_values(by=['profit_2012'], ascending=False)[-5:]
p(
    mdf.loc[losers.index][['title']],
    losers[['profit_2012']],
)
pass

In [None]:
# Column names of the genre frame, for reference when choosing what to plot.
gdf.columns

In [None]:
# Kernel density estimate of the adjusted money columns and the release year, one
# figure per column and one curve per bandwidth rule, zoomed to the Tukey fences
# (quartiles -/+ 1.5 * IQR).
bw_methods = ['scott', 'silverman'] #, 0.1, 0.25, 0.5, 0.75, 1.0]
for c in ['budget_adjusted', 'revenue', 'profit_2012', 'year']:
    values = gdf[c]
    # The fences do not depend on the bandwidth, so compute them once per column.
    spread = iqr(values.values) * 1.5
    lower = values.quantile(0.25) - spread
    upper = values.quantile(0.75) + spread

    fig, ax = plt.subplots()
    for bw in bw_methods:
        # Draw on the figure's own axes instead of relying on the implicit current axes.
        values.plot.kde(bw_method=bw, ax=ax)
    ax.set_xlim(left=lower, right=upper)
    if c == 'year':
        # Years read best as plain integers.
        ax.ticklabel_format(axis='x', style='plain')
    elif upper > 0:
        # Use one common power of ten for the x tick labels; log10 is undefined for a
        # non-positive bound, in which case the default tick format is kept.
        max_exp = int(np.floor(np.log10(upper)))
        ax.ticklabel_format(axis='x', style='sci', scilimits=(max_exp, max_exp))
    ax.set_title(c, fontdict={'y': 0})
    ax.legend(bw_methods)
    plt.show()
    plt.close(fig)
pass

#### II. H<sub>0</sub>: A média de idade dos 10 atores principais não está correlacionada com os ratings.

<hr>

### <center>Regressão</center>

In [None]:
# Distribution of the popularity score, used to pick the threshold for "popular".
mdf['popularity'].describe()

Vamos considerar como populares os filmes com valor de 'popularity' maior que 2 (um pouco menor que a média).

In [None]:
# Binary target: 1 when popularity > 2, else 0.
# assign() returns a new frame, so mdf keeps its original continuous popularity and
# the cell gives the same result when re-run (the previous `new_mdf = mdf` was only
# an alias, so the assignment overwrote mdf['popularity'] in place).
new_mdf = mdf.assign(popularity=(mdf['popularity'] > 2).astype(int))

In [None]:
# Class balance of the binary popularity target.
new_mdf['popularity'].value_counts()

In [None]:
# Predict the binary popularity label from the two vote features.
X, y = new_mdf[['vote_average', 'vote_count']], new_mdf['popularity']

# Hold out a quarter of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# fit() returns the estimator itself, so construction and fitting can be chained.
logistic_regression = LogisticRegression().fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

# Confusion matrix on the held-out rows.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# Share of held-out rows that were classified correctly.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)

A acurácia foi de aproximadamente 90%

In [None]:
# Peek at the held-out features with the rich DataFrame display instead of printing
# the whole frame as plain text.
X_test.head()

In [None]:
# trying the fitted model on other, hand-picked values
new_votes = {
    'vote_average': [6.7, 5.8, 3.6, 7.7, 9.8, 8.9],
    'vote_count': [10, 30, 49, 88, 19, 70],
}

# Same column order as the features the model was trained on.
df = pd.DataFrame.from_dict(new_votes)[['vote_average', 'vote_count']]
y_pred = logistic_regression.predict(df)
print(df)
print(y_pred)

In [None]:
# now also taking the movie's revenue into account
# assign() returns a new frame, so moneydf keeps its continuous popularity and the
# cell gives the same result when re-run (mutating moneydf in place made a second
# run label every row 0, since no 0/1 value is greater than 2).
new_mdf = moneydf.assign(popularity=(moneydf['popularity'] > 2).astype(int))
X = new_mdf[['vote_average', 'vote_count', 'revenue']]
y = new_mdf['popularity']

# Hold out a quarter of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# NOTE(review): 'revenue' is orders of magnitude larger than the vote features and the
# inputs are not scaled — check for convergence warnings and consider standardising.
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

# Confusion matrix on the held-out rows.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# Share of held-out rows that were classified correctly.
accuracy = logistic_regression.score(X_test, y_test)
print(accuracy)

A acurácia subiu para aproximadamente 91%.

#### Least Squares

### <center>Classificação</center>

#### Hidden Markov Model

### <center>Agrupamento</center>

#### DBScan

### <center>Relatório Final:</center>
<br>

#### Sua análise dos dados deve apresentar:
- Uma caracterização (análise exploratória) inicial dos dados **(2pts)**
- Pelo menos, dois testes de hipótese/intervalos de confiança **(2pts)**
    - Os ICs podem ser apresentados nos resultados de regressão e classificação abaixo.
    - Os testes de hipótese também podem ser utilizados abaixo para comparar modelos.
- Pelo menos uma regressão **(3pts)**
- Pelo menos um algoritmo de aprendizado/classificação **(3pts)**

#### No seu relatório, você deve apresentar pelo menos os seguintes pontos:
- Introdução com Motivação e Pergunta de Pesquisa
- Metodologia
- Descreva sua base
- Quais métodos e modelos foram utilizados. Justifique os mesmos.
- Resultados. Sugiro separar em
    - Caracterização (análise exploratória)
    - Testes de hipótese podem vir aqui.
    - Previsão (uma ou duas sub-seções dependendo dos modelos utilizados)
    - Conclusões

Responda suas perguntas:
- Qual a melhor época do ano para anunciar e lançar um filme?
- Como a popularidade dos gêneros dos filmes evoluiu ao longo dos anos?
- Qual o peso de um ator/atriz popular no retorno financeiro de um filme com avaliação "ruim"?

### Vídeo
Vídeo no Youtube 5 minutos (pode ser um vídeo só de slides) **(5pts)**