In [1]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from pathlib import Path

import pandas as pd

In [2]:
# Montamos nuestro google drive en el colab

from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Clonamos el repo para usar el codigo de la lib
!git clone https://github.com/elsonidoq/ml-practico-2022.git
!cd ml-practico-2022; git pull

Cloning into 'ml-practico-2022'...
remote: Enumerating objects: 421, done.[K
remote: Counting objects: 100% (108/108), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 421 (delta 71), reused 66 (delta 31), pack-reused 313[K
Receiving objects: 100% (421/421), 5.11 MiB | 12.79 MiB/s, done.
Resolving deltas: 100% (262/262), done.
Already up to date.


In [4]:
import sys
sys.path.append('ml-practico-2022')

In [5]:
from lib import data, transformers
from lib.model import get_features_pipe, get_model_pipe

In [6]:
from pathlib import Path

# Para trabajar en colab
PATH = Path('/content/gdrive/My Drive/ml-practico-data/')
movies_df = data.load_data(PATH)

Loading title basics...


  title_basics = load_title_basics(path)


Loading title ratings...
Loading movie directors...
Merging everything...


In [7]:
principals_df = pd.read_csv(PATH / 'title.principals.tsv.gz', sep='\t')

In [8]:
principals_df.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [9]:
principals_df.category.value_counts()

actor                  11279111
self                    8849657
actress                 8658481
writer                  7029439
director                5860057
producer                3274623
cinematographer         1766674
composer                1745098
editor                  1689718
production_designer      344620
archive_footage          324176
archive_sound              3253
Name: category, dtype: int64

In [10]:
# Sacado del codigo de directores

movies_stars = principals_df[principals_df.category.isin(['actress', 'actor'])].copy()

# Calculo un ranking por pelicula segun el ordering
movies_stars['star_rank'] = (
    movies_stars.sort_values('ordering')
        .groupby('tconst')
        .cumcount()
)

first_star = movies_stars[movies_stars.star_rank == 0][['nconst', 'tconst']].rename(columns={'nconst': '1st_star'})
second_star = movies_stars[movies_stars.star_rank == 1][['nconst', 'tconst']].rename(columns={'nconst': '2nd_star'})
third_star = movies_stars[movies_stars.star_rank == 2][['nconst', 'tconst']].rename(columns={'nconst': '3rd_star'})

In [11]:
stars_df = (
    first_star.merge(second_star, how='left', on='tconst')
              .merge(third_star, how='left', on='tconst')
)

In [12]:
stars_df.head()

Unnamed: 0,1st_star,tconst,2nd_star,3rd_star
0,nm0443482,tt0000005,nm0653042,
1,nm0179163,tt0000007,nm0183947,
2,nm0653028,tt0000008,,
3,nm0063086,tt0000009,nm0183823,nm1309758
4,nm3692297,tt0000011,,


In [13]:
stars_df[stars_df.tconst == 'tt0120338']

Unnamed: 0,1st_star,tconst,2nd_star,3rd_star
107543,nm0000138,tt0120338,nm0000701,nm0000708


In [14]:
movies_df = movies_df.merge(stars_df, on='tconst', how='left')

In [15]:
movies_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,director,1st_star,2nd_star,3rd_star
0,tt0000502,movie,Bohemios,Bohemios,0,1905.0,\N,100.0,[no-genre],4.5,14,nm0063413,nm0215752,nm0252720,
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,\N,70.0,"[Action, Adventure, Biography]",6.0,772,nm0846879,nm0846887,nm0846894,nm1431224
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,\N,90.0,[Drama],4.5,18,nm0141150,nm0906197,nm0332182,nm1323543
3,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,\N,120.0,"[Adventure, Fantasy]",5.3,66,nm0091767,nm0000875,nm0122665,nm0933446
4,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910.0,\N,58.0,"[Adventure, Drama]",3.9,20,nm0063413,nm0699807,nm0735618,


# Ahora  vamos a experimentar!

Vamos a usar el transformer `CrewFeatures` definido en https://github.com/elsonidoq/ml-practico-2022/blob/main/lib/transformers/director_features.py#L5

In [16]:
rating_data = data.load_rating_train_dev_test(movies_df)

In [17]:
rating_data.keys()

dict_keys(['X_train', 'y_train', 'X_dev', 'y_dev', 'X_test', 'y_test'])

In [18]:
pipe = make_pipeline(
    transformers.CrewFeatures('1st_star', min_cnt_movies=3),
    DictVectorizer(sparse=False),
    StandardScaler(),
    LogisticRegression()
)

In [19]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.6746784815771065, 0.5521969734682844)

In [20]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [21]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.7314802118434764, 0.5824416066160876)

# Probando todo junto

In [22]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [23]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.8439064798577167, 0.7364522007289038)

In [24]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [25]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.8191894647150124, 0.7306138375864368)

In [26]:
pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    StandardScaler(),
    LogisticRegression()
)

In [27]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.7430610117914456, 0.724399428708005)

In [44]:
from lightgbm import LGBMClassifier

pipe = make_pipeline(
    make_union(
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    LGBMClassifier(),
)

In [45]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.9277334615494219, 0.7425097037252466)

# Word2Vec features

In [32]:
class EpochSaver: pass

from gensim.models import Word2Vec

w2v = Word2Vec.load(str(PATH / 'w2v/epoch_10'))

In [33]:
import numpy as np

default_vector = np.mean(w2v.wv.vectors, axis=0)

### TODO: revisar a quien se parece este default_vector

Seria mejor un vector se 0s? alguna otra agregacion sobre los datos?

In [34]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class W2VCrewFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, w2v, fields, min_cnt_movies=2):
        self.fields = fields
        self.min_cnt_movies = min_cnt_movies
        self.w2v = w2v

    def fit(self, X, y):
        self.default_vector_ = np.mean(w2v.wv.vectors, axis=0)
        return self
    
    def _get_movie_vector(self, x_i):
        vectors = []
        for field in self.fields:
            person_id = x_i[field]
            if person_id not in self.w2v.wv or self.w2v.wv.vocab[person_id].count < self.min_cnt_movies: continue

            vectors.append(self.w2v.wv[person_id])

        if len(vectors) == 0:
            return self.default_vector_
        else:
            return np.mean(vectors, axis=0)
            
    def transform(self, X):
        return np.asarray([self._get_movie_vector(x_i) for x_i in X])

In [35]:
pipe = make_pipeline(
    W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star']),
    StandardScaler(),
    LogisticRegression()
)

In [36]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.7106231305624312, 0.6450264787318621)

In [37]:
pipe = make_pipeline(
    W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star', 'director']),
    StandardScaler(),
    LogisticRegression()
)

In [38]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.7270682277492554, 0.6633249840393988)

In [39]:
pipe = make_pipeline(
    make_union(
        W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star']),
        W2VCrewFeatures(w2v, ['director'])
    ),
    StandardScaler(),
    LogisticRegression()
)

In [40]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.7368388318666648, 0.6650243113506439)

In [48]:
pipe = make_pipeline(
    make_union(
        W2VCrewFeatures(w2v, ['1st_star', '2nd_star', '3rd_star']),
        make_pipeline(transformers.CrewFeatures('1st_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('2nd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('3rd_star', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.CrewFeatures('director', min_cnt_movies=3), DictVectorizer(sparse=False)),
        make_pipeline(transformers.YearsAgo(), DictVectorizer(sparse=False)),
        make_pipeline(transformers.GenreDummies(), DictVectorizer(sparse=False)),
    ),
    LGBMClassifier()
)

In [49]:
pipe.fit(rating_data['X_train'], rating_data['y_train'] > 7.5)

tr_auc = roc_auc_score(rating_data['y_train'] > 7.5, pipe.predict_proba(rating_data['X_train'])[:, 1])
dev_auc = roc_auc_score(rating_data['y_dev'] > 7.5, pipe.predict_proba(rating_data['X_dev'])[:, 1])

tr_auc, dev_auc

(0.9305648897598409, 0.7522210999583409)