# A Dummy Model

## Preamble

In [1]:
import pandas

In [2]:
import matplotlib.pyplot as plt
plt.style.use('dark_background')

## Data Preparation

In [3]:
movies = catalog.load("movies")

2021-03-11 13:32:44,100 - kedro.io.data_catalog - INFO - Loading data from `movies` (ParquetDataSet)...


In [4]:
movies = movies.set_index("id")

In [5]:
movies["genre_names"] = movies["genres"].apply(lambda ls: [d["name"] for d in ls])

In [6]:
movies = movies[movies["genre_names"].map(lambda d: len(d)) > 0] # drop  movies without genres

In [7]:
genre_labels = pandas.get_dummies(
    movies["genre_names"].apply(pandas.Series).stack(),
    prefix="Genre"
).sum(level=0)

In [8]:
posters = movies[["original_title", "poster_path"]]

In [9]:
genre_labels

Unnamed: 0_level_0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,Genre_History,Genre_Horror,Genre_Music,Genre_Mystery,Genre_Romance,Genre_Science Fiction,Genre_TV Movie,Genre_Thriller,Genre_War,Genre_Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
9995,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9997,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0
9998,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0


In [10]:
movies[~movies.index.isin(genre_labels.index)]

Unnamed: 0_level_0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,genre_names
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


## Metrics

In [11]:
from sklearn.metrics import f1_score

In [12]:
from sklearn.model_selection import cross_val_score

In [21]:
from sklearn.metrics import make_scorer

## Dummy Classifier

In [13]:
from sklearn.dummy import DummyClassifier

In [37]:
classifier = DummyClassifier(strategy="stratified")

In [38]:
classifier.fit(
    posters,
    genre_labels
)

DummyClassifier(strategy='stratified')

In [49]:
Y_pred = pandas.DataFrame(
    classifier.predict(posters),
    columns=genre_labels.columns
)
Y_pred

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,Genre_History,Genre_Horror,Genre_Music,Genre_Mystery,Genre_Romance,Genre_Science Fiction,Genre_TV Movie,Genre_Thriller,Genre_War,Genre_Western
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5111,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5112,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0
5113,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
5114,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0


In [50]:
Y_pred.sum(axis="columns").value_counts()

2    1512
3    1185
1    1176
4     582
0     371
5     211
6      68
7      10
8       1
dtype: int64

## Evaluation

In [29]:
scoring = make_scorer(f1_score, greater_is_better=True, average="samples")
cv = 5

In [30]:
cross_val_score(
    DummyClassifier(strategy="uniform"),
    posters,
    genre_labels,
    scoring=scoring,
    cv=cv,
).mean()

0.1832201722492231

In [36]:
cross_val_score(
    DummyClassifier(strategy="stratified"),
    posters,
    genre_labels,
    scoring=scoring,
    cv=cv,
).mean()

0.21850020347583238

In [35]:
cross_val_score(
    DummyClassifier(strategy="most_frequent"),
    posters,
    genre_labels,
    scoring=scoring,
    cv=cv,
).mean()

0.18613042871107385