In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to imported .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
import pandas as pd
from datasets import load_dataset
import tqdm as tqdm

# Fetch the "train" split of the TMDB regression dataset via `load_dataset`,
# convert it to a pandas DataFrame and parse `release_date` into datetimes
# in a single chained expression.
tmdb_train_split = load_dataset("ada-datadruids/regression_dataset_tmdb")['train']
regression_dataset_tmdb_raw_df = tmdb_train_split.to_pandas().assign(
    release_date=lambda frame: pd.to_datetime(frame['release_date'])
)

# Preview the first rows as the cell output.
regression_dataset_tmdb_raw_df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,based_on_book
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",False
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",False
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",False
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",False
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",False


# Feature Evolution Over Time

## Popularity over time (only movies based on books)

In [29]:
# Keep only the movies flagged as based on a book.
# `.copy()` gives the subset its own data: later cells add and overwrite columns
# on this frame, and doing that on a bare boolean-mask slice of the raw frame
# triggers pandas' SettingWithCopyWarning.
based_on_books_df = regression_dataset_tmdb_raw_df[
    regression_dataset_tmdb_raw_df['based_on_book'] == 1
].copy()

In [28]:
import pandas as pd
import plotly.express as px

# Prepare the book-based subset for plotting popularity over time.
#   * popularity -> numeric (values that cannot be parsed become NaN)
#   * year       -> year of release_date (unparseable dates become NaN)
#   * rows missing either value are dropped
#   * year_group -> release year floored to a multiple of 3, i.e. the first
#                   year of the 3-year bin the movie falls into
# `.assign` builds a new frame instead of writing columns into the filtered
# slice, which avoids SettingWithCopyWarning and makes the cell safe to re-run.
based_on_books_df = (
    based_on_books_df
    .assign(
        popularity=lambda frame: pd.to_numeric(frame['popularity'], errors='coerce'),
        year=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce').dt.year,
    )
    .dropna(subset=['popularity', 'year'])
    .assign(year_group=lambda frame: (frame['year'] // 3) * 3)
)

# Per-bin summary statistics that are overlaid on the box plot.
popularity_by_group = based_on_books_df.groupby('year_group')['popularity']
trend = popularity_by_group.median().reset_index()
mean_trend = popularity_by_group.mean().reset_index()

# One box per 3-year bin; individual points are hidden (points=None).
fig = px.box(based_on_books_df, x='year_group', y='popularity',
             title='Box Plot of Popularity per 3-Year Interval',
             labels={'year_group': 'Year Group', 'popularity': 'Popularity'},
             points=None)

# Median per bin as a solid red line.
fig.add_scatter(x=trend['year_group'], y=trend['popularity'], mode='lines+markers',
                name='Median Trend', line=dict(color='red', width=2))

# Mean per bin as a dashed blue line.
fig.add_scatter(x=mean_trend['year_group'], y=mean_trend['popularity'], mode='lines+markers',
                name='Mean Trend', line=dict(color='blue', width=2, dash='dash'))

fig.update_layout(
    xaxis_title='Year Group',
    yaxis_title='Popularity',
    width=1000,
    height=600
)

fig.show()


## Number of movies based on books over time

In [38]:
import plotly.express as px

# Derive the release year and its 3-year bin (year floored to a multiple of 3),
# dropping rows whose release_date cannot be parsed. `.assign` returns a new
# frame rather than writing columns into a filtered slice.
based_on_books_df = (
    based_on_books_df
    .assign(year=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce').dt.year)
    .dropna(subset=['year'])
    .assign(year_group=lambda frame: (frame['year'] // 3) * 3)
)

# Number of movies per bin. The count column gets an explicit name instead of
# the default integer column `0`, so the plot arguments and the `labels`
# mapping all refer to the same string key.
trend = based_on_books_df.groupby('year_group').size().reset_index(name='num_movies')

fig = px.bar(trend, x='year_group', y='num_movies',
             title='Number of Movies Based on Books per 3-Year Interval',
             labels={'year_group': 'Year Group', 'num_movies': 'Number of Movies'},
             text='num_movies')

fig.update_layout(
    xaxis_title='Year Group',
    yaxis_title='Number of Movies',
    width=1000,
    height=600
)

fig.show()

## Number of languages per movie over time (only movies based on books)

In [37]:
# Prepare the book-based subset for plotting the number of spoken languages.
#   * num_languages -> number of comma-separated entries in spoken_languages.
#                      Missing values (None/NaN) and empty strings count as 1,
#                      so the result is always >= 1.
#   * year          -> year of release_date; rows with unparseable dates are dropped
#   * year_group    -> release year floored to a multiple of 3 (3-year bins)
# year_group is computed here explicitly so the cell does not depend on an
# earlier cell having left that column on the frame.
based_on_books_df = (
    based_on_books_df
    .assign(
        num_languages=lambda frame: (
            frame['spoken_languages'].str.count(',').fillna(0).astype(int) + 1
        ),
        year=lambda frame: pd.to_datetime(frame['release_date'], errors='coerce').dt.year,
    )
    .dropna(subset=['year'])
    .assign(year_group=lambda frame: (frame['year'] // 3) * 3)
)

# Per-bin summary statistics that are overlaid on the box plot.
languages_by_group = based_on_books_df.groupby('year_group')['num_languages']
trend = languages_by_group.median().reset_index()
mean_trend = languages_by_group.mean().reset_index()

# One box per 3-year bin; individual points are hidden (points=None).
fig = px.box(based_on_books_df, x='year_group', y='num_languages',
             title='Box Plot of Number of Languages per Movie per 3-Year Interval',
             labels={'year_group': 'Year Group', 'num_languages': 'Number of Languages'},
             points=None)

# Median per bin as a solid red line.
fig.add_scatter(x=trend['year_group'], y=trend['num_languages'], mode='lines+markers',
                name='Median Trend', line=dict(color='red', width=2))

# Mean per bin as a dashed blue line.
fig.add_scatter(x=mean_trend['year_group'], y=mean_trend['num_languages'], mode='lines+markers',
                name='Mean Trend', line=dict(color='blue', width=2, dash='dash'))

fig.update_layout(
    xaxis_title='Year Group',
    yaxis_title='Number of Languages',
    width=1000,
    height=600
)

fig.show()