In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px # charts and graphs

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset/title.basics.tsv/data.tsv
/kaggle/input/imdb-dataset/title.principals.tsv/data.tsv
/kaggle/input/imdb-dataset/title.ratings.tsv/data.tsv
/kaggle/input/imdb-dataset/name.basics.tsv/data.tsv
/kaggle/input/imdb-dataset/title.akas.tsv/data.tsv


In [3]:
# It's time to load the info into a Pandas Dataframe so we can start cleaning and organizing the data.

df = pd.read_table("../input/imdb-dataset/title.basics.tsv/data.tsv")

# I decided to drop nan values and clean the colunm with the year data in order to start the analizys.
df.dropna()
df = df.drop(df[df['startYear'].map(len) < 3].index)

# Let's take a look at the dataframe we've got.
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
# It seemed clear I didn't need all those colunms, so I selectet the ones I need.
# The syntax here has the same ideia of a SELECT statmant in SQL.
data = df[['tconst', 'startYear', 'originalTitle', 'runtimeMinutes', 'genres', 'titleType']].sort_values(by = ['startYear'])

# Here I started organizing the data and checking it's type.
data['startYear'] = pd.to_datetime(data['startYear'], format = '%Y')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8046249 entries, 5801285 to 4850837
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   tconst          object        
 1   startYear       datetime64[ns]
 2   originalTitle   object        
 3   runtimeMinutes  object        
 4   genres          object        
 5   titleType       object        
dtypes: datetime64[ns](1), object(5)
memory usage: 429.7+ MB


Unnamed: 0,tconst,startYear,originalTitle,runtimeMinutes,genres,titleType
5801285,tt22543558,1874-01-01,Lansing State Journal,\N,News,tvSpecial
6246964,tt3155794,1874-01-01,Passage de Venus,1,"Documentary,Short",short
4512527,tt16763740,1877-01-01,Le jeu de corde,1,"Animation,Short",short
4512540,tt16763774,1877-01-01,Zimm. Boum. Boum!,1,"Animation,Short",short
3436535,tt14495706,1877-01-01,La Rosace Magique,1,"Animation,Short",short


In [5]:
#Let's change the run time format so it will get easier to do some math later.
data.drop(data.loc[data['runtimeMinutes']=='\\N'].index, inplace=True)
data['runtimeMinutes'] = data['runtimeMinutes'].astype(str).astype(int)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2455667 entries, 6246964 to 2689873
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   tconst          object        
 1   startYear       datetime64[ns]
 2   originalTitle   object        
 3   runtimeMinutes  int64         
 4   genres          object        
 5   titleType       object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 131.1+ MB


In [6]:
# Checking out what we've got to make sure everything is right.
data['startYear'] = data['startYear'].dt.year
data.head()

Unnamed: 0,tconst,startYear,originalTitle,runtimeMinutes,genres,titleType
6246964,tt3155794,1874,Passage de Venus,1,"Documentary,Short",short
4512527,tt16763740,1877,Le jeu de corde,1,"Animation,Short",short
4512540,tt16763774,1877,Zimm. Boum. Boum!,1,"Animation,Short",short
3436535,tt14495706,1877,La Rosace Magique,1,"Animation,Short",short
4512498,tt16763674,1877,La danse sur la corde,1,"Animation,Short",short


In [42]:
# Lets see the average run time per type of title.
movies = data.groupby('titleType', as_index=False)['runtimeMinutes'].mean()
display(movies)

Unnamed: 0,titleType,runtimeMinutes
0,movie,89.653933
1,short,13.100226
2,tvEpisode,37.088896
3,tvMiniSeries,90.666506
4,tvMovie,72.456998
5,tvSeries,46.22728
6,tvShort,12.960436
7,tvSpecial,89.676835
8,video,67.218994
9,videoGame,110.881818


In [43]:
# Making sure on the differents types of titles.
#print(data['titleType'].unique())

In [7]:
# It's time to get only the 'movie' type and put this data into another dataframe.
moviedata = data[data['titleType'] == "movie"]
display(moviedata)

Unnamed: 0,tconst,startYear,originalTitle,runtimeMinutes,genres,titleType
8,tt0000009,1894,Miss Jerry,45,Romance,movie
5713276,tt2210499,1896,Birmingham,61,Documentary,movie
220601,tt0230366,1899,Jeffries-Sharkey Contest,135,"Documentary,News,Sport",movie
278845,tt0291338,1900,May Day Parade,66,"Documentary,News",movie
255523,tt0266894,1900,The Republican National Convention,53,Documentary,movie
...,...,...,...,...,...,...
6480152,tt3697000,2024,Pictures of Infinity,75,Documentary,movie
1623364,tt1117392,2024,The Ark and the Aardvark,118,Animation,movie
1525374,tt10999798,2025,Walking In Darkness II,96,Documentary,movie
5268798,tt20560680,2025,Into the Darkness.,90,Documentary,movie


In [45]:
# Now it's time to get the average run time of the movies to see if there's any change.
mean_time_movies = moviedata.groupby('startYear', as_index=False)['runtimeMinutes'].mean()
print(mean_time_movies)

     startYear  runtimeMinutes
0         1894       45.000000
1         1896       61.000000
2         1899      135.000000
3         1900       59.500000
4         1903       52.500000
..         ...             ...
123       2022       94.643377
124       2023       95.767184
125       2024       94.000000
126       2025       93.000000
127       2026      125.000000

[128 rows x 2 columns]


In [46]:
# With the data now organized, we can build our first chart. The idea is to get the average run time per year.
fig = px.bar(
    data_frame=mean_time_movies.groupby(['runtimeMinutes']).mean().reset_index(), 
    x="startYear", 
    y="runtimeMinutes"
)
fig.show()

In [47]:
# Now that we now the average, let's see the number of movies reseased per year.
movies_per_year = moviedata['originalTitle'].groupby([moviedata.startYear]).agg('count')
display(movies_per_year)
grafico = px.bar(movies_per_year, y='originalTitle')
grafico.show()

startYear
1894       1
1896       1
1899       1
1900       2
1903       2
        ... 
2022    9475
2023     451
2024       9
2025       2
2026       1
Name: originalTitle, Length: 128, dtype: int64

In [48]:
# Now let's see the number of titles per categorie.
movies_number = data['originalTitle'].groupby([data.titleType]).agg('count')
display(movies_number)

grafico1 = px.bar(movies_number, y='originalTitle')
grafico1.show()

titleType
movie            387497
short            572937
tvEpisode       1086974
tvMiniSeries      16633
tvMovie           93101
tvSeries          91033
tvShort            9352
tvSpecial         18489
video            179320
videoGame           330
Name: originalTitle, dtype: int64

In [8]:
# Now, let's bring another dataset to get more information out of this analizys.

ratings = pd.read_table('../input/imdb-dataset/title.ratings.tsv/data.tsv')

In [9]:
movie_ratings = pd.merge(moviedata, ratings, how='inner', on = 'tconst')
movie_ratings.head()

Unnamed: 0,tconst,startYear,originalTitle,runtimeMinutes,genres,titleType,averageRating,numVotes
0,tt0000009,1894,Miss Jerry,45,Romance,movie,5.3,200
1,tt0230366,1899,Jeffries-Sharkey Contest,135,"Documentary,News,Sport",movie,3.9,67
2,tt0291338,1900,May Day Parade,66,"Documentary,News",movie,5.8,16
3,tt0266894,1900,The Republican National Convention,53,Documentary,movie,5.9,40
4,tt0127962,1903,La vie et la passion de Jésus Christ,45,"Biography,Drama",movie,6.5,534


In [10]:
fig2 = px.bar(
    data_frame=movie_ratings.groupby(['startYear']).mean().reset_index(), 
    x="startYear", 
    y="averageRating"
)
fig2.show()

In [11]:
locations = pd.read_table('../input/imdb-dataset/title.akas.tsv/data.tsv')
locations.rename(columns = {'titleId':'tconst'}, inplace = True)


Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.



In [12]:
local = pd.merge(moviedata, locations, how='inner', on = 'tconst')
local.head()

Unnamed: 0,tconst,startYear,originalTitle,runtimeMinutes,genres,titleType,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000009,1894,Miss Jerry,45,Romance,movie,1,Miss Jerry,\N,\N,original,\N,1
1,tt0000009,1894,Miss Jerry,45,Romance,movie,2,Fräulein Jerry,DE,\N,\N,literal title,0
2,tt0000009,1894,Miss Jerry,45,Romance,movie,3,Miss Jerry,HU,\N,imdbDisplay,\N,0
3,tt0000009,1894,Miss Jerry,45,Romance,movie,4,Miss Jerry,US,\N,imdbDisplay,\N,0
4,tt2210499,1896,Birmingham,61,Documentary,movie,1,Birmingham,GB,\N,imdbDisplay,\N,0


In [13]:
new_local = local.drop_duplicates(subset=['tconst'])
new_local.drop(new_local.loc[new_local['region']=='\\N'].index, inplace=True)
display(new_local)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,tconst,startYear,originalTitle,runtimeMinutes,genres,titleType,ordering,title,region,language,types,attributes,isOriginalTitle
4,tt2210499,1896,Birmingham,61,Documentary,movie,1,Birmingham,GB,\N,imdbDisplay,\N,0
14,tt0266894,1900,The Republican National Convention,53,Documentary,movie,1,The Republican National Convention,US,\N,imdbDisplay,\N,0
16,tt11166772,1903,S. Lubin's Passion Play,60,Drama,movie,1,S. Lubin's Passion Play,US,\N,imdbDisplay,\N,0
18,tt0127962,1903,La vie et la passion de Jésus Christ,45,"Biography,Drama",movie,10,The Passion Play,US,\N,\N,\N,0
35,tt0178983,1904,Westinghouse Works,68,Documentary,movie,1,Westinghouse Works,US,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2276566,tt3697000,2024,Pictures of Infinity,75,Documentary,movie,1,Pictures of Infinity,CA,\N,\N,\N,0
2276569,tt1117392,2024,The Ark and the Aardvark,118,Animation,movie,10,The Ark and the Aardvark,NL,\N,\N,new title,0
2276588,tt10999798,2025,Walking In Darkness II,96,Documentary,movie,1,Walking In Darkness II,US,\N,\N,\N,0
2276589,tt20560680,2025,Into the Darkness.,90,Documentary,movie,1,Into the Darkness.,GB,\N,\N,\N,0


In [14]:
movies_per_region = new_local['originalTitle'].groupby([new_local.region]).count().reset_index(
  name='Count').sort_values(['Count'], ascending=False)
display(movies_per_region)

Unnamed: 0,region,Count
197,US,50804
68,GB,21160
91,IN,17959
98,JP,13532
215,XWW,12940
...,...,...
160,RE,1
106,KY,1
147,OM,1
139,NC,1


In [15]:
fig3 = px.bar(
    data_frame=movies_per_region, 
    x="region", 
    y="Count"
)
fig3.show()

In [16]:
movies_genre = moviedata['originalTitle'].groupby([new_local.genres]).count().reset_index(
  name='Count').sort_values(['Count'], ascending=False)
movies_genre.drop(movies_genre.loc[movies_genre['genres']=='\\N'].index, inplace=True)
movies_genre.head()

Unnamed: 0,genres,Count
458,Drama,4648
290,Comedy,2190
619,Western,1167
429,Documentary,1092
511,"Drama,Romance",968


In [17]:
s_top = movies_genre.head(10)
s_top.head()

Unnamed: 0,genres,Count
458,Drama,4648
290,Comedy,2190
619,Western,1167
429,Documentary,1092
511,"Drama,Romance",968


In [18]:
fig5 = px.pie(s_top, values='Count', names='genres', title='Top 10 movie genre')
fig5.show()