In [2]:
import pandas as pd

In [4]:

# Load data
basics = pd.read_csv("../data/title.basics.tsv.gz", sep='\t', dtype=str, na_values='\\N')
ratings = pd.read_csv("../data/title.ratings.tsv.gz", sep='\t', dtype=str, na_values='\\N')
akas = pd.read_csv("../data/title.akas.tsv.gz", sep='\t', dtype=str, na_values='\\N')

# Filter to only movies
basics = basics[basics['titleType'] == 'movie']

# Merge title.basics with ratings
df = basics.merge(ratings, on='tconst', how='left')

# Aggregate production countries from akas (all available regions)
region_df = akas[['titleId', 'region']].dropna()
region_agg = region_df.groupby('titleId')['region'].agg(lambda x: '|'.join(sorted(set(x)))).reset_index()
region_agg = region_agg.rename(columns={'titleId': 'tconst', 'region': 'production_country'})

# Merge country info
df = df.merge(region_agg, on='tconst', how='left')

In [9]:
# Select and rename columns
df_final = df[[
    'tconst', 'primaryTitle', 'originalTitle', 'startYear',
    'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'production_country'
]].rename(columns={
    'tconst': 'title_id',
    'primaryTitle': 'title',
    'originalTitle': 'original_title',
    'startYear': 'release_year',
    'runtimeMinutes': 'runtime_minutes',
    'genres': 'genre',
    'averageRating': 'imdb_rating',
    'numVotes': 'vote_count'
})

# Convert types
df_final['release_year'] = pd.to_numeric(df_final['release_year'], errors='coerce')
df_final['runtime_minutes'] = pd.to_numeric(df_final['runtime_minutes'], errors='coerce')
df_final['imdb_rating'] = pd.to_numeric(df_final['imdb_rating'], errors='coerce')
df_final['vote_count'] = pd.to_numeric(df_final['vote_count'], errors='coerce')

# Verify columns
print(df_final.columns)
print(df_final.head())

Index(['title_id', 'title', 'original_title', 'release_year',
       'runtime_minutes', 'genre', 'imdb_rating', 'vote_count',
       'production_country'],
      dtype='object')
    title_id                          title                 original_title  \
0  tt0000009                     Miss Jerry                     Miss Jerry   
1  tt0000147  The Corbett-Fitzsimmons Fight  The Corbett-Fitzsimmons Fight   
2  tt0000502                       Bohemios                       Bohemios   
3  tt0000574    The Story of the Kelly Gang    The Story of the Kelly Gang   
4  tt0000591               The Prodigal Son              L'enfant prodigue   

   release_year  runtime_minutes                       genre  imdb_rating  \
0        1894.0             45.0                     Romance          5.4   
1        1897.0            100.0      Documentary,News,Sport          5.3   
2        1905.0            100.0                         NaN          3.8   
3        1906.0             70.0  Action,Adve

In [13]:
# print the number of missing rows
print("Missing rows:")
print(df_final.isnull().sum())


Missing rows:
title_id                   0
title                      2
original_title             2
release_year          105075
runtime_minutes       263712
genre                  76989
imdb_rating           384402
vote_count            384402
production_country     14118
dtype: int64


In [14]:
df_final

Unnamed: 0,title_id,title,original_title,release_year,runtime_minutes,genre,imdb_rating,vote_count,production_country
0,tt0000009,Miss Jerry,Miss Jerry,1894.0,45.0,Romance,5.4,223.0,AU|DE|HU|US
1,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897.0,100.0,"Documentary,News,Sport",5.3,559.0,RU|US
2,tt0000502,Bohemios,Bohemios,1905.0,100.0,,3.8,21.0,ES
3,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906.0,70.0,"Action,Adventure,Biography",6.0,985.0,AU|DE|GB|HU|RS|SG|US
4,tt0000591,The Prodigal Son,L'enfant prodigue,1907.0,90.0,Drama,5.6,31.0,FR|US
...,...,...,...,...,...,...,...,...,...
714010,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,2015.0,57.0,Documentary,,,BR
714011,tt9916680,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,2007.0,100.0,Documentary,,,CO
714012,tt9916706,Dankyavar Danka,Dankyavar Danka,2013.0,,Comedy,7.7,9.0,IN
714013,tt9916730,6 Gunn,6 Gunn,2017.0,116.0,Drama,7.0,13.0,IN


In [None]:

# Load data
basics = pd.read_csv("../data/title.basics.tsv.gz", sep='\t', dtype=str, na_values='\\N')
ratings = pd.read_csv("../data/title.ratings.tsv.gz", sep='\t', dtype=str, na_values='\\N')
akas = pd.read_csv("../data/title.akas.tsv.gz", sep='\t', dtype=str, na_values='\\N')

# Filter to only movies
basics = basics[basics['titleType'] == 'movie']

# Merge title.basics with ratings
df = basics.merge(ratings, on='tconst', how='left')

# Aggregate production countries from akas (all available regions)
region_df = akas[['titleId', 'region']].dropna()
region_agg = region_df.groupby('titleId')['region'].agg(lambda x: '|'.join(sorted(set(x)))).reset_index()
region_agg = region_agg.rename(columns={'titleId': 'tconst', 'region': 'production_country'})

# Merge country info
df = df.merge(region_agg, on='tconst', how='left')

# Select and rename columns
df_final = df[[
    'tconst', 'primaryTitle', 'originalTitle', 'startYear',
    'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'production_country'
]].rename(columns={
    'tconst': 'title_id',
    'primaryTitle': 'title',
    'originalTitle': 'original_title',
    'startYear': 'release_year',
    'runtimeMinutes': 'runtime_minutes',
    'genres': 'genre',
    'averageRating': 'imdb_rating',
    'numVotes': 'vote_count'
})

# Convert types
df_final['release_year'] = pd.to_numeric(df_final['release_year'], errors='coerce')
df_final['runtime_minutes'] = pd.to_numeric(df_final['runtime_minutes'], errors='coerce')
df_final['imdb_rating'] = pd.to_numeric(df_final['imdb_rating'], errors='coerce')
df_final['vote_count'] = pd.to_numeric(df_final['vote_count'], errors='coerce')

# Verify columns
print(df_final.columns)
print(df_final.head())


Index(['title_id', 'title', 'original_title', 'release_year',
       'runtime_minutes', 'genre', 'imdb_rating', 'vote_count',
       'production_country'],
      dtype='object')
    title_id                          title                 original_title  \
0  tt0000009                     Miss Jerry                     Miss Jerry   
1  tt0000147  The Corbett-Fitzsimmons Fight  The Corbett-Fitzsimmons Fight   
2  tt0000502                       Bohemios                       Bohemios   
3  tt0000574    The Story of the Kelly Gang    The Story of the Kelly Gang   
4  tt0000591               The Prodigal Son              L'enfant prodigue   

   release_year  runtime_minutes                       genre  imdb_rating  \
0        1894.0             45.0                     Romance          5.4   
1        1897.0            100.0      Documentary,News,Sport          5.3   
2        1905.0            100.0                         NaN          3.8   
3        1906.0             70.0  Action,Adve

In [22]:
df_final.shape

(714015, 9)

In [15]:
# Convert year and minutes to integers
df_final['release_year'] = df_final['release_year'].astype('Int64')
df_final['runtime_minutes'] = df_final['runtime_minutes'].astype('Int64')

# Convert vote_count to integer
df_final['vote_count'] = df_final['vote_count'].astype('Int64')


In [16]:
df_final.head()

Unnamed: 0,title_id,title,original_title,release_year,runtime_minutes,genre,imdb_rating,vote_count,production_country
0,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance,5.4,223,AU|DE|HU|US
1,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,100,"Documentary,News,Sport",5.3,559,RU|US
2,tt0000502,Bohemios,Bohemios,1905,100,,3.8,21,ES
3,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography",6.0,985,AU|DE|GB|HU|RS|SG|US
4,tt0000591,The Prodigal Son,L'enfant prodigue,1907,90,Drama,5.6,31,FR|US


In [25]:
# save as parquet../data/movies.parquet
df_final.to_parquet("../data/movies.parquet", index=False, engine='pyarrow')

In [26]:
df_final.shape

(714015, 9)

In [17]:
def describe_dataframe_for_markdown(df):
    summary = []
    for col in df.columns:
        dtype = df[col].dtype
        non_null = df[col].notnull().sum()
        nulls = df[col].isnull().sum()
        unique = df[col].nunique(dropna=True)
        example = df[col].dropna().iloc[0] if df[col].notna().any() else "NaN"
        summary.append([col, str(dtype), non_null, nulls, unique, str(example)])

    header = "| Column | Data Type | Non-Missing | Missing | Unique Values | Example |"
    divider = "|--------|------------|--------------|---------|----------------|---------|"
    lines = [header, divider]
    for row in summary:
        lines.append("| " + " | ".join(map(str, row)) + " |")
    
    markdown_output = "\n".join(lines)
    print(markdown_output)

# Use it on your df_final
describe_dataframe_for_markdown(df_final)


| Column | Data Type | Non-Missing | Missing | Unique Values | Example |
|--------|------------|--------------|---------|----------------|---------|
| title_id | object | 714015 | 0 | 714015 | tt0000009 |
| title | object | 714013 | 2 | 613714 | Miss Jerry |
| original_title | object | 714013 | 2 | 627750 | Miss Jerry |
| release_year | Int64 | 608940 | 105075 | 139 | 1894 |
| runtime_minutes | Int64 | 450303 | 263712 | 510 | 45 |
| genre | object | 637026 | 76989 | 1494 | Romance |
| imdb_rating | float64 | 329613 | 384402 | 91 | 5.4 |
| vote_count | Int64 | 329613 | 384402 | 19061 | 223 |
| production_country | object | 699897 | 14118 | 118177 | AU|DE|HU|US |


## OLD PART ##

In [2]:
# Load this paquet file data/movies.parquet
df = pd.read_parquet("../data/movies.parquet")

In [3]:
df.columns

Index(['title_id', 'title', 'original_title', 'release_year',
       'runtime_minutes', 'genre', 'imdb_rating', 'vote_count',
       'production_country'],
      dtype='object')

In [64]:
# Filter out rows with missing values
df.shape

(608940, 9)

In [65]:
# Check for missing values
df.isnull().sum()

title_id                   0
title                      2
original_title             2
release_year               0
runtime_minutes       166443
genre                  70424
imdb_rating           279360
vote_count            279360
production_country      7928
dtype: int64

In [66]:
# Remove missing values
df = df.dropna()

In [67]:
# Check for missing values
df.isnull().sum()

title_id              0
title                 0
original_title        0
release_year          0
runtime_minutes       0
genre                 0
imdb_rating           0
vote_count            0
production_country    0
dtype: int64

In [54]:
df.rename(columns={
    'tconst': 'title_id',
    'primaryTitle': 'title',
    'originalTitle': 'original_title',
    'year': 'release_year',
    'runtimeMinutes': 'runtime_minutes',
    'genres': 'genre',
    'rating': 'imdb_rating',
    'votes': 'vote_count',
    'country': 'production_country'
}, inplace=True)


In [34]:
# Change all year to integer and remove NA and missing rows from year
df['release_year'] = df['release_year'].astype('Int64')
df = df.dropna(subset=['release_year'])

In [None]:
# Convert runtime minutes to integer 
df['runtime_minutes'] = df['runtime_minutes'].astype('Int64')

# Convert vote count to integer 
df['vote_count'] = df['vote_count'].astype('Int64')

In [69]:
# Save as parquet in data directory
output_path = data_dir / "movies.parquet"
df.to_parquet(output_path, index=False)
print(f"Movies data saved to {output_path}")

Movies data saved to ../data/movies.parquet


In [68]:
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values[missing_values > 0])


Missing values in each column:
Series([], dtype: int64)
