# Final Project — IMDB + Wikimedia Stream (Colab RAM-safe)



This notebook answers the project questions **Q1–Q14**.




## 0) Install & Imports



In [22]:
!pip install -q pandas requests sseclient-py

import os, gzip, json, time
from datetime import datetime
from collections import defaultdict

import pandas as pd
import requests


## 1) Download IMDB datasets (Q1)

Source: https://datasets.imdbws.com/

We download only the files needed for Q1–Q14:
- `name.basics.tsv.gz` (people)
- `title.basics.tsv.gz` (titles, type, years, runtime, genres)
- `title.ratings.tsv.gz` (ratings, votes)
- `title.crew.tsv.gz` (directors)
- `title.akas.tsv.gz` (alternate titles)




In [None]:
DATA_URL = "https://datasets.imdbws.com/"
DATA_DIR = "data"
DATA_FILES = [
    "name.basics.tsv.gz",
    "title.basics.tsv.gz",
    "title.ratings.tsv.gz",
    "title.crew.tsv.gz",
    "title.akas.tsv.gz",
]

def download_imdb_file(filename, dest_dir=DATA_DIR):
    """Download one IMDB dataset file into ``dest_dir`` unless it is already there.

    The payload is streamed to ``<dest_path>.part`` and only renamed to the
    final name once the transfer has completed. An interrupted or failed
    download therefore never leaves a truncated file that a later run would
    accept as "already exists".

    Args:
        filename: Name of the file below ``DATA_URL`` (e.g. "title.basics.tsv.gz").
        dest_dir: Directory to store the file in; created if missing.

    Returns:
        The local path of the (existing or freshly downloaded) file.

    Raises:
        Any exception raised by ``requests`` (including ``raise_for_status``
        on an HTTP error) or by the file system; the partial file is removed
        before the exception propagates.
    """
    os.makedirs(dest_dir, exist_ok=True)
    dest_path = os.path.join(dest_dir, filename)
    if os.path.exists(dest_path):
        print(f"[OK] {filename} already exists.")
        return dest_path

    url = DATA_URL + filename
    tmp_path = dest_path + ".part"
    print(f"[DL] {filename} ...")
    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Publish the file under its final name only after a complete write.
        os.replace(tmp_path, dest_path)
    finally:
        # After a successful os.replace the temp file is gone; this only
        # fires when the download was interrupted or failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"[OK] Saved -> {dest_path}")
    return dest_path

# Map every dataset filename to its local path. A failed download is reported
# but does not abort the loop: the expected local path is recorded anyway so a
# manually placed file is picked up by the later cells.
paths = {}
for dataset_name in DATA_FILES:
    local_path = os.path.join(DATA_DIR, dataset_name)
    try:
        local_path = download_imdb_file(dataset_name)
    except Exception as err:
        print(f"[WARN] Failed to download {dataset_name}: {err}")
        print("      Download manually from https://datasets.imdbws.com/ and put it into data/")
    paths[dataset_name] = local_path

paths


[DL] name.basics.tsv.gz ...
[OK] Saved -> data/name.basics.tsv.gz
[DL] title.basics.tsv.gz ...
[OK] Saved -> data/title.basics.tsv.gz
[DL] title.ratings.tsv.gz ...
[OK] Saved -> data/title.ratings.tsv.gz
[DL] title.crew.tsv.gz ...
[OK] Saved -> data/title.crew.tsv.gz
[DL] title.akas.tsv.gz ...
[OK] Saved -> data/title.akas.tsv.gz


{'name.basics.tsv.gz': 'data/name.basics.tsv.gz',
 'title.basics.tsv.gz': 'data/title.basics.tsv.gz',
 'title.ratings.tsv.gz': 'data/title.ratings.tsv.gz',
 'title.crew.tsv.gz': 'data/title.crew.tsv.gz',
 'title.akas.tsv.gz': 'data/title.akas.tsv.gz'}

## 2) RAM-safe TSV.GZ reader (chunks)

We read large files chunk-by-chunk to avoid using all RAM.


In [None]:
def iter_tsv_gz(path, usecols, chunksize=300_000):
    """Yield a gzip-compressed TSV file as a sequence of DataFrame chunks.

    Every selected column is read with pandas' "string" dtype; callers convert
    to numeric types themselves.

    Args:
        path: Path to the ``.tsv.gz`` file.
        usecols: Column names to load (all other columns are skipped).
        chunksize: Maximum number of rows per yielded DataFrame.

    Yields:
        DataFrames with at most ``chunksize`` rows each.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    with gzip.open(path, "rt", encoding="utf-8") as f:
        yield from pd.read_csv(
            f,
            sep="\t",
            # Only the literal \N marks a missing value. Without
            # keep_default_na=False pandas would also turn real values such
            # as "NA", "None" or "null" into missing values.
            na_values=["\\N"],
            keep_default_na=False,
            # Fields are not quoted in these files; a bare double quote inside
            # a field must be kept as data. With default quoting it would open
            # a "quoted field" that can swallow the following rows.
            quoting=csv.QUOTE_NONE,
            usecols=usecols,
            dtype="string",
            chunksize=chunksize,
            low_memory=True,
        )


# IMDB Questions (Q2–Q14)

We compute answers with minimal memory:

- People stats from `name.basics` in chunks.
- Title stats from `title.basics` in chunks.
- `title.ratings` is much smaller, so we load it fully and convert its columns to numeric types afterwards.
- For director and alternate titles, we **scan** `title.crew` / `title.akas` until we find what we need.


In [None]:
# ---- Q2, Q3, Q7, Q8 (people) ----
people_cols = [
    "nconst",
    "primaryName",
    "birthYear",
    "deathYear",
    "primaryProfession",
    "knownForTitles",
]

# Running aggregates, updated one chunk at a time so the full table is never
# held in memory.
total_people = 0
missing_birth = 0
earliest_birth = None
latest_birth = None
earliest_person_row = None  # one example row with the earliest birth year (used for Q5/Q6)

for people in iter_tsv_gz(paths["name.basics.tsv.gz"], usecols=people_cols, chunksize=300_000):
    # Columns arrive as strings; anything that is not a number becomes missing.
    birth_years = pd.to_numeric(people["birthYear"], errors="coerce")

    total_people += len(people)
    missing_birth += birth_years.isna().sum()

    chunk_min = birth_years.min()
    chunk_max = birth_years.max()

    if pd.notna(chunk_min) and (earliest_birth is None or chunk_min < earliest_birth):
        earliest_birth = float(chunk_min)
        # Keep the first person in this chunk that has that birth year.
        earliest_person_row = people.loc[birth_years.idxmin()].to_dict()

    if pd.notna(chunk_max) and (latest_birth is None or chunk_max > latest_birth):
        latest_birth = float(chunk_max)

pct_missing_birth = (missing_birth / total_people) * 100

print("Q2 total people:", total_people)
print("Q3 earliest birth year:", int(earliest_birth) if earliest_birth is not None else None)
print("Q7 most recent birth year:", int(latest_birth) if latest_birth is not None else None)
print("Q8 % without listed birth year:", pct_missing_birth)
print("\nEarliest person example (for Q5/Q6):")
earliest_person_row


Q2 total people: 14960705
Q3 earliest birth year: 4
Q7 most recent birth year: 2025
Q8 % without listed birth year: 95.57824982178313

Earliest person example (for Q5/Q6):


{'nconst': 'nm0784172',
 'primaryName': 'Lucio Anneo Seneca',
 'birthYear': '4',
 'deathYear': '65',
 'primaryProfession': 'writer',
 'knownForTitles': 'tt0043802,tt0218822,tt0049203,tt0972562'}

In [None]:
# ---- Q4: How many years ago was this person born? ----
# datetime.utcnow() is deprecated (it emits a DeprecationWarning on recent
# Python versions); ask for the current time as a timezone-aware UTC value.
from datetime import timezone

current_year = datetime.now(timezone.utc).year
# None when no birth year was found at all.
q4_years_ago = (current_year - int(earliest_birth)) if earliest_birth is not None else None
print("Q4 years ago:", q4_years_ago)


Q4 years ago: 2021


  current_year = datetime.utcnow().year


## Q5–Q6: Using only dataset data, is the earliest birth year "correct"?

We cannot verify historical truth from IMDB alone, but we *can* check **internal consistency**:

- If the person has `knownForTitles`, those titles should not have release years *before* the birth year (that would be suspicious).
- If data is missing, we report uncertainty.

Below we fetch the release years for the person's `knownForTitles` from `title.basics` by scanning in chunks.


In [None]:
def fetch_start_years_for_tconsts(tconst_set):
    """Look up ``startYear`` in title.basics for the given title ids.

    The file is scanned chunk by chunk and the scan stops as soon as every
    requested id has been seen, so no further chunk is decompressed and parsed
    once the answer is complete. An empty request returns immediately without
    opening the file.

    Args:
        tconst_set: Iterable of ``tconst`` ids to look up.

    Returns:
        Dict mapping each id that was found to its ``startYear`` as converted
        by ``pd.to_numeric`` (values that cannot be parsed stay missing). Ids
        that never appear in the file are absent from the dict.
    """
    remaining = set(tconst_set)
    found = {}
    if not remaining:
        return found

    title_cols_min = ["tconst", "startYear"]
    for chunk in iter_tsv_gz(paths["title.basics.tsv.gz"], usecols=title_cols_min, chunksize=400_000):
        sub = chunk[chunk["tconst"].isin(remaining)]
        if sub.empty:
            continue
        yrs = pd.to_numeric(sub["startYear"], errors="coerce")
        found.update(zip(sub["tconst"].tolist(), yrs.tolist()))
        remaining.difference_update(found)
        # Check right after processing so the next chunk is never read in vain.
        if not remaining:
            break
    return found

# Extract knownForTitles for the earliest person
known = []
if earliest_person_row:
    known_raw = earliest_person_row.get("knownForTitles")
    # The columns are read with the "string" dtype, so a missing value is
    # pd.NA. Truth-testing pd.NA raises TypeError, hence the explicit
    # pd.notna() check (which is also False for None).
    if pd.notna(known_raw):
        known = str(known_raw).split(",")

# Drop empty fragments and stringified missing values.
known = [k for k in known if k and k != "nan"]
known[:10], len(known)


(['tt0043802', 'tt0218822', 'tt0049203', 'tt0972562'], 4)

In [None]:
# Only scan title.basics when there is at least one title id to look up.
if known:
    start_years = fetch_start_years_for_tconsts(set(known))
else:
    start_years = {}
start_years


{'tt0043802': 1951, 'tt0049203': 1956, 'tt0218822': 2000, 'tt0972562': 2006}

In [None]:
# Birth year as an int, or None when no birth year was found at all.
birth = None if earliest_birth is None else int(earliest_birth)

# Known titles whose release year lies before the person's birth year.
inconsistencies = [
    (title_id, int(year))
    for title_id, year in start_years.items()
    if pd.notna(year) and birth is not None and int(year) < birth
]

print("Inconsistencies (titleYear < birthYear):", inconsistencies[:20])
print("Count inconsistencies:", len(inconsistencies))

# Q5 decision (dataset-only): the first condition that applies wins.
if birth is None:
    q5 = "Cannot determine (birth year missing)."
elif not known:
    q5 = "Uncertain: earliest person has no knownForTitles to cross-check; dataset-only verification is limited."
elif not start_years:
    q5 = "Uncertain: could not retrieve startYears for knownForTitles; dataset-only verification limited."
elif inconsistencies:
    q5 = "Potentially incorrect/suspicious: found known titles with release year earlier than birth year (dataset internal inconsistency)."
else:
    q5 = "Plausible: no internal inconsistencies found between birth year and known titles (dataset-only check)."

print("Q5 dataset-only check:", q5)


Inconsistencies (titleYear < birthYear): []
Count inconsistencies: 0
Q5 dataset-only check: Plausible: no internal inconsistencies found between birth year and known titles (dataset-only check).


### Q6 Explanation (write in markdown)

- We cannot prove historical correctness without external sources.
- We **validate internal consistency**: `birthYear` vs. `startYear` of `knownForTitles`.
- If we find `startYear < birthYear`, it indicates an inconsistency *within the dataset*.


In [None]:
# ---- Q9/Q10/Q11 from title.basics (chunks) ----
title_cols = ["tconst", "titleType", "primaryTitle", "startYear", "runtimeMinutes", "genres"]

# Running results across all chunks (None until a first value is seen).
max_short_after_1900 = None
min_movie_after_1900 = None
genres_set = set()

for chunk in iter_tsv_gz(paths["title.basics.tsv.gz"], usecols=title_cols, chunksize=300_000):
    # Columns arrive as strings; non-numeric entries become missing.
    start = pd.to_numeric(chunk["startYear"], errors="coerce")
    runtime = pd.to_numeric(chunk["runtimeMinutes"], errors="coerce")

    # Q9: longest runtime among shorts with startYear > 1900
    mask_short = (chunk["titleType"] == "short") & (start > 1900)
    if mask_short.any():
        mx = runtime[mask_short].max()
        if pd.notna(mx):
            max_short_after_1900 = mx if max_short_after_1900 is None else max(max_short_after_1900, mx)

    # Q10: shortest runtime among movies with startYear > 1900
    mask_movie = (chunk["titleType"] == "movie") & (start > 1900)
    if mask_movie.any():
        mn = runtime[mask_movie].min()
        if pd.notna(mn):
            min_movie_after_1900 = mn if min_movie_after_1900 is None else min(min_movie_after_1900, mn)

    # Q11 (genres)
    # Split each distinct genre string once per chunk instead of once per
    # title row: the resulting set is the same, but the Python-level loop
    # runs over far fewer values.
    for val in chunk["genres"].dropna().unique():
        for genre in str(val).split(","):
            if genre and genre != "nan":
                genres_set.add(genre)

print("Q9 longest short after 1900 (minutes):", int(max_short_after_1900) if max_short_after_1900 is not None else None)
print("Q10 shortest movie after 1900 (minutes):", int(min_movie_after_1900) if min_movie_after_1900 is not None else None)
print("Q11 number of genres:", len(genres_set))
print("Q11 genres:", sorted(genres_set))


Q9 longest short after 1900 (minutes): 1311
Q10 shortest movie after 1900 (minutes): 1
Q11 number of genres: 28
Q11 genres: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']


## Q12: Highest rated Comedy "movie" (tie-break by votes)

RAM-safe strategy:
1. Load `title.ratings` fully (usually smaller).
2. Scan `title.basics` in chunks to keep only **movie + Comedy** candidates (`tconst`, `primaryTitle`).
3. Join candidates with ratings and select:
   - max `averageRating`
   - tie-break: max `numVotes`


In [None]:
# Load ratings fully (usually OK on Colab)
with gzip.open(paths["title.ratings.tsv.gz"], mode="rt", encoding="utf-8") as ratings_file:
    df_ratings = pd.read_csv(ratings_file, sep="\t", na_values="\\N", dtype="string")

# Everything was read as text; turn the two measure columns into numbers
# (unparseable entries become missing).
for numeric_col in ("averageRating", "numVotes"):
    df_ratings[numeric_col] = pd.to_numeric(df_ratings[numeric_col], errors="coerce")

df_ratings.head()


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2189
1,tt0000002,5.5,309
2,tt0000003,6.4,2276
3,tt0000004,5.1,197
4,tt0000005,6.2,3014


In [None]:
# Scan title.basics for Comedy movies, keep only small candidate table
candidates = []

basics_chunks = iter_tsv_gz(
    paths["title.basics.tsv.gz"],
    usecols=["tconst", "titleType", "primaryTitle", "genres"],
    chunksize=400_000,
)
for chunk in basics_chunks:
    # A row qualifies when it is a movie and "Comedy" occurs in its genre
    # string (a missing genre string counts as no match).
    keep = (chunk["titleType"] == "movie") & chunk["genres"].fillna("").str.contains("Comedy", regex=False)
    sub = chunk[keep][["tconst", "primaryTitle"]]
    if not sub.empty:
        candidates.append(sub)

if candidates:
    df_candidates = pd.concat(candidates, ignore_index=True)
else:
    # No match at all: keep an empty frame with the expected columns.
    df_candidates = pd.DataFrame(columns=["tconst","primaryTitle"])
print("Comedy movie candidates:", len(df_candidates))
df_candidates.head()


Comedy movie candidates: 122485


Unnamed: 0,tconst,primaryTitle
0,tt0001028,Salome Mad
1,tt0001341,Jarní sen starého mládence
2,tt0002676,El bello Arturo
3,tt0002746,Checkers
4,tt0002798,Le dernier pardon


In [None]:
# Join candidates with ratings, then select best by rating then votes
# (candidates without a rating or vote count are dropped after the join).
df_comedy = (
    df_candidates
    .merge(df_ratings, on="tconst", how="left")
    .dropna(subset=["averageRating", "numVotes"])
)

# Highest rating first; equal ratings are ordered by vote count, highest first.
df_comedy_sorted = df_comedy.sort_values(
    by=["averageRating", "numVotes"],
    ascending=[False, False],
)
top_comedy = df_comedy_sorted.iloc[0]

print("Q12 Top-rated comedy movie:")
print(" - tconst:", top_comedy["tconst"])
print(" - title:", top_comedy["primaryTitle"])
print(" - averageRating:", float(top_comedy["averageRating"]))
print(" - numVotes:", int(top_comedy["numVotes"]))


Q12 Top-rated comedy movie:
 - tconst: tt32752452
 - title: Space Melody
 - averageRating: 10.0
 - numVotes: 6


## Q13: Director of the movie (RAM-safe scan of title.crew)

We scan `title.crew` until we find the row with the winning `tconst`, then map `nconst → primaryName` by scanning `name.basics`.


In [None]:
def find_directors_for_tconst(target_tconst):
    """Return the director ids listed in title.crew for one title.

    The file is scanned chunk by chunk and the scan ends at the first row
    whose ``tconst`` equals ``target_tconst``.

    Args:
        target_tconst: Title id to look for.

    Returns:
        List of ids from the comma-separated ``directors`` field, or an empty
        list when the title is not found or has no directors listed.
    """
    crew_chunks = iter_tsv_gz(
        paths["title.crew.tsv.gz"],
        usecols=["tconst", "directors"],
        chunksize=600_000,
    )
    for chunk in crew_chunks:
        matches = chunk[chunk["tconst"] == target_tconst]
        if matches.empty:
            continue
        # Only the first matching row is used.
        directors = matches.iloc[0]["directors"]
        if pd.isna(directors):
            return []
        return [x for x in str(directors).split(",") if x and x != "nan"]
    # Title id never seen in the file.
    return []

# Director id(s) of the Q12 winner; an empty list means title.crew has no
# directors for it. The bare name on the last line displays the value.
director_ids = find_directors_for_tconst(top_comedy["tconst"])
director_ids


['nm4492923']

In [None]:
def find_people_names(nconst_set):
    """Resolve person ids to names by scanning name.basics.

    The file is scanned chunk by chunk and the scan stops as soon as every
    requested id has been resolved, so no further chunk is decompressed and
    parsed once the answer is complete. An empty request returns immediately
    without opening the file.

    Args:
        nconst_set: Iterable of ``nconst`` ids to resolve.

    Returns:
        Dict mapping each id that was found to its ``primaryName``. Ids that
        never appear in the file are absent from the dict.
    """
    remaining = set(nconst_set)
    found = {}
    if not remaining:
        return found

    cols = ["nconst", "primaryName"]
    for chunk in iter_tsv_gz(paths["name.basics.tsv.gz"], usecols=cols, chunksize=600_000):
        sub = chunk[chunk["nconst"].isin(remaining)]
        found.update(zip(sub["nconst"].tolist(), sub["primaryName"].tolist()))
        remaining.difference_update(found)
        # Check right after processing so the next chunk is never read in vain.
        if not remaining:
            break
    return found

# Translate director ids to names, keeping the original order. An id without
# a resolved name is shown as the raw id.
director_map = find_people_names(set(director_ids))
director_names = []
for director_id in director_ids:
    director_names.append(director_map.get(director_id, director_id))

print("Q13 Director(s):", director_names)


Q13 Director(s): ['Leonardo Thimo']


## Q14: Alternate titles (RAM-safe scan of title.akas)

We scan `title.akas` and collect all alternate titles for the target `tconst`.


In [21]:
# Q14: Alternate titles (RAM-safe scan of title.akas)

import pandas as pd

def fetch_alternate_titles(title_id, chunksize=700_000):
    """Collect the title.akas rows that belong to one title.

    The whole file is scanned chunk by chunk; matching rows from every chunk
    are combined and exact duplicate rows are removed.

    Args:
        title_id: Value to match against the ``titleId`` column.
        chunksize: Number of rows read per chunk.

    Returns:
        DataFrame with the columns ``title``, ``region`` and ``language``;
        empty (with the same columns) when nothing matches.
    """
    wanted = ["title", "region", "language"]
    akas_chunks = iter_tsv_gz(
        paths["title.akas.tsv.gz"],
        usecols=["titleId", *wanted],
        chunksize=chunksize,
    )

    # Per-chunk selections, keeping only the chunks that matched at all.
    per_chunk = (chunk[chunk["titleId"] == title_id] for chunk in akas_chunks)
    matches = [hit for hit in per_chunk if len(hit) > 0]

    if not matches:
        return pd.DataFrame(columns=wanted)

    combined = pd.concat(matches, ignore_index=True)[wanted]
    return combined.drop_duplicates()

# Use the tconst you found in Q12 (top comedy), or set it manually:
target_tconst = top_comedy["tconst"]  # e.g. "tt0068646"
# Scans all of title.akas; the result holds title / region / language rows.
df_alt = fetch_alternate_titles(target_tconst)

print("Q14 total alternate titles:", len(df_alt))
# NOTE(review): `display` is not imported in this file; presumably the
# notebook (IPython) built-in. Outside a notebook it would need an import.
display(df_alt.head(30))


Q14 total alternate titles: 4


Unnamed: 0,title,region,language
0,Space Melody,,
1,H Melwdia Tou Diastimatos,GR,
2,Leonardo Thimo's Space Melody,CA,en
3,Space Melody,GR,




No charts were generated by quickchart
