In [6]:
import pandas as pd
import os

# Earliest startYear to keep; rows with an older or unparseable year are dropped.
MIN_START_YEAR = 2010


def load_movies(csv_path, min_start_year=MIN_START_YEAR):
    """Load the combined movies CSV and keep titles from ``min_start_year`` onward.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file; it must contain a ``startYear`` column.
    min_start_year : int, optional
        Inclusive lower bound applied to ``startYear``.

    Returns
    -------
    pandas.DataFrame or None
        The filtered rows as an independent copy, or ``None`` (after printing
        an error message) when ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        print(f"Error: File not found - {csv_path}")
        return None

    loaded = pd.read_csv(csv_path)
    # Values that cannot be parsed as numbers become NaN; NaN never satisfies
    # ">=", so those rows are removed by the filter below as well.
    loaded["startYear"] = pd.to_numeric(loaded["startYear"], errors="coerce")
    # .copy() detaches the result from the unfiltered frame so that later
    # column assignments do not raise SettingWithCopyWarning.
    return loaded[loaded["startYear"] >= min_start_year].copy()


movies_csv_file = os.path.join("movies_combined_wide.csv")
movies_df = load_movies(movies_csv_file)

# Report success and preview the frame only when something was actually loaded;
# previously a missing file still printed "success" and then crashed on None.head().
if movies_df is not None:
    print("Data loaded successfully")
    print(movies_df.head())


Data loaded successfully
          tconst titleType                      primaryTitle  \
11631  tt0011801     movie                  Tötet nicht mehr   
49483  tt0050396     short                     Final Curtain   
55746  tt0056840     short                          Aufsätze   
56269  tt0057369     short  Number 14: Late Superimpositions   
59190  tt0060361     short                          EMS nr 1   

                          originalTitle  isAdult  startYear  endYear  \
11631                  Tötet nicht mehr     True     2019.0      NaN   
49483                     Final Curtain     True     2012.0      NaN   
55746                          Aufsätze     True     2021.0      NaN   
56269  Number 14: Late Superimpositions     True     2023.0      NaN   
59190                          EMS nr 1     True     2016.0      NaN   

      runtimeMinutes        genres  averageRating  numVotes  \
11631             \N  Action,Crime            NaN       NaN   
49483             22  Horror,Sh

In [7]:
# Columns of the names file that are not needed downstream
drop_columns = ["birthYear", "deathYear", "primaryProfession", "knownForTitles"]


def load_names(tsv_path, unwanted_columns=tuple(drop_columns)):
    """Load the tab-separated names file without the unwanted columns.

    The unwanted columns are skipped while parsing instead of being read and
    dropped afterwards, so they never occupy memory. Columns listed in
    ``unwanted_columns`` that are absent from the file are simply ignored.

    Parameters
    ----------
    tsv_path : str
        Path to the TSV file.
    unwanted_columns : iterable of str, optional
        Column names to leave out of the result.

    Returns
    -------
    pandas.DataFrame or None
        The remaining columns in file order, or ``None`` (after printing an
        error message) when ``tsv_path`` does not exist.
    """
    if not os.path.exists(tsv_path):
        print(f"Error: File not found - {tsv_path}")
        return None

    skipped = set(unwanted_columns)
    return pd.read_csv(tsv_path, sep="\t", usecols=lambda col: col not in skipped)


# Read the TSV file if it exists
names_tsv_file = os.path.join("name.basics.tsv")
names_df = load_names(names_tsv_file)


In [10]:
if movies_df is not None and names_df is not None:
    # Build the lookup {nconst: primaryName}. Rows without a primaryName are
    # left out so the lookup falls back to the raw nconst; str() on a missing
    # value would otherwise put the literal text "nan" into the movies frame.
    named_people = names_df.dropna(subset=["primaryName"])
    name_mapping = {
        str(nconst): str(name)
        for nconst, name in zip(named_people["nconst"], named_people["primaryName"])
    }

    def replace_nconst_with_name(nconst_list):
        """Translate a comma-separated string of nconst ids into names.

        Parameters
        ----------
        nconst_list : str or scalar
            One cell value holding ids separated by commas. Non-string values
            are converted with ``str`` first.

        Returns
        -------
        str or None
            The ids replaced by their names from ``name_mapping`` and joined
            with ","; ids missing from the mapping are kept unchanged.
            ``None`` when the cell value is missing.
        """
        if pd.isna(nconst_list):  # missing cell stays missing
            return None
        tokens = (token.strip() for token in str(nconst_list).split(","))
        return ",".join(name_mapping.get(token, token) for token in tokens)

    # Replace nconst values in whichever of the credit columns are present
    for column in ["actor", "actress", "director", "producer", "writer"]:
        if column in movies_df.columns:
            movies_df[column] = movies_df[column].apply(replace_nconst_with_name)

    # Display the updated DataFrame
    print("Updated Movies DataFrame with Names:")
    print(movies_df.head())

    # Optionally, save the updated DataFrame to a new CSV file.
    # NOTE(review): a later cell reads 'movies_with_names.csv'; with this line
    # disabled that file is never regenerated by the notebook — confirm intent.
    #movies_df.to_csv(os.path.join("movies_with_names.csv"), index=False)

Updated Movies DataFrame with Names:
          tconst titleType                      primaryTitle  \
11631  tt0011801     movie                  Tötet nicht mehr   
49483  tt0050396     short                     Final Curtain   
55746  tt0056840     short                          Aufsätze   
56269  tt0057369     short  Number 14: Late Superimpositions   
59190  tt0060361     short                          EMS nr 1   

                          originalTitle  isAdult  startYear  endYear  \
11631                  Tötet nicht mehr     True     2019.0      NaN   
49483                     Final Curtain     True     2012.0      NaN   
55746                          Aufsätze     True     2021.0      NaN   
56269  Number 14: Late Superimpositions     True     2023.0      NaN   
59190                          EMS nr 1     True     2016.0      NaN   

      runtimeMinutes        genres  averageRating  numVotes  \
11631             \N  Action,Crime            NaN       NaN   
49483             2

In [3]:

import pandas as pd

# Maximum number of movie rows written to the subset file.
# NOTE(review): the variable and file names say "5000" but the limit is 500000 —
# confirm which one is intended.
MOVIE_ROW_LIMIT = 500000

# Read the titles file and keep only the rows whose titleType is 'movie'
df = pd.read_csv('movies_with_names.csv')
is_movie = df['titleType'] == 'movie'
movies_df = df[is_movie]

# Take the leading rows, up to the limit
movies_first_5000 = movies_df.iloc[:MOVIE_ROW_LIMIT]

# Persist the subset (without the index column) for the next step
movies_first_5000.to_csv('first_5000_movies.csv', index=False)

In [4]:
import pandas as pd
import csv
from io import StringIO

# Read the movie subset from the CSV file
df = pd.read_csv('first_5000_movies.csv')

# Serialise the frame to text with '^' as the delimiter, quoting every field
output = StringIO()
df.to_csv(output, sep='^', index=False, quoting=csv.QUOTE_ALL)
result_string = output.getvalue()

# Save the resulting string to a new file.
# encoding='utf-8': the data contains non-ASCII titles, so the result must not
#   depend on the platform's default codec.
# newline='': the text already carries the line terminators pandas wrote;
#   disabling translation stops open() from expanding them a second time
#   (which would yield "\r\r\n" where os.linesep is "\r\n").
with open('output_movies.txt', 'w', encoding='utf-8', newline='') as f:
    f.write(result_string)

print("Data has been saved to output_movies.txt")

Data has been saved to output_movies.txt
