In [1]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the raw movie metadata table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="movie_metadata.csv")

In [3]:
# Columns retained from the raw table, in output order: the title, the
# descriptive text fields, the three billed actors, and the director.
columns_to_keep = (
    ['movie_title']
    + ['genres', 'plot_keywords']
    + ['actor_1_name', 'actor_2_name', 'actor_3_name']
    + ['director_name']
)

In [4]:
# Narrow the frame to the retained columns and replace missing values with
# empty strings so later string operations never see NaN.
df = df.loc[:, columns_to_keep].fillna(value='')

In [5]:
def clean_text(text):
    """Return ``text`` as lowercase text with separators turned into spaces.

    The value is first converted with ``str``. Every non-breaking space
    (U+00A0) and every pipe character is replaced by one regular space,
    then leading/trailing whitespace is stripped and the result lowercased.
    Runs of interior spaces are left as they are.
    """
    normalized = str(text)
    for unwanted in ('\xa0', '|'):
        normalized = normalized.replace(unwanted, ' ')
    return normalized.strip().lower()

# Run clean_text over every value of each retained column, writing the
# normalized column back under the same name.
for column_name in columns_to_keep:
    df[column_name] = df[column_name].map(clean_text)

In [6]:
# Build one space-separated text blob per movie from the descriptive columns
# (movie_title is not included, matching the original expression).
# Series.str.cat joins the columns in a single vectorized pass instead of
# calling a Python lambda once per row through DataFrame.apply(axis=1).
feature_columns = [
    'genres',
    'plot_keywords',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'director_name',
]
# astype(str) mirrors the implicit str() conversion the f-string performed;
# an earlier cell fills missing values with '', so no NaN is expected here.
feature_text = df[feature_columns].astype(str)
df['combined_features'] = feature_text[feature_columns[0]].str.cat(
    [feature_text[name] for name in feature_columns[1:]],
    sep=' ',
)

In [7]:
# Fit a TF-IDF vectorizer (English stop words removed) on the combined text
# and keep both the fitted vectorizer and the resulting document-term matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=df['combined_features'])

In [8]:
# Persist the three artifacts into the working directory: the cleaned table
# as CSV (without the index column), then the fitted vectorizer and the
# TF-IDF matrix as joblib pickles.
df.to_csv(path_or_buf="movies_cleaned.csv", index=False)
joblib.dump(value=tfidf, filename="tfidf_vectorizer.pkl")
# Left as the trailing bare expression so the cell still displays the list of
# written filenames returned by joblib.dump.
joblib.dump(value=tfidf_matrix, filename="tfidf_matrix.pkl")

['tfidf_matrix.pkl']