In [1]:
import ast
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configurations
# Text columns that are cleaned and later merged into the 'features' string.
feature_columns = ['overview', 'genres', 'keywords', 'production_companies']
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
# Only rows whose 'original_language' is in this list are kept.
allowed_languages = ['en', 'hi', 'mr', 'ta', 'te', 'ml', 'kn']
# Columns retained after filtering.
required_columns = ['id', 'title', 'overview', 'genres', 'keywords', 'production_companies']

In [3]:
# Functions
def clean_title(text):
    """Return a sanitised title, or NaN when nothing usable remains.

    Missing input (as judged by ``pd.isna``) yields NaN. Otherwise the value
    is converted to ``str``; quotes, hyphens, plus signs, pipes and spaces are
    trimmed from both ends; and every character that is not an ASCII letter,
    a digit, whitespace or a hyphen is removed. If the result contains no
    ASCII letter or digit, NaN is returned instead of the string.
    """
    if pd.isna(text):
        return np.nan

    trimmed = str(text).strip('\'\"-+| ')
    sanitized = re.sub(r'[^a-zA-Z0-9\s-]', '', trimmed)

    # A title made only of whitespace/hyphens carries no information.
    if re.search(r'[a-zA-Z0-9]', sanitized) is None:
        return np.nan
    return sanitized

def clean_features(text):
    """Return a tidied free-text field, or NaN when nothing usable remains.

    Used for the overview as well as the genres, keywords and production
    company columns. Missing input (as judged by ``pd.isna``) yields NaN.
    Otherwise the value is converted to ``str``; hyphens, plus signs,
    exclamation marks, quotes and spaces are trimmed from both ends; and all
    parentheses, square brackets, quotes and ``#`` characters are removed.
    If the result contains no ASCII letter or digit, NaN is returned.
    """
    if pd.isna(text):
        return np.nan

    trimmed = str(text).strip('-+!\'\" ')
    without_markup = re.sub(r'[\(\)\[\]\'\"\#]', '', trimmed)

    # Punctuation-only leftovers are treated as missing.
    if re.search(r'[a-zA-Z0-9]', without_markup) is None:
        return np.nan
    return without_markup

def combine_features(row):
    """Build one descriptive sentence block from a row's text fields.

    ``row`` must support lookup by the keys 'overview', 'genres', 'keywords'
    and 'production_companies'. Returns a single string in which each field
    is introduced by a short label and terminated by a full stop.
    """
    overview = row['overview']
    genres = row['genres']
    keywords = row['keywords']
    companies = row['production_companies']
    return (
        f"Overview: {overview}. "
        f"Genres include: {genres}. "
        f"Keywords are: {keywords}. "
        f"Produced by: {companies}."
    )

In [4]:
# Read the data
# Only the columns used below are parsed: the two filter columns plus the
# required output columns. Skipping the rest of the CSV saves memory and
# parse time. Column order in the loaded frame follows the file; the next
# cell reorders to `required_columns`.
df = pd.read_csv(
    'Datasets/Source/TMDB_movie_dataset_v11.csv',
    usecols=required_columns + ['original_language', 'status'],
)

In [5]:
# Filter and select required columns
# Keep released titles whose original language is in `allowed_languages`.
is_allowed_language = df['original_language'].isin(allowed_languages)
is_released = df['status'] == 'Released'
# One .loc call picks rows and columns together, and .copy() hands the later
# cells an independent frame, so their column assignments do not trigger
# pandas' SettingWithCopyWarning.
df = df.loc[is_allowed_language & is_released, required_columns].copy()

In [6]:
# Discard every row that has a missing value in any of the selected columns
df = df.dropna()

In [8]:
# Clean text fields
df['title'] = df['title'].apply(clean_title)
# All remaining text columns share one cleaner. Iterating over the configured
# `feature_columns` keeps the column list in a single place instead of
# repeating the call once per column.
for column in feature_columns:
    df[column] = df[column].apply(clean_features)

In [9]:
# Drop any new nulls created after cleaning and remove duplicates
# The cleaners return NaN for values with no usable text, so a second dropna
# is needed. Afterwards the rows are renumbered 0..n-1: the embeddings are
# later built from a plain list of this frame's rows, so they are positional,
# and a contiguous index keeps the pickled frame's row labels equal to the
# matching embedding row numbers.
df = df.dropna().drop_duplicates().reset_index(drop=True)

In [10]:
# Save intermediate cleaned data
# Create the output folder first so the write does not fail on a fresh
# checkout where 'Datasets/cleaned' does not exist yet.
Path('Datasets/cleaned').mkdir(parents=True, exist_ok=True)
df.to_csv('Datasets/cleaned/cleaned.csv', index=False)

In [11]:
# Add the 'features' column: one combined text string per row, built by
# combine_features from the overview, genres, keywords and companies.
df = df.assign(features=df.apply(combine_features, axis=1))

In [12]:
# Write the frame, now including the combined 'features' column, to CSV
# (row index omitted).
df.to_csv(path_or_buf='Datasets/cleaned/processed.csv', index=False)

In [13]:
# Encode the combined 'features' text of every row with the Sentence
# Transformers model; results come back in the same order as the input list.
feature_texts = df['features'].tolist()
embeddings = model.encode(feature_texts, show_progress_bar=True)

Batches: 100%|█████████████████████████████████████████████| 3241/3241 [05:04<00:00, 10.66it/s]


In [14]:
# Pickle the processed DataFrame and its embeddings to two separate files.
# Both files are opened before anything is written, as in a single combined
# `with` statement.
with open('Datasets/cleaned/df.pkl', 'wb') as frame_file:
    with open('Datasets/cleaned/embeddings.pkl', 'wb') as embeddings_file:
        pickle.dump(df, frame_file)
        pickle.dump(embeddings, embeddings_file)