In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the raw movie table from disk.
dataset = pd.read_csv('movies.csv')

# Mean-impute every numeric column: fit the imputer and apply it in a single
# pass, then write the filled values back over the original columns.
# NOTE(review): this runs on the full table before any train/test split and
# covers every numeric column -- presumably including 'RATING' if that column
# is numeric; confirm that imputing the target this way is intended.
numerical_columns = dataset.select_dtypes(include=[np.number]).columns.tolist()
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
dataset[numerical_columns] = imputer.fit_transform(dataset[numerical_columns])

# Reduce 'YEAR' to the first run of four digits found in each value; rows
# without such a run become missing. With a single capture group,
# expand=False hands back a Series that is assigned straight to the column.
year_text = dataset['YEAR']
dataset['YEAR'] = year_text.str.extract(r'(\d{4})', expand=False)


In [2]:
# Preprocess 'ONE-LINE' column
def preprocess_text(text):
    """Lower-case a single text value.

    Parameters
    ----------
    text : object
        Normally a ``str``. Any other value (for example the float NaN that
        pandas uses for a missing cell, or ``None``) is accepted as well.

    Returns
    -------
    object
        The lower-cased string, or ``text`` itself, unchanged, when it is
        not a string.
    """
    # Non-string values have no .lower(); pass them through so that a single
    # missing description does not abort an element-wise .apply over a column.
    if not isinstance(text, str):
        return text
    return text.lower()

# Run preprocess_text over every value of 'ONE-LINE' and store the result
# back in the same column.
one_line_raw = dataset['ONE-LINE']
dataset['ONE-LINE'] = one_line_raw.apply(preprocess_text)

# Preprocess 'STARS' column and create binary columns
def extract_stars(stars_column):
    """Build a ``{name: 1}`` indicator mapping from one 'STARS' cell.

    Parameters
    ----------
    stars_column : object
        The raw text of a single cell. Non-string values (e.g. the float NaN
        pandas uses for a missing cell, or ``None``) are treated as "no names".

    Returns
    -------
    dict
        One key per distinct matched name, in order of first appearance,
        each mapped to ``1``. Empty when nothing matches or the input is not
        a string.

    Notes
    -----
    A "name" is exactly two capitalised ASCII words separated by one space.
    Anything else is not recognised as such: a three-part name only yields
    its first two words, and hyphenated, apostrophised or accented names are
    missed or truncated. Any other two capitalised words in the cell text
    match as well, whether or not they name a star.
    """
    # re.findall raises TypeError on non-strings; a missing cell simply
    # contributes no names instead of aborting the whole column.
    if not isinstance(stars_column, str):
        return {}
    stars_list = re.findall(r'[A-Z][a-z]+ [A-Z][a-z]+', stars_column)
    # dict.fromkeys keeps first-seen order and collapses repeated names.
    return dict.fromkeys(stars_list, 1)

# Expand each row's {star: 1} mapping into one indicator column per star;
# a 0 marks every star that was not matched in that row.
stars_df = (
    dataset['STARS']
    .apply(extract_stars)
    .apply(pd.Series)
    .fillna(0)
)

# Place the indicator columns to the right of the existing ones, then leave
# the raw 'STARS' text column out of the combined frame.
dataset_with_stars = (
    pd.concat([dataset, stars_df], axis=1)
    .drop(columns=['STARS'])
)

In [3]:
# Replace each distinct 'GENRE' value with an integer code.
labelencoder = LabelEncoder()
genre_codes = labelencoder.fit_transform(dataset_with_stars['GENRE'])
dataset_with_stars['GENRE'] = genre_codes

# Dependent variable: the 'RATING' column.
Y = dataset_with_stars['RATING']

# Independent variables: every column except 'MOVIES' and 'RATING'.
X = dataset_with_stars.drop(['MOVIES', 'RATING'], axis=1)

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible. The target halves get real names instead of `_`:
# assigning to `_` in IPython/Jupyter shadows the last-output variable, and
# the targets belonging to each feature split would otherwise be lost.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# Save preprocessed training data. X_train is already a DataFrame, so take an
# independent copy rather than re-wrapping it in the DataFrame constructor.
X_train_preprocessed = X_train.copy()
X_train_preprocessed.to_csv('X_train_preprocessed_movies.csv', index=False)

# Save preprocessed testing data.
X_test_preprocessed = X_test.copy()
X_test_preprocessed.to_csv('X_test_preprocessed_movies.csv', index=False)

# NOTE(review): only the feature frames are written to disk; Y_train and
# Y_test stay in memory. Because index=False drops the row index, the targets
# cannot be re-joined to these files later -- confirm whether they should be
# exported as well.
print("Preprocessed data saved successfully!")

Preprocessed data saved successfully!
