## Imports

In [1]:
import pandas as pd
import numpy as np
import re

## Read DataSet:

In [2]:
df = pd.read_csv(r'D:\Afra Zaib\Semester 6th\ARS\TwoStageMovieRecommender\MovieRecommender\MovieDataset\movies.csv')
df.head(5)

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path
0,Top Gun: Maverick,"$170,000,000 (estimated)",8.6,187K,"['Action', 'Drama']",After more than thirty years of service as one...,"['fighter jet', 'sequel', 'u.s. navy', 'fighte...",Joseph Kosinski,"['Jack Epps Jr.', 'Peter Craig', 'Tom Cruise',...",Jim Cash,-2022,/title/tt1745960/
1,Jurassic World Dominion,2 hours 27 minutes,6.0,56K,"['Action', 'Adventure', 'Sci-Fi']",Four years after the destruction of Isla Nubla...,"['dinosaur', 'jurassic park', 'tyrannosaurus r...",Colin Trevorrow,"['Colin Trevorrow', 'Derek Connolly', 'Chris P...",Emily Carmichael,-2022,/title/tt8041270/
2,Top Gun,"$15,000,000 (estimated)",6.9,380K,"['Action', 'Drama']",As students at the United States Navy's elite ...,"['pilot', 'male camaraderie', 'u.s. navy', 'gr...",Tony Scott,"['Jack Epps Jr.', 'Ehud Yonay', 'Tom Cruise', ...",Jim Cash,-1986,/title/tt0092099/
3,Lightyear,"$71,101,257",5.2,32K,"['Animation', 'Action', 'Adventure']",While spending years attempting to return home...,"['galaxy', 'spaceship', 'robot', 'rocket', 'sp...",Angus MacLane,"['Jason Headley', 'Matthew Aldrich', 'Chris Ev...",Angus MacLane,-2022,/title/tt10298810/
4,Spiderhead,not-released,5.4,23K,"['Action', 'Crime', 'Drama']","In the near future, convicts are offered the c...","['discover', 'medical', 'test', 'reality', 'fi...",Joseph Kosinski,"['Rhett Reese', 'Paul Wernick', 'Chris Hemswor...",George Saunders,-2022,/title/tt9783600/


## Clean 'Run Time' column:

In [3]:
df['Run Time'] = df['Run Time'].astype(str).apply(lambda x: re.sub(r'[^0-9]', '', x))
df['Run Time'] = pd.to_numeric(df['Run Time'], errors='coerce')

##  Clean 'Rating' column:

In [4]:
df['Rating'] = df['Rating'].replace(['NoRating', 'N/A', 'NA', '-', 'none', 'None'], np.nan)
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

## Clean 'year' column:

In [5]:
df['year'] = df['year'].astype(str).apply(lambda x: re.sub(r'[^0-9]', '', x))
df['year'] = pd.to_numeric(df['year'], errors='coerce')


## Clean 'User Rating' column:

In [6]:
def clean_user_rating(x):
    x = str(x).strip()
    if 'K' in x:
        return float(x.replace('K', '')) * 1000
    else:
        return pd.to_numeric(x, errors='coerce')

df['User Rating'] = df['User Rating'].apply(clean_user_rating)

## Handle missing important columns Title:

In [7]:
df = df.dropna(subset=['movie title'])

## Fill missing 'Run Time' with median:

In [8]:
if df['Run Time'].isnull().sum() > 0:
    df['Run Time'] = df['Run Time'].fillna(df['Run Time'].median())

## Fill missing 'Rating' with 0:

In [9]:
df['Rating'] = df['Rating'].fillna(0)

## Fill missing 'year' with most common year:

In [10]:
if df['year'].isnull().sum() > 0:
    df['year'] = df['year'].fillna(df['year'].mode()[0])

## Fill missing values in less important columns:

In [11]:
columns_to_fill = ['Genres', 'Overview', 'Plot Kyeword', 'Director', 'Top 5 Casts', 'Writer']
for col in columns_to_fill:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')


## Remove duplicate movie entries:

In [12]:
df = df.drop_duplicates()

## Reset index:

In [13]:
df = df.reset_index(drop=True)
print("Final Shape:", df.shape)
print(df.head())

Final Shape: (24402, 12)
               movie title     Run Time  Rating  User Rating  \
0        Top Gun: Maverick  170000000.0     8.6     187000.0   
1  Jurassic World Dominion        227.0     6.0      56000.0   
2                  Top Gun   15000000.0     6.9     380000.0   
3                Lightyear   71101257.0     5.2      32000.0   
4               Spiderhead        145.0     5.4      23000.0   

                                Generes  \
0                   ['Action', 'Drama']   
1     ['Action', 'Adventure', 'Sci-Fi']   
2                   ['Action', 'Drama']   
3  ['Animation', 'Action', 'Adventure']   
4          ['Action', 'Crime', 'Drama']   

                                            Overview  \
0  After more than thirty years of service as one...   
1  Four years after the destruction of Isla Nubla...   
2  As students at the United States Navy's elite ...   
3  While spending years attempting to return home...   
4  In the near future, convicts are offered the c..

## Save cleaned dataset:

In [14]:
df.to_csv('cleaned_movies.csv', index=False)