<a href="https://colab.research.google.com/github/fpyaz/data-cleaning/blob/main/DataCleaning_PrimeVideo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the titles CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="amazon_prime_titles.csv")

# Preview the first five rows as a quick check of the load.
df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [None]:
# Print the per-column dtype and non-null summary.
df.info()

# Count the missing values in every column (displayed as the cell output).
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7585 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
dtypes: int64(1), object(11)
memory usage: 906.5+ KB


Unnamed: 0,0
show_id,0
type,0
title,0
director,2083
cast,1233
country,8996
date_added,9513
release_year,0
rating,337
duration,0


In [None]:
# Replace missing values in these text columns with the placeholder "Unknown".
for column_name in ("director", "cast", "country", "rating"):
    df[column_name] = df[column_name].fillna("Unknown")


In [None]:
# Parse the added-on dates; values that cannot be parsed become NaT.
df["date_added"] = df["date_added"].pipe(pd.to_datetime, errors="coerce")

In [None]:
# Split duration into its numeric part and its unit (e.g., "90 min" → 90 and "min").
# Column 0 of the extract result is the digits, column 1 is the unit word.
duration_parts = df["duration"].str.extract(r'(\d+)\s*(\w+)')
df["duration_int"] = pd.to_numeric(duration_parts[0], errors='coerce')
df["duration_type"] = duration_parts[1]


In [None]:
text_cols = ["title", "type", "country", "listed_in", "director"]

# Normalise each listed column: lower-case first, then trim surrounding whitespace.
df[text_cols] = df[text_cols].apply(lambda series: series.str.lower().str.strip())


In [None]:
# Turn the ", "-separated genre string into a list of genre names.
genre_lists = df["listed_in"].str.split(", ")
df["genres"] = genre_lists


In [None]:
def _first_genre(genre_list):
    """Return the first entry of a genre list, or "unknown" for non-list values."""
    if isinstance(genre_list, list):
        return genre_list[0]
    return "unknown"


def _first_country(country):
    """Return the text before the first comma; "unknown" is passed through as-is."""
    if country == "unknown":
        return country
    return country.split(",")[0]


# Primary genre = first listed genre
df["main_genre"] = df["genres"].apply(_first_genre)

# Primary country = first listed country
df["main_country"] = df["country"].apply(_first_country)


In [None]:
# Break the parsed date into separate calendar year and month columns.
added_on = df["date_added"].dt
df["year_added"] = added_on.year
df["month_added"] = added_on.month


In [None]:
def classify_length(row):
    """Bucket a title into a length category from its type and duration.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row["type"]``. ``row["duration_int"]`` is read only
        when the type is ``"movie"`` or ``"tv show"``.

    Returns
    -------
    str
        For ``"movie"``: ``"short"`` (< 40), ``"medium"`` (40-90 inclusive)
        or ``"long"`` (> 90).
        For ``"tv show"``: ``"mini-series"`` (<= 1), ``"short series"``
        (<= 3) or ``"long series"`` (> 3).
        ``"unknown"`` for any other type, or when the duration is missing.
    """
    content_type = row["type"]
    if content_type not in ("movie", "tv show"):
        return "unknown"

    duration = row["duration_int"]
    # A missing duration (NaN) compares False against every threshold, so
    # without this guard it would fall through to "long" / "long series".
    if pd.isna(duration):
        return "unknown"

    if content_type == "movie":
        if duration < 40:
            return "short"
        if duration <= 90:
            return "medium"
        return "long"

    # Remaining case: content_type == "tv show"
    if duration <= 1:
        return "mini-series"
    if duration <= 3:
        return "short series"
    return "long series"

# Label every row with its length bucket (row-wise apply of classify_length).
length_labels = df.apply(classify_length, axis=1)
df["length_category"] = length_labels


In [None]:
# Leave the list-valued 'genres' column out of the duplicate check and
# compare rows on every other column.
dedupe_columns = [name for name in df.columns if name != "genres"]
df = df.drop_duplicates(subset=dedupe_columns)


In [None]:
# Write the cleaned frame to CSV, omitting the index column.
df.to_csv(path_or_buf="cleaned_amazon_prime.csv", index=False)

In [None]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_int,duration_type,genres,main_genre,main_country,year_added,month_added,length_category
0,s1,movie,the grand seduction,don mckellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",canada,2021-03-30,2014,Unknown,113 min,"comedy, drama",A small fishing village must procure a local d...,113,min,"[comedy, drama]",comedy,canada,2021.0,3.0,long
1,s2,movie,take care good night,girish joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",india,2021-03-30,2018,13+,110 min,"drama, international",A Metro Family decides to fight a Cyber Crimin...,110,min,"[drama, international]",drama,india,2021.0,3.0,long
2,s3,movie,secrets of deception,josh webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",united states,2021-03-30,2017,Unknown,74 min,"action, drama, suspense",After a man discovers his wife is cheating on ...,74,min,"[action, drama, suspense]",action,united states,2021.0,3.0,medium
3,s4,movie,pink: staying true,sonia anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",united states,2021-03-30,2014,Unknown,69 min,documentary,"Pink breaks the mold once again, bringing her ...",69,min,[documentary],documentary,united states,2021.0,3.0,medium
4,s5,movie,monster maker,giles foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",united kingdom,2021-03-30,1989,Unknown,45 min,"drama, fantasy",Teenage Matt Banting wants to work with a famo...,45,min,"[drama, fantasy]",drama,united kingdom,2021.0,3.0,medium
