In [1]:
import pandas as pd  # Importing pandas for data handling

# Load the Netflix dataset
df = pd.read_csv("netflix_titles.csv")

# ✅ Step 1: Create a new column that tells us whether the row is a Movie (1) or a TV Show (0)
df['is_movie'] = df['type'].apply(lambda x: 1 if x == 'Movie' else 0)
# We apply a function: if the type is 'Movie', we write 1, otherwise 0


In [3]:
# ✅ Step 2: Create a new column for the release decade (like 2000s, 2010s)
df['release_decade'] = (df['release_year'] // 10) * 10
# This divides the year by 10 and multiplies back to get decade: 2015 becomes 2010, 2008 becomes 2000


In [5]:

# ✅ Step 3: Extract only the number of minutes from the 'duration' text column
df['duration_mins'] = df['duration'].str.extract(r'(\d+)').astype(float)
# Example: "90 min" becomes 90.0, "45 min" becomes 45.0



In [7]:
# ✅ Step 4: Create a new column 'is_long' that tells if a movie is long (90 mins or more)
df['is_long'] = df['duration_mins'].apply(lambda x: 1 if x >= 90 else 0)
# This will help to classify movies as long or short


In [9]:

# ✅ Step 5: Keep only the top 5 most common 'rating' values, and group others as 'Other'
top_ratings = df['rating'].value_counts().head(5).index  # Top 5 ratings only

# Replace less common ratings with 'Other'
df['rating_cleaned'] = df['rating'].apply(lambda x: x if x in top_ratings else 'Other')

# Apply one-hot encoding to the cleaned rating column
df_encoded = pd.get_dummies(df, columns=['rating_cleaned'])
# This creates a new column for each rating like G, PG, TV-MA etc. with 0 or 1


In [11]:

# ✅ Step 6: Drop columns that are not useful anymore
df_encoded = df_encoded.drop(['show_id', 'description', 'duration', 'type'], axis=1)
# 'show_id' is just an ID, 'description' is long text, 'duration' is already converted, and 'type' is now in 'is_movie'

# ✅ Final check: print the shape and first few rows of the dataset
print("Final dataset shape:", df_encoded.shape)
df_encoded.head()

Final dataset shape: (8807, 18)


Unnamed: 0,title,director,cast,country,date_added,release_year,rating,listed_in,is_movie,release_decade,duration_mins,is_long,rating_cleaned_Other,rating_cleaned_PG-13,rating_cleaned_R,rating_cleaned_TV-14,rating_cleaned_TV-MA,rating_cleaned_TV-PG
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,Documentaries,1,2020,90.0,1,False,True,False,False,False,False
1,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries",0,2020,2.0,0,False,False,False,False,True,False
2,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",0,2020,1.0,0,False,False,False,False,True,False
3,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,"Docuseries, Reality TV",0,2020,1.0,0,False,False,False,False,True,False
4,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",0,2020,2.0,0,False,False,False,False,True,False
