<a href="https://colab.research.google.com/github/datasriram/task-1-netflix-cleaning/blob/main/netflix_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# STEP 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# STEP 2: Load the Netflix Dataset
# NOTE(review): the account segment 'sriyourgithubusername' looks like an
# unfilled placeholder (the Colab badge at the top of this file points at the
# 'datasriram' account) - confirm the URL resolves, or switch to a local path.
url = 'https://raw.githubusercontent.com/sriyourgithubusername/task-1-netflix-cleaning/main/netflix_titles.csv'  # <-- replace later if needed
df = pd.read_csv(url)  # OR use df = pd.read_csv('/path/to/netflix_titles.csv') if uploading

# STEP 3: Basic Info
# Quick look at the first rows, column dtypes, summary statistics, and the
# number of missing values per column.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
print(df.describe())
print(df.isnull().sum())

# STEP 4: Handle Missing Values
# Fill missing values or drop them based on column type.
# The filled columns are assigned back rather than calling
# df[col].fillna(..., inplace=True): that form mutates an object selected out
# of the frame, which recent pandas warns about and which no longer updates
# df once Copy-on-Write is in effect.
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Not Available')

# Use the most frequent country as the fill value. mode() returns an empty
# Series when every value is missing, so guard the lookup instead of letting
# [0] raise.
country_modes = df['country'].mode()
if not country_modes.empty:
    df['country'] = df['country'].fillna(country_modes.iloc[0])

# Rows with no date_added are removed outright rather than filled.
df.dropna(subset=['date_added'], inplace=True)

# STEP 5: Convert 'date_added' to datetime
# Strip surrounding whitespace before parsing: pandas infers one format from
# the first value and applies it strictly, so an entry with a stray leading or
# trailing space (e.g. ' August 4, 2017') would otherwise raise ValueError.
# NOTE(review): assumes date_added is still a string column at this point
# (as read from the CSV) - confirm if the loader ever parses dates itself.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip())

# STEP 6: Feature Engineering
# Pull the calendar year and month out of the parsed 'date_added' column.
added_on = df['date_added'].dt
df['year_added'] = added_on.year
df['month_added'] = added_on.month

# STEP 7: Encoding 'type' column (Movie/TV Show)
# Series.map leaves NaN for any 'type' value that is not a key of the mapping.
type_codes = {'Movie': 0, 'TV Show': 1}
df['type_encoded'] = df['type'].map(type_codes)

# STEP 8: Visualizing Outliers (Duration may have noise)
# 'duration' holds text such as '90 min', so it must be converted to numbers
# before a boxplot can summarise its distribution. Keep only rows whose
# duration mentions 'min' (na=False treats missing durations as non-matches),
# then extract the leading run of digits as the length in minutes.
# The numeric values live in a local Series so df itself is left unchanged.
movie_mask = df['duration'].str.contains('min', na=False)
movie_minutes = (
    df.loc[movie_mask, 'duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

sns.boxplot(x=movie_minutes, color='orange')
# The axis is numeric now, so the 90-degree tick rotation that the per-string
# tick labels needed is no longer required.
plt.xlabel('Duration (minutes)')
plt.title("Duration Boxplot (Only Movies)")
plt.show()

# STEP 9: Save cleaned data (optional)
# Write the frame to a CSV in the working directory, leaving out the row index.
df.to_csv(path_or_buf='cleaned_netflix_data.csv', index=False)
