In [None]:
# Colab-only step: open the browser file picker so the dataset CSV can be
# uploaded into the runtime's working directory before it is read below.
from google.colab import files

uploads = files.upload()

In [2]:
import numpy as np
import pandas as pd

# Raw Netflix catalogue; every later cell derives its data from `df`.
df = pd.read_csv('netflix_titles.csv')

In [None]:
# First look at the raw data: overall size and the Movie / TV Show split.
print(f"shape: {df.shape}")
print("\nContent type Distribution")
print(df['type'].value_counts())

# The preview must be the cell's LAST expression: Jupyter only renders the
# value of the final expression, so a bare `df.head()` placed before the
# prints was evaluated and silently thrown away.
df.head()

In [None]:
# ---- Clean the data ---------------------------------------------------------
# Work on a copy so the raw frame `df` is left untouched.
df_ml = df.copy()

# Replace missing values with explicit placeholders (columns chosen from the EDA).
placeholders = {
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Unknown',
    'rating': 'Unknown',
    'duration': '0',
}
for column, placeholder in placeholders.items():
    df_ml[column] = df_ml[column].fillna(placeholder)

# Parse `date_added` (whitespace stripped first); values that cannot be parsed
# become NaT because of errors="coerce", and therefore NaN in `year_added`.
added_on = pd.to_datetime(df_ml['date_added'].str.strip(), errors="coerce")
df_ml['date_added'] = added_on
df_ml['year_added'] = added_on.dt.year

# `duration` is deliberately NOT turned into a feature: movies are measured in
# minutes and TV shows in seasons, so it would leak the target (see the
# data-leakage notes in the write-up at the end of the notebook).

# ---- Build the feature matrix -----------------------------------------------
year_added = df_ml['year_added']
features = pd.DataFrame({
    'release_year': df_ml['release_year'],
    # Rows without a usable date get the median year.
    # NOTE(review): the median is taken over the whole dataset, i.e. before the
    # train/test split performed in the next cell.
    'year_added': year_added.fillna(year_added.median()),
    # Count features: comma-separated lists, so items = commas + 1.
    'num_genres': df_ml['listed_in'].str.count(',').add(1),
    # NOTE(review): missing cast was filled with 'Unknown' above, so those
    # titles are counted as having one cast member.
    'num_cast': df_ml['cast'].str.count(',').add(1),
})

# ---- Target variable: Movie = 1, TV Show = 0 --------------------------------
target = df_ml['type'].eq('Movie').astype(int)

# ---- Sanity checks ----------------------------------------------------------
print("Features Created:")
print(features.head())
print("\nTarget (1=Movie, 0=Tv Show):")
print(target.value_counts())
print(f"\nMissing values: {features.isna().sum().sum()}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split the data: hold out 20% of the rows for testing (fixed seed so the split
# is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
print(f"Training: {len(X_train)} samples")
print(f"Testing: {len(X_test)} samples")

# Logistic regression, scored on the held-out rows.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression accuracy: {acc_lr:.2%}")



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest: 100 trees with a fixed seed, trained on the same split as the
# logistic regression and scored on the same held-out rows.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest accuracy: {acc_rf:.2%}")

# Per-class precision / recall / F1. The names are given in label order:
# 0 = Tv Show, 1 = Movie.
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=['Tv Show', 'Movie'],
)
print("\nRandom Forest Classification Report:")
print(rf_report)

In [None]:
# What's driving the predictions?
# Cell-local imports so this cell does not rely on names imported by earlier cells.
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Pair every feature column with the importance the fitted forest (`rf`)
# assigned to it, most important first.
feature_importance = pd.DataFrame({
    'Feature': features.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("Feature Importance (Random Forest):")
print(feature_importance)

# Check if one feature dominates: train a forest with the same settings on the
# top-ranked feature alone and score it on the same held-out rows.
# (Previously the header below was printed but no accuracy was ever computed.)
print("\nTop feature alone accuracy:")
top_feature = feature_importance.iloc[0]['Feature']
print(f"Top feature: {top_feature}")
rf_top = RandomForestClassifier(n_estimators=100, random_state=42)
# Double brackets keep the input two-dimensional (a one-column DataFrame).
rf_top.fit(X_train[[top_feature]], y_train)
acc_top = accuracy_score(y_test, rf_top.predict(X_test[[top_feature]]))
print(f"Accuracy using only {top_feature}: {acc_top:.2%}")

# Visualize
feature_importance.plot(x='Feature', y='Importance', kind='barh', figsize=(10, 6))
plt.title('Feature Importance for Movie vs TV Show')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

## Netflix Content Type Prediction

### Problem
Predict whether Netflix content is a Movie or TV Show based on metadata

### Data Leakage Lessons Learned
1. **Duration leak:** Movies=minutes, TV=seasons (100% accuracy = cheating)
2. **Director leak:** TV shows often don't list directors (95% accuracy = metadata artifact)

### Fair Model Results
Using only content characteristics:
- Release year
- Year added to Netflix  
- Number of genres
- Cast size

- **Random Forest: 71.91% accuracy** ✓
- **Logistic Regression: 69.01% accuracy**

### Insights
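- The modest accuracy suggests that Movies and TV Shows have similar patterns in these features
- Director presence was the best predictor (but unfair)
- Content metadata alone provides limited distinction
- Baseline (always predict Movie): 69.6% — Random Forest is only about 2 percentage points above this, and Logistic Regression is slightly below it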

### Feature Importance (Fair Model)
1. num_cast (cast size varies between types)
2. release_year (content age patterns)
3. num_genres (genre diversity)
4. year_added (Netflix acquisition timing)

## Feature Selection Notes

### Avoided Data Leakage
- **Duration removed:** Movies measured in minutes, TV shows in seasons (would give 100% accuracy)
- **Director presence removed:** TV shows systematically lack director metadata (would give 95% accuracy)

These features would create artificially high accuracy by exploiting data formatting rather than content patterns.

### Final Features (Fair Prediction)
Only content-based characteristics used:
- Release year, Year added, Number of genres, Cast size