In [None]:
# Import necessary libraries for data analysis, ML, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import ML components: data splitting, preprocessing, models, and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Set the default figure size for every plot created later in the notebook.
# plt.figure(figsize=...) would only open one empty figure here; rcParams is
# the setting that later plt.plot / sns.* calls actually pick up.
plt.rcParams['figure.figsize'] = (10, 6)

# Load Netflix dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/netflix_titles.csv')
df.head()

In [None]:
# Level 2 - Task 1: Regression to predict content duration
# Keep only the columns used by this task and drop rows with any missing value.
# .copy() makes reg_df an independent frame, so the column assignments below
# never write into (or warn about) the raw `df`.
reg_df = df[['duration', 'release_year', 'rating', 'type', 'listed_in']].dropna().copy()

# Extract the numeric part of duration (remove units like "min" or "Seasons").
# The raw string avoids the invalid "\d" escape warning, and expand=False
# returns a Series instead of a one-column DataFrame.
# NOTE(review): the number is in minutes for some rows and in seasons for
# others, so the regression target mixes two units.
reg_df['duration_num'] = (
    reg_df['duration'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Encode categorical variable: Movie=0, TV Show=1
# (any other label would become NaN because .map() has no entry for it)
reg_df['type'] = reg_df['type'].map({'Movie': 0, 'TV Show': 1})

# One-hot encode rating categories to make them compatible with ML models;
# drop_first=True removes one dummy column to avoid a redundant feature.
reg_df = pd.get_dummies(reg_df, columns=['rating'], drop_first=True)

# Display prepared data
reg_df.head()

In [None]:
# Separate features and target variable.
# `duration` is the raw text form of the target, `duration_num` is the target
# itself, and `listed_in` is never encoded in this notebook, so all three are
# excluded from the feature matrix.
X = reg_df.drop(columns=['duration', 'duration_num', 'listed_in'])
y = reg_df['duration_num']

# Split data into training and testing sets (80-20 split).
# train_test_split, LinearRegression, mean_squared_error and r2_score are all
# imported once in the import cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and train the Linear Regression baseline
lr = LinearRegression()
lr.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = lr.predict(X_test)

# Evaluate model using Mean Squared Error and R² score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display results as the cell output: (MSE, R²)
mse, r2

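In this task, a regression model was developed to predict the numeric duration of Netflix titles from their release year, content type (Movie or TV Show), and rating. Note that the duration value is extracted from strings with different units ("min" or "Seasons"), so the target is not on a single scale.
Linear Regression was used as a baseline model, and its performance was evaluated using Mean Squared Error (MSE) and the R² score.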

In [None]:
# Task 2: Classification - Predict Movie vs TV Show
# Exploratory Data Analysis (EDA)
# Column names, dtypes and non-null counts of the raw dataset
df.info()

# Count distribution of content types.
# A bare expression in the middle of a cell is discarded (only the last
# expression is displayed), so the counts are printed explicitly.
print(df['type'].value_counts())

# Visualize the same distribution as a bar chart
sns.countplot(x='type', data=df)
plt.show()

In [None]:
# Data cleaning and preprocessing
# Select the features used for classification and drop rows with missing
# values in one chained step. dropna(inplace=True) on a column slice can raise
# SettingWithCopyWarning on pandas versions without copy-on-write; the
# chained form plus .copy() yields an independent frame instead.
# NOTE: this rebinds `df`, so the raw dataset is no longer available under
# that name, and re-running this cell without re-running the load cell fails
# because `duration` is already numeric.
df = df[['type', 'release_year', 'duration', 'rating', 'country']].dropna().copy()

# Extract numeric duration from duration column.
# Raw string avoids the invalid "\d" escape warning; expand=False returns a
# Series rather than a one-column DataFrame.
df['duration'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Preview the first five rows of the cleaned classification frame
df.head(5)

In [None]:
# Turn every categorical column into integer codes.
# A single LabelEncoder is re-fitted per column, so after the loop `le` only
# remembers the classes of the last column ('country').
le = LabelEncoder()

# Order matters only for what `le` holds at the end: type, then rating, then country.
# LabelEncoder sorts its classes, so with the labels 'Movie' and 'TV Show'
# the target becomes Movie=0, TV Show=1.
for column in ('type', 'rating', 'country'):
    df[column] = le.fit_transform(df[column])

In [None]:
# Target separation and feature scaling
# Target: the encoded content type; features: every remaining column
y = df['type']
X = df.drop(columns='type')

# Standardize each feature to mean=0 and std=1.
# Fitting and transforming are written as two explicit steps.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split into training and testing sets (80-20 split) BEFORE scaling.
# Fitting the scaler on all rows and splitting afterwards lets test-set
# statistics (mean/std) leak into training, so the unscaled features `X` are
# split here and the scaler is fitted on the training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn mean/std from the training rows, then apply the same transform to the
# test rows. X_train / X_test remain scaled NumPy arrays for the model cells.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Fit a Logistic Regression classifier (default hyper-parameters) on the
# training split; fit() returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Report accuracy, precision and recall for the test predictions.
# Each entry pairs the printed label with the scikit-learn metric function.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(label, metric(y_test, y_pred))

# Confusion matrix is the cell output (rows: true class, columns: predicted)
confusion_matrix(y_test, y_pred)

In [None]:
# ROC curve: how well the predicted probabilities separate the two classes.
# Column 1 of predict_proba holds the probability of the positive class.
y_prob = model.predict_proba(X_test)[:, 1]

# False positive rate / true positive rate over all thresholds, and the
# area under that curve (thresholds themselves are not needed).
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Draw on the current axes via the object-oriented interface
ax = plt.gca()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
ax.plot([0, 1], [0, 1], linestyle='--')  # diagonal = random-guess baseline
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

In [None]:
# Train Random Forest model for comparison.
# random_state is fixed (same seed as the splits and K-Means in this notebook)
# so the reported accuracy is reproducible across runs; without it every run
# grows a different forest.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Make predictions and evaluate on the same test split as Logistic Regression
rf_pred = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

Title

Classification of Netflix Content Using Logistic Regression

Objective

The objective of this project is to build a classification model using Logistic Regression to predict whether a Netflix title is a Movie or a TV Show.

Dataset Description

A publicly available Netflix dataset containing information such as release year, duration, rating, and country was used.

Methodology

Data cleaning and preprocessing

Encoding categorical variables

Feature scaling

Train-test split

Logistic Regression model training

Performance evaluation using accuracy, precision, recall, and ROC curve

Comparison with Random Forest classifier

Results
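Logistic Regression achieved good classification accuracy. Note that the numeric `duration` feature is extracted from values with different units ("min" or "Seasons"), so it is likely to separate Movies from TV Shows almost on its own; the scores should be interpreted with that in mind.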

ROC-AUC score indicated strong class separation.

Random Forest slightly outperformed Logistic Regression but lacked interpretability.

Conclusion

Logistic Regression proved effective for classifying Netflix content types. The model is simple, interpretable, and suitable for binary classification tasks. Future improvements can include feature engineering and advanced NLP techniques.


In [None]:
# Task 3: Clustering - Group Netflix content based on characteristics
# Data preprocessing for clustering
# By this point `df` has already been cleaned and label-encoded by the
# classification task (duration is an int column), so calling .str.extract on
# it raises AttributeError. Rebuild the frame from the raw CSV so this section
# works on a fresh Restart & Run All and can be re-run on its own.
cluster_cols = ['release_year', 'duration', 'rating', 'country']

# Select the clustering features and remove rows with missing values;
# .copy() gives an independent frame for the assignments below.
df = pd.read_csv('../data/netflix_titles.csv')[cluster_cols].dropna().copy()

# Extract numeric duration (raw string avoids the invalid "\d" escape warning;
# expand=False returns a Series rather than a one-column DataFrame)
df['duration'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Replace the categorical columns with integer codes.
# One LabelEncoder is re-fitted per column; afterwards `le` holds the classes
# of the last column processed ('country').
le = LabelEncoder()
for column in ('rating', 'country'):
    df[column] = le.fit_transform(df[column])

In [None]:
# Scale features for clustering (important for K-Means, which is distance based).
# The feature columns are selected explicitly: a later cell adds a 'Cluster'
# column to `df`, and scaling the whole frame on a re-run would silently feed
# the cluster labels back in as a feature.
feature_cols = ['release_year', 'duration', 'rating', 'country']
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[feature_cols])

In [None]:
# Elbow method: record the WCSS (Within-Cluster Sum of Squares, exposed by
# scikit-learn as `inertia_`) for k = 1..10 so the bend in the curve can be
# used to choose the number of clusters.
wcss = []

for k in range(1, 11):
    # fit() returns the fitted estimator, so construction and fitting chain
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve: WCSS against the number of clusters tried (1..10)
ax = plt.gca()
ax.plot(range(1, 11), wcss, marker='o')
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("WCSS")
ax.set_title("Elbow Method for Optimal K")
plt.show()

# Final K-Means model with the chosen number of clusters (k=3).
# Fitting and reading `labels_` gives one cluster label per row of scaled_data.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_data)
clusters = kmeans.labels_

# Attach the cluster label of every row to the dataframe and preview it
df['Cluster'] = clusters
df.head()

In [None]:
# Project the scaled features onto their first two principal components so the
# clusters can be drawn in 2D.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# Scatter plot of the two components, one color per cluster label
fig = plt.figure(figsize=(8,6))
ax = fig.gca()
ax.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters)
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_title("Netflix Content Clustering")
plt.show()

In [None]:
# Per-cluster profile: average of every feature within each cluster.
# NOTE(review): rating and country are label-encoded integers, so their means
# are averages of arbitrary codes rather than meaningful quantities.
df.groupby('Cluster').agg('mean')

Title

Clustering Netflix Content Using K-Means

Objective

To group Netflix titles into meaningful clusters using unsupervised learning techniques.

Methodology

Data cleaning and preprocessing

Feature encoding and scaling

Optimal cluster selection using Elbow Method

K-Means clustering

Dimensionality reduction using PCA for visualization

Results

Netflix content was grouped into three clusters.

Each cluster represents different content patterns based on release year, duration, rating, and country.

PCA visualization clearly shows cluster separation.

Conclusion

K-Means clustering effectively segmented Netflix content into meaningful groups. This approach can be used for recommendation systems and content analysis.