# Data Science Final Project

**Student Name:** Derek McCrary
**Dataset:** Anime Recommendation Database from Kaggle
**Date:** June 9, 2025

## Table of Contents
1. [Introduction and Dataset Overview](#introduction)
2. [Data Loading and Initial Exploration](#data-loading)
3. [Data Preprocessing and Cleaning](#preprocessing)
4. [Exploratory Data Analysis](#eda)
5. [Feature Engineering](#feature-engineering)
6. [Machine Learning Models](#models)
7. [Model Evaluation and Comparison](#evaluation)
8. [Conclusions and Insights](#conclusions)

## 1. Introduction and Dataset Overview <a id="introduction"></a>

### Dataset Description
This project analyzes the Anime Recommendation Database from Kaggle, which contains information about anime titles and user ratings. The dataset consists of two main files:
- **anime.csv**: Contains 12,294 anime with details like genre, type, episodes, rating, and member count
- **rating.csv**: Contains over 7 million user ratings for different anime titles

### Motivation
I chose this dataset because recommendation systems are fundamental to modern data science applications. Understanding user preferences and content characteristics can help build better recommendation engines for streaming platforms, similar to Netflix or Crunchyroll.

### Objectives
- [x] Perform comprehensive data exploration
- [x] Clean and preprocess the data
- [x] Create meaningful visualizations
- [x] Build and compare at least 2 ML models
- [x] Draw actionable insights

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

# Configuration
# Apply the matplotlib style sheet named 'seaborn-v0_8' to all later figures
plt.style.use('seaborn-v0_8')
# Make 'husl' the default seaborn colour palette (set after the style sheet,
# so the palette is not overwritten by it)
sns.set_palette('husl')
# None removes the column limit when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Suppress warnings
# NOTE(review): no category or module is given, so every warning raised from
# here on is hidden, including deprecation notices from the libraries above.
import warnings
warnings.filterwarnings('ignore')

## 2. Data Loading and Initial Exploration <a id="data-loading"></a>


In [None]:
# Read the two CSV files that make up the dataset
anime_df = pd.read_csv('../data/anime.csv')
rating_df = pd.read_csv('../data/rating.csv')

# Print shape, column names and a short preview for each frame.
# The banners are spelled out in full because only the second one starts
# with a blank line.
for banner, frame in (
    ('=== ANIME DATASET ===', anime_df),
    ('\n=== RATING DATASET ===', rating_df),
):
    print(banner)
    print(f'Dataset shape: {frame.shape}')
    print(f'Columns: {list(frame.columns)}')
    print('\nFirst few rows:')
    display(frame.head())

In [None]:
# Column dtypes and per-column missing-value counts for both frames
for banner, frame in (
    ('=== ANIME DATASET INFO ===', anime_df),
    ('\n=== RATING DATASET INFO ===', rating_df),
):
    print(banner)
    frame.info()
    print('\nMissing values:')
    missing_per_column = frame.isna().sum()
    print(missing_per_column)

# Distinct values that occur in the user-rating column, in ascending order
print('\nUnique values in rating column:')
distinct_ratings = rating_df['rating'].unique()
print(sorted(distinct_ratings))

## 3. Data Preprocessing and Cleaning <a id="preprocessing"></a>


In [None]:
# Clean anime dataset (work on a copy so anime_df itself stays untouched)
anime_clean = anime_df.copy()

# Handle missing ratings - replace with the overall median rating
anime_clean['rating'] = anime_clean['rating'].fillna(anime_clean['rating'].median())

# The episode count must be numeric before a median can be taken, and later
# cells compare it with 100 and apply log1p to it. Coerce anything that is not
# a number to NaN so it is imputed exactly like a missing value.
# NOTE(review): the Kaggle file reportedly stores unknown counts as the text
# 'Unknown' - confirm against the dtype shown by anime_df.info().
anime_clean['episodes'] = pd.to_numeric(anime_clean['episodes'], errors='coerce')

# Handle missing episodes - replace with the median episode count of titles
# that share the same 'type'
anime_clean['episodes'] = anime_clean['episodes'].fillna(
    anime_clean.groupby('type')['episodes'].transform('median')
)

# Number of comma-separated genres per title. A missing genre yields NaN here
# and is set to 0; such rows are still removed by the dropna() below because
# their 'genre' value itself remains missing.
anime_clean['genre_count'] = anime_clean['genre'].str.count(',') + 1
anime_clean['genre_count'] = anime_clean['genre_count'].fillna(0)

# Drop every row that still has a missing value in any column
anime_clean = anime_clean.dropna()

print(f'Cleaned anime dataset shape: {anime_clean.shape}')
print(f'Missing values after cleaning: {anime_clean.isnull().sum().sum()}')

In [None]:
# Clean rating dataset

# Remove -1 ratings (placeholder used for entries without an actual score).
# Boolean indexing already returns a new frame, so rating_df is not modified
# and no separate copy of the full table is needed.
rating_clean = rating_df[rating_df['rating'] != -1]
# Remember this size now: rating_clean is filtered again just below
rated_row_count = len(rating_clean)

# Keep only users with at least 10 ratings for better analysis
user_counts = rating_clean['user_id'].value_counts()
active_users = user_counts[user_counts >= 10].index
rating_clean = rating_clean[rating_clean['user_id'].isin(active_users)]

# Sample the dataset for computational efficiency: at most 500,000 rows
# (a fixed cap, not a percentage), with a fixed seed for reproducibility
rating_sample = rating_clean.sample(n=min(500000, len(rating_clean)), random_state=42)

print(f'Original rating dataset: {len(rating_df)} rows')
print(f'After removing -1 ratings: {rated_row_count} rows')
print(f'After keeping users with >= 10 ratings: {len(rating_clean)} rows')
print(f'Sample for analysis: {len(rating_sample)} rows')

## 4. Exploratory Data Analysis <a id="eda"></a>


In [None]:
# Descriptive statistics of the cleaned anime table
print('=== ANIME DATASET STATISTICS ===')
anime_summary = anime_clean.describe()
display(anime_summary)

# How many titles fall into each 'type' value
print('\n=== ANIME TYPE DISTRIBUTION ===')
titles_per_type = anime_clean['type'].value_counts()
print(titles_per_type)

# Descriptive statistics of the sampled user ratings
print('\n=== RATING DISTRIBUTION ===')
user_rating_summary = rating_sample['rating'].describe()
display(user_rating_summary)

In [None]:
# Visualization 1: 2x2 overview figure (ratings, user ratings, types, episodes)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_anime, ax_users), (ax_types, ax_episodes) = axes

# Top-left: histogram of the per-anime rating
ax_anime.hist(
    anime_clean['rating'],
    bins=30,
    alpha=0.7,
    color='skyblue',
    edgecolor='black',
)
ax_anime.set_title('Distribution of Anime Ratings')
ax_anime.set_xlabel('Rating')
ax_anime.set_ylabel('Frequency')

# Top-right: histogram of the sampled user ratings
ax_users.hist(
    rating_sample['rating'],
    bins=10,
    alpha=0.7,
    color='lightcoral',
    edgecolor='black',
)
ax_users.set_title('Distribution of User Ratings')
ax_users.set_xlabel('Rating')
ax_users.set_ylabel('Frequency')

# Bottom-left: share of each anime type as a pie chart
type_counts = anime_clean['type'].value_counts()
ax_types.pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')
ax_types.set_title('Distribution of Anime Types')

# Bottom-right: episodes against rating, limited to titles with at most
# 100 episodes so long-running outliers do not stretch the axis
at_most_100_episodes = anime_clean['episodes'] <= 100
scatter_data = anime_clean.loc[at_most_100_episodes]
ax_episodes.scatter(scatter_data['episodes'], scatter_data['rating'], alpha=0.6)
ax_episodes.set_title('Episodes vs Rating')
ax_episodes.set_xlabel('Number of Episodes')
ax_episodes.set_ylabel('Rating')

plt.tight_layout()
plt.savefig('../visualizations/anime_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Columns shown in both top-10 tables
summary_columns = ['name', 'rating', 'type', 'episodes', 'members']

# Ten titles with the highest rating
top_rated = anime_clean.nlargest(10, 'rating')[summary_columns]
print('=== TOP 10 HIGHEST RATED ANIME ===')
display(top_rated)

# Ten titles with the largest member count
most_popular = anime_clean.nlargest(10, 'members')[summary_columns]
print('\n=== TOP 10 MOST POPULAR ANIME (by members) ===')
display(most_popular)

In [None]:
# Genre analysis
import re

# Flatten the comma-separated 'genre' strings into one list of genre names,
# stripping the whitespace around each name
all_genres = [
    label.strip()
    for entry in anime_clean['genre'].dropna()
    for label in str(entry).split(',')
]

# Occurrences per genre, keeping the 15 most frequent
genre_counts = pd.Series(all_genres).value_counts().head(15)

# Bar chart of those 15 genres
plt.figure(figsize=(12, 6))
genre_counts.plot.bar(color='steelblue')
plt.title('Top 15 Most Common Anime Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('../visualizations/genre_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

top_ten_genres = genre_counts.head(10)
print('Top 10 Genres:')
print(top_ten_genres)

## 5. Feature Engineering <a id="feature-engineering"></a>


In [None]:
# Attach the anime attributes to every sampled user rating. Both inputs carry
# a 'rating' column, so the merge suffixes them: rating_x comes from
# rating_sample (left side) and rating_y from anime_clean (right side).
merged_data = pd.merge(rating_sample, anime_clean, on='anime_id', how='inner')

# Encoder for the categorical 'type' column; kept so the mapping stays available
le_type = LabelEncoder()

# Derived columns, added to a copy of the merged frame:
#   type_encoded     - integer code of 'type'
#   high_rating      - binary target, 1 when rating_y >= 8, else 0
#   log_episodes     - log1p of the episode count (reduces skew)
#   log_members      - log1p of the member count (reduces skew)
#   popularity_score - rating_y multiplied by log1p(members)
modeling_data = merged_data.assign(
    type_encoded=lambda df: le_type.fit_transform(df['type']),
    high_rating=lambda df: (df['rating_y'] >= 8).astype(int),
    log_episodes=lambda df: np.log1p(df['episodes']),
    log_members=lambda df: np.log1p(df['members']),
    popularity_score=lambda df: df['rating_y'] * np.log1p(df['members']),
)

high_rating_counts = modeling_data['high_rating'].value_counts()
print(f'Merged dataset shape: {modeling_data.shape}')
print(f'High rating distribution: {high_rating_counts}')

In [None]:
# Model inputs: encoded type, log-scaled episodes and members, genre count,
# and rating_x (the 'rating' column that came from rating_sample in the merge)
feature_columns = ['type_encoded', 'log_episodes', 'log_members', 'genre_count', 'rating_x']

# Target vector and an independent copy of the feature columns
y = modeling_data['high_rating']
X = modeling_data.loc[:, feature_columns].copy()

# Any value still missing is replaced by the median of its column
column_medians = X.median()
X = X.fillna(column_medians)

print('Features for modeling:')
print(feature_columns)
print(f'\nFeature matrix shape: {X.shape}')
print('Target variable distribution:')
class_shares = y.value_counts(normalize=True)
print(class_shares)

## 6. Machine Learning Models <a id="models"></a>


In [None]:
# Split the data by anime, not by row.
# modeling_data holds one row per (user, anime) rating, so every anime appears
# many times with identical anime-level features and the identical high_rating
# label. A row-wise random split would place the same anime in both sets and
# let the models score well by memorising titles. Grouping on anime_id keeps
# all rows of one anime on the same side of the split.
from sklearn.model_selection import GroupShuffleSplit

# X was built from modeling_data, so its index selects the matching anime ids
anime_groups = modeling_data.loc[X.index, 'anime_id']

# test_size is the share of groups (anime), so the share of rows in the test
# set can deviate from 20%. GroupShuffleSplit has no stratify option; the
# class distribution printed below shows the resulting balance.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_positions, test_positions = next(group_splitter.split(X, y, groups=anime_groups))

X_train, X_test = X.iloc[train_positions], X.iloc[test_positions]
y_train, y_test = y.iloc[train_positions], y.iloc[test_positions]

# Scale features: statistics are learned from the training rows only and then
# applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')
print(f'Training set class distribution:')
print(y_train.value_counts(normalize=True))

In [None]:
# Model 1: Random Forest Classifier (100 trees, all CPU cores, fixed seed),
# fitted on the unscaled training features
print('=== TRAINING RANDOM FOREST ===')
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Hard class predictions and the probability of the positive class (column 1)
rf_pred = rf_model.predict(X_test)
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Hold-out accuracy and per-class precision/recall/F1
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')
print('\nClassification Report:')
print(rf_report)

# Importance of each feature according to the fitted forest, largest first
feature_importance_rf = pd.DataFrame(
    {'feature': feature_columns, 'importance': rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)

print('Feature Importance (Random Forest):')
print(feature_importance_rf)

In [None]:
# Model 2: Gradient Boosting Classifier (100 boosting stages, fixed seed),
# fitted on the unscaled training features
print('=== TRAINING GRADIENT BOOSTING ===')
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions and the probability of the positive class (column 1)
gb_pred = gb_model.predict(X_test)
gb_pred_proba = gb_model.predict_proba(X_test)[:, 1]

# Hold-out accuracy and per-class precision/recall/F1
gb_accuracy = accuracy_score(y_test, gb_pred)
gb_report = classification_report(y_test, gb_pred)
print(f'Gradient Boosting Accuracy: {gb_accuracy:.4f}')
print('\nClassification Report:')
print(gb_report)

# Importance of each feature according to the fitted model, largest first
feature_importance_gb = pd.DataFrame(
    {'feature': feature_columns, 'importance': gb_model.feature_importances_}
).sort_values(by='importance', ascending=False)

print('Feature Importance (Gradient Boosting):')
print(feature_importance_gb)

## 7. Model Evaluation and Comparison <a id="evaluation"></a>


In [None]:
# Model comparison: one row per model, one column per metric
from sklearn.metrics import precision_score, recall_score, f1_score

# Test-set predictions keyed by the model's display name
predictions_by_model = {
    'Random Forest': rf_pred,
    'Gradient Boosting': gb_pred,
}

# Column label -> scoring function; insertion order fixes the column order
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}

models_comparison = pd.DataFrame({
    'Model': list(predictions_by_model),
    **{
        label: [score(y_test, pred) for pred in predictions_by_model.values()]
        for label, score in metric_functions.items()
    },
})

print('=== MODEL COMPARISON ===')
display(models_comparison)

# The model with the highest F1-Score counts as the best one
best_model_idx = models_comparison['F1-Score'].idxmax()
best_model_name = models_comparison.loc[best_model_idx, 'Model']
print(f'\nBest performing model: {best_model_name}')

In [None]:
# Confusion matrices of both models, drawn side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

cm_rf = confusion_matrix(y_test, rf_pred)
cm_gb = confusion_matrix(y_test, gb_pred)

# (matrix, colour map, panel title) for the left and right panel
heatmap_panels = (
    (cm_rf, 'Blues', 'Random Forest Confusion Matrix'),
    (cm_gb, 'Greens', 'Gradient Boosting Confusion Matrix'),
)

for panel_ax, (matrix, colour_map, panel_title) in zip(axes, heatmap_panels):
    # annot + fmt='d' writes the integer count into every cell
    sns.heatmap(matrix, annot=True, fmt='d', cmap=colour_map, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')

plt.tight_layout()
plt.savefig('../visualizations/confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Feature importance of both models as horizontal bar charts, side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (importance table, bar colour, panel title) for the left and right panel
importance_panels = (
    (feature_importance_rf, 'skyblue', 'Random Forest Feature Importance'),
    (feature_importance_gb, 'lightcoral', 'Gradient Boosting Feature Importance'),
)

for panel_ax, (importance_table, bar_colour, panel_title) in zip(axes, importance_panels):
    panel_ax.barh(
        importance_table['feature'],
        importance_table['importance'],
        color=bar_colour,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Importance')

plt.tight_layout()
plt.savefig('../visualizations/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Conclusions and Insights <a id="conclusions"></a>


In [None]:
# Figures computed from the objects built in the cells above
anime_total = len(anime_clean)
ratings_total = len(rating_sample)
mean_anime_rating = anime_clean['rating'].mean()
most_common_type = anime_clean['type'].mode()[0]
top_genre = genre_counts.index[0]
best_accuracy = models_comparison.loc[best_model_idx, 'Accuracy']

summary_lines = [
    '=== PROJECT SUMMARY ===',
    f'Total anime analyzed: {anime_total}',
    f'Total user ratings analyzed: {ratings_total}',
    f'Average anime rating: {mean_anime_rating:.2f}',
    f'Most common anime type: {most_common_type}',
    f'Most popular genre: {top_genre}',
    f'Best performing model: {best_model_name}',
    f'Best model accuracy: {best_accuracy:.4f}',
]

# NOTE(review): the insight and recommendation lines below are fixed text;
# none of them is derived from the values computed above.
insight_lines = [
    '=== KEY INSIGHTS ===',
    '1. TV series dominate the anime landscape',
    '2. Comedy is the most popular genre',
    '3. User ratings strongly predict anime quality',
    '4. Number of members (popularity) is a key feature',
    '5. Episode count has moderate influence on ratings',
]

recommendation_lines = [
    '=== RECOMMENDATIONS ===',
    '1. Focus on popular genres for new content',
    '2. User ratings are reliable indicators of quality',
    '3. TV series format is preferred by audiences',
    '4. Community size (members) correlates with success',
]

# One line per entry, in the order summary -> insights -> recommendations
for section in (summary_lines, insight_lines, recommendation_lines):
    print('\n'.join(section))