# Day 1: Setup & Web Scraping

In [2]:
pip install pandas

SyntaxError: invalid syntax (3736126394.py, line 1)

In [3]:
pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import pandas as pd
import time
import json

# Root URL of the Jikan API v4 (the unofficial MyAnimeList API).
# Endpoint paths such as "/anime" are appended to it by the fetch helpers.
base_url = "https://api.jikan.moe/v4"

def get_anime_data(page=1, limit=25, timeout=10):
    """Fetch one page of anime from the Jikan API, ordered by popularity.

    Args:
        page: Page number sent as the ``page`` query parameter.
        limit: Page size sent as the ``limit`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, or None when the request fails
        (network error, timeout, non-200 status, or a body that is not JSON).
        A short error message is printed in every failure case.
    """
    endpoint = f"{base_url}/anime"
    params = {
        'page': page,
        'limit': limit,
        'order_by': 'popularity',
        'sort': 'asc'  # Most popular first
    }

    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        response = requests.get(endpoint, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors: print + None.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        # requests' JSON decode errors derive from ValueError.
        print("Error: response body is not valid JSON")
        return None

# Download pages 1 through 5 of the listing (125 anime at 25 per page)
all_anime = []
for page in range(1, 6):
    print(f"Fetching page {page}...")
    data = get_anime_data(page=page)

    if not data or 'data' not in data:
        print("Failed to fetch data")
    else:
        all_anime += data['data']

    # Pause half a second after every call to respect the API rate limit
    # (2 requests per second)
    time.sleep(0.5)

print(f"Total anime fetched: {len(all_anime)}")

def _anime_row(item):
    """Flatten one API anime record into a single-level dict (one table row).

    Keys are inserted in the order the columns should appear in the frame.
    """
    row = {
        'id': item['mal_id'],
        'title': item['title'],
        'english_title': item.get('title_english'),
    }
    for key in ('type', 'episodes', 'status', 'airing'):
        row[key] = item[key]

    # The airing dates live in a nested mapping
    row['aired_from'] = item['aired']['from']
    row['aired_to'] = item['aired']['to']

    for key in ('duration', 'rating', 'score', 'scored_by', 'rank',
                'popularity', 'members', 'favorites', 'synopsis',
                'season', 'year'):
        row[key] = item[key]

    # Reduce each list of nested records to a plain list of names
    for key in ('studios', 'genres', 'themes'):
        row[key] = [entry['name'] for entry in item[key]]

    return row


# Turn the fetched records into a table
anime_df = pd.DataFrame(list(map(_anime_row, all_anime)))

# Persist the table for the analysis cells
anime_df.to_csv('anime_data.csv', index=False)

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Total anime fetched: 125


In [4]:
def get_anime_reviews(anime_id, page=1, timeout=10):
    """Fetch one page of reviews for a specific anime from the Jikan API.

    Args:
        anime_id: Anime identifier inserted into the endpoint path.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200, or None when the request fails
        (network error, timeout, non-200 status, or a body that is not JSON).
        A short error message is printed in every failure case.
    """
    endpoint = f"{base_url}/anime/{anime_id}/reviews"
    params = {'page': page}

    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        response = requests.get(endpoint, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors: print + None.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        # requests' JSON decode errors derive from ValueError.
        print("Error: response body is not valid JSON")
        return None

# Collect the first page of reviews for the first 20 anime in the frame
reviews_data = []
for anime_id in anime_df['id'].head(20):
    print(f"Fetching reviews for anime ID: {anime_id}")
    reviews = get_anime_reviews(anime_id)

    has_reviews = bool(reviews) and 'data' in reviews
    if has_reviews:
        # One flat record per review, tagged with the anime it belongs to
        reviews_data += [
            {
                'anime_id': anime_id,
                'user_username': review['user']['username'],
                'score': review['score'],
                'review': review['review'],
            }
            for review in reviews['data']
        ]

    # Wait one second between calls to respect rate limiting
    time.sleep(1)

reviews_df = pd.DataFrame(reviews_data)
reviews_df.to_csv('anime_reviews.csv', index=False)

Fetching reviews for anime ID: 16498
Fetching reviews for anime ID: 1535
Fetching reviews for anime ID: 5114
Fetching reviews for anime ID: 30276
Fetching reviews for anime ID: 38000
Fetching reviews for anime ID: 11757
Fetching reviews for anime ID: 31964
Fetching reviews for anime ID: 11061
Fetching reviews for anime ID: 20
Fetching reviews for anime ID: 22319
Fetching reviews for anime ID: 32281
Fetching reviews for anime ID: 25777
Fetching reviews for anime ID: 40748
Fetching reviews for anime ID: 9253
Fetching reviews for anime ID: 33486
Fetching reviews for anime ID: 1735
Fetching reviews for anime ID: 21
Fetching reviews for anime ID: 35760
Fetching reviews for anime ID: 19815
Fetching reviews for anime ID: 28851


In [26]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the CSV files written by the scraping cells
anime_df = pd.read_csv('anime_data.csv')
reviews_df = pd.read_csv('anime_reviews.csv')

# Basic data cleaning: force numeric columns to numbers; values that cannot
# be parsed become NaN instead of raising
anime_df['score'] = pd.to_numeric(anime_df['score'], errors='coerce')
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['year'] = pd.to_numeric(anime_df['year'], errors='coerce')

# Missing values: a missing score takes the column mean, a missing episode
# count becomes 0
anime_df['score'] = anime_df['score'].fillna(anime_df['score'].mean())
anime_df['episodes'] = anime_df['episodes'].fillna(0)

# The list-valued columns were written to CSV as text, so parse them back into
# Python lists. ast.literal_eval only accepts literals; unlike eval() it cannot
# execute code that happens to be present in the file.
anime_df['genres'] = anime_df['genres'].apply(ast.literal_eval)
anime_df['studios'] = anime_df['studios'].apply(ast.literal_eval)
anime_df['themes'] = anime_df['themes'].apply(ast.literal_eval)

# Quick summary stats
print("Dataset Stats:")
print(f"Total anime entries: {len(anime_df)}")
print(f"Year range: {anime_df['year'].min()} to {anime_df['year'].max()}")
print(f"Average score: {anime_df['score'].mean():.2f}")
print(f"Most common type: {anime_df['type'].value_counts().index[0]}")

Dataset Stats:
Total anime entries: 125
Year range: 1989.0 to 2022.0
Average score: 8.14
Most common type: TV


In [25]:
# Show each column's non-null count and dtype
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             125 non-null    int64  
 1   title          125 non-null    object 
 2   english_title  123 non-null    object 
 3   type           125 non-null    object 
 4   episodes       125 non-null    float64
 5   status         125 non-null    object 
 6   airing         125 non-null    bool   
 7   aired_from     125 non-null    object 
 8   aired_to       117 non-null    object 
 9   duration       125 non-null    object 
 10  rating         125 non-null    object 
 11  score          125 non-null    float64
 12  scored_by      125 non-null    int64  
 13  rank           125 non-null    int64  
 14  popularity     125 non-null    int64  
 15  members        125 non-null    int64  
 16  favorites      125 non-null    int64  
 17  synopsis       125 non-null    object 
 18  season    

In [7]:
# Display the reviews frame; the notebook elides the middle rows of a long frame
reviews_df

Unnamed: 0,anime_id,user_username,score,review
0,16498,Sorrowful,10,"Oh dear Shingeki no Kyojin, where do I even be..."
1,16498,Gladius650,10,I started to follow the manga after watching t...
2,16498,SonDavid,10,"In the 80's, Mobile Suit Gundam catapulted ani..."
3,16498,Kerma_,5,Shingeki no Kyojin... Where do I start? In sum...
4,16498,emberreviews,9,"Every once in a while, and even more frequentl..."
...,...,...,...,...
375,28851,DurandalAU,10,"Like any other slice-of-life anime, the plot i..."
376,28851,Takowoyaki,10,I was lucky enough to catch the very first scr...
377,28851,Splair,10,How am I after watching this movie? Angry. Thi...
378,28851,Zengaea,1,(Spoiler warning) Deafness or Deaf-Mute is a s...
