# Day 1: Setup & Web Scraping

In [2]:
pip install pandas

Collecting pandasNote: you may need to restart the kernel to use updated packages.

  Using cached pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp39-cp39-win_amd64.whl (11.6 MB)
Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
Using cached pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.0.2 pandas-2.2.3 pytz-2025.1 tzdata-2025.1


In [3]:
import requests
import pandas as pd
import time
import json

# Root URL of the Jikan API v4 (unofficial MyAnimeList API).
# Endpoint paths such as "/anime" are appended to it by the fetch helpers.
base_url = "https://api.jikan.moe/v4"

def get_anime_data(page=1, limit=25, timeout=10):
    """Fetch one page of anime from the Jikan ``/anime`` endpoint, ordered by popularity.

    Args:
        page: Page number to request (the notebook starts at 1).
        limit: Maximum number of entries to request per page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` on any other status code, on a network/timeout failure, or
        when the body is not valid JSON; a short ``Error: ...`` message is
        printed in each of those cases.
    """
    endpoint = f"{base_url}/anime"
    params = {
        'page': page,
        'limit': limit,
        'order_by': 'popularity',
        'sort': 'asc'  # Most popular first
    }

    try:
        response = requests.get(endpoint, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as bad
        # status codes, so callers only ever need to check for None.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not valid JSON.
        print(f"Error: {exc}")
        return None

# Download pages 1 through 5 of the popularity-ordered listing
# (default page size is 25, so at most 125 anime in total).
all_anime = []
for page_number in range(1, 6):
    print(f"Fetching page {page_number}...")
    payload = get_anime_data(page=page_number)

    if not payload or 'data' not in payload:
        print("Failed to fetch data")
    else:
        all_anime += payload['data']

    # Pause half a second between requests to stay within the API rate limit
    time.sleep(0.5)

print(f"Total anime fetched: {len(all_anime)}")

def _names(entries):
    """Return the ``name`` value of every entry in a list of API sub-objects."""
    return [entry['name'] for entry in entries]


def _anime_record(entry):
    """Flatten one raw API anime entry into a single table row (a dict)."""
    return {
        'id': entry['mal_id'],
        'title': entry['title'],
        'english_title': entry.get('title_english'),
        'type': entry['type'],
        'episodes': entry['episodes'],
        'status': entry['status'],
        'airing': entry['airing'],
        'aired_from': entry['aired']['from'],
        'aired_to': entry['aired']['to'],
        'duration': entry['duration'],
        'rating': entry['rating'],
        'score': entry['score'],
        'scored_by': entry['scored_by'],
        'rank': entry['rank'],
        'popularity': entry['popularity'],
        'members': entry['members'],
        'favorites': entry['favorites'],
        'synopsis': entry['synopsis'],
        'season': entry['season'],
        'year': entry['year'],
        # Nested lists of objects are reduced to plain lists of names
        'studios': _names(entry['studios']),
        'genres': _names(entry['genres']),
        'themes': _names(entry['themes']),
    }


# Build the table: one row per fetched anime
anime_df = pd.DataFrame([_anime_record(entry) for entry in all_anime])

# Write the table to anime_data.csv without the index column
anime_df.to_csv('anime_data.csv', index=False)

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Total anime fetched: 125


In [4]:
def get_anime_reviews(anime_id, page=1, timeout=10):
    """Fetch one page of reviews for a single anime from the Jikan API.

    Args:
        anime_id: Identifier of the anime; interpolated into the
            ``/anime/{anime_id}/reviews`` endpoint path.
        page: Page number of the review listing to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` on any other status code, on a network/timeout failure, or
        when the body is not valid JSON; a short ``Error: ...`` message is
        printed in each of those cases.
    """
    endpoint = f"{base_url}/anime/{anime_id}/reviews"
    params = {'page': page}

    try:
        response = requests.get(endpoint, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as bad
        # status codes, so callers only ever need to check for None.
        print(f"Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not valid JSON.
        print(f"Error: {exc}")
        return None

# Column layout of the reviews table. Fixing it up front guarantees the CSV
# gets a header row even when no reviews were collected; an empty frame built
# from an empty list would otherwise have no columns at all.
REVIEW_COLUMNS = ['anime_id', 'user_username', 'score', 'review']

# Get reviews for the first 20 rows of anime_df (fetched in popularity order).
# Only the first page of reviews is requested for each anime.
reviews_data = []
for anime_id in anime_df['id'].head(20):
    print(f"Fetching reviews for anime ID: {anime_id}")
    reviews = get_anime_reviews(anime_id)

    # A failed request returns None; such anime are skipped
    if reviews and 'data' in reviews:
        for review in reviews['data']:
            reviews_data.append({
                'anime_id': anime_id,
                'user_username': review['user']['username'],
                'score': review['score'],
                'review': review['review']
            })

    # Respect rate limiting
    time.sleep(1)

reviews_df = pd.DataFrame(reviews_data, columns=REVIEW_COLUMNS)
reviews_df.to_csv('anime_reviews.csv', index=False)

Fetching reviews for anime ID: 16498
Fetching reviews for anime ID: 1535
Fetching reviews for anime ID: 5114
Fetching reviews for anime ID: 30276
Fetching reviews for anime ID: 38000
Fetching reviews for anime ID: 11757
Fetching reviews for anime ID: 31964
Fetching reviews for anime ID: 11061
Fetching reviews for anime ID: 20
Fetching reviews for anime ID: 22319
Fetching reviews for anime ID: 32281
Fetching reviews for anime ID: 25777
Fetching reviews for anime ID: 40748
Fetching reviews for anime ID: 9253
Fetching reviews for anime ID: 33486
Fetching reviews for anime ID: 1735
Fetching reviews for anime ID: 21
Fetching reviews for anime ID: 35760
Fetching reviews for anime ID: 19815
Fetching reviews for anime ID: 28851
