# day 1 set up & web scraping

In [1]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import pandas as pd
import time
import json

# Using Jikan API v4 (MyAnimeList unofficial API)
base_url = "https://api.jikan.moe/v4"

def get_anime_data(page=1, limit=25):
    """Get anime data from Jikan API with pagination"""
    endpoint = f"{base_url}/anime"
    params = {
        'page': page,
        'limit': limit,
        'order_by': 'popularity',
        'sort': 'asc'  # Most popular first
    }
    
    response = requests.get(endpoint, params=params)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return None

# Fetch multiple pages (be mindful of rate limits)
all_anime = []
for page in range(1, 6):  # Fetching 5 pages (125 anime)
    print(f"Fetching page {page}...")
    data = get_anime_data(page=page)
    
    if data and 'data' in data:
        all_anime.extend(data['data'])
    else:
        print("Failed to fetch data")
        
    # Respect rate limiting (2 requests per second)
    time.sleep(0.5)

print(f"Total anime fetched: {len(all_anime)}")

# Convert to DataFrame
anime_df = pd.DataFrame([{
    'id': item['mal_id'],
    'title': item['title'],
    'english_title': item.get('title_english'),
    'type': item['type'],
    'episodes': item['episodes'],
    'status': item['status'],
    'airing': item['airing'],
    'aired_from': item['aired']['from'],
    'aired_to': item['aired']['to'],
    'duration': item['duration'],
    'rating': item['rating'],
    'score': item['score'],
    'scored_by': item['scored_by'],
    'rank': item['rank'],
    'popularity': item['popularity'],
    'members': item['members'],
    'favorites': item['favorites'],
    'synopsis': item['synopsis'],
    'season': item['season'],
    'year': item['year'],
    'studios': [studio['name'] for studio in item['studios']],
    'genres': [genre['name'] for genre in item['genres']],
    'themes': [theme['name'] for theme in item['themes']]
} for item in all_anime])

# Save to CSV
anime_df.to_csv('anime_data.csv', index=False)

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Total anime fetched: 125


In [4]:
def get_anime_reviews(anime_id, page=1):
    """Get reviews for a specific anime"""
    endpoint = f"{base_url}/anime/{anime_id}/reviews"
    params = {'page': page}
    
    response = requests.get(endpoint, params=params)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        return None

# Get reviews for the top 20 anime
reviews_data = []
for anime_id in anime_df['id'].head(20):
    print(f"Fetching reviews for anime ID: {anime_id}")
    reviews = get_anime_reviews(anime_id)
    
    if reviews and 'data' in reviews:
        for review in reviews['data']:
            reviews_data.append({
                'anime_id': anime_id,
                'user_username': review['user']['username'],
                'score': review['score'],
                'review': review['review']
            })
    
    # Respect rate limiting
    time.sleep(1)

reviews_df = pd.DataFrame(reviews_data)
reviews_df.to_csv('anime_reviews.csv', index=False)

Fetching reviews for anime ID: 61443
Fetching reviews for anime ID: 16498
Fetching reviews for anime ID: 1535
Fetching reviews for anime ID: 5114
Fetching reviews for anime ID: 30276
Fetching reviews for anime ID: 38000
Fetching reviews for anime ID: 11757
Fetching reviews for anime ID: 31964
Fetching reviews for anime ID: 11061
Fetching reviews for anime ID: 20
Fetching reviews for anime ID: 22319
Fetching reviews for anime ID: 32281
Fetching reviews for anime ID: 25777
Fetching reviews for anime ID: 40748
Fetching reviews for anime ID: 9253
Fetching reviews for anime ID: 33486
Fetching reviews for anime ID: 1735
Fetching reviews for anime ID: 21
Fetching reviews for anime ID: 35760
Fetching reviews for anime ID: 19815


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#load data
anime_df = pd.read_csv('anime_data.csv')
reviews_df = pd.read_csv('anime_reviews.csv')

#basic data cleaning
anime_df['score'] = pd.to_numeric(anime_df['score'], errors='coerce')
anime_df['episodes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['year'] = pd.to_numeric(anime_df['year'], errors='coerce')

#missing values
anime_df['score'] = anime_df['score'].fillna(anime_df['score'].mean())
anime_df['episodes'] = anime_df['episodes'].fillna(0)

#convert to python lists
anime_df['genres'] = anime_df['genres'].apply(eval)
anime_df['studios'] = anime_df['studios'].apply(eval)
anime_df['themes'] = anime_df['themes'].apply(eval)

#petty stats
print("Dataset Stats:")
print(f"Total anime entries: {len(anime_df)}")
print(f"Year range: {anime_df['year'].min()} to {anime_df['year'].max()}")
print(f"Average score: {anime_df['score'].mean():.2f}")
print(f"Most common type: {anime_df['type'].value_counts().index[0]}")

#genre-based features
all_genres= []
for genres in anime_df['genres']:
    all_genres.extend(genres)
unique_genres = list(set(all_genres))
for genre in unique_genres:
    anime_df['fgenre_{genre}'] = anime_df['genres'].apply(lambda x: 1 if genre in x else 0)

Dataset Stats:
Total anime entries: 125
Year range: 1989.0 to 2022.0
Average score: 8.14
Most common type: TV


In [6]:
unique_genres

['Fantasy',
 'Drama',
 'Action',
 'Adventure',
 'Ecchi',
 'Mystery',
 'Award Winning',
 'Supernatural',
 'Horror',
 'Romance',
 'Sci-Fi',
 'Slice of Life',
 'Avant Garde',
 'Suspense',
 'Comedy',
 'Sports',
 'Gourmet']

In [7]:
anime_df

Unnamed: 0,id,title,english_title,type,episodes,status,airing,aired_from,aired_to,duration,...,popularity,members,favorites,synopsis,season,year,studios,genres,themes,fgenre_{genre}
0,61443,Shunkashuutou Daikousha: Haru no Mai,,TV,0.0,Not yet aired,False,,,Unknown,...,0,1,0,Winter was once the only season in the world-b...,,,[],"[Drama, Romance]",[],0
1,16498,Shingeki no Kyojin,Attack on Titan,TV,25.0,Finished Airing,False,2013-04-07T00:00:00+00:00,2013-09-29T00:00:00+00:00,24 min per ep,...,1,4140423,179241,"Centuries ago, mankind was slaughtered to near...",spring,2013.0,[Wit Studio],"[Action, Award Winning, Drama, Suspense]","[Gore, Military, Survival]",0
2,1535,Death Note,Death Note,TV,37.0,Finished Airing,False,2006-10-04T00:00:00+00:00,2007-06-27T00:00:00+00:00,23 min per ep,...,2,4084820,179239,"Brutal murders, petty thefts, and senseless vi...",fall,2006.0,[Madhouse],"[Supernatural, Suspense]",[Psychological],0
3,5114,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,TV,64.0,Finished Airing,False,2009-04-05T00:00:00+00:00,2010-07-04T00:00:00+00:00,24 min per ep,...,3,3504848,232872,After a horrific alchemy experiment goes wrong...,spring,2009.0,[Bones],"[Action, Adventure, Drama, Fantasy]",[Military],0
4,30276,One Punch Man,One Punch Man,TV,12.0,Finished Airing,False,2015-10-05T00:00:00+00:00,2015-12-21T00:00:00+00:00,24 min per ep,...,4,3367355,65720,The seemingly unimpressive Saitama has a rathe...,fall,2015.0,[Madhouse],"[Action, Comedy]","[Adult Cast, Parody, Super Power]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,4181,Clannad: After Story,Clannad: After Story,TV,24.0,Finished Airing,False,2008-10-03T00:00:00+00:00,2009-03-27T00:00:00+00:00,24 min per ep,...,120,1238203,71648,Tomoya Okazaki and Nagisa Furukawa have gradua...,fall,2008.0,[Kyoto Animation],"[Drama, Romance]",[],0
121,4898,Kuroshitsuji,Black Butler,TV,24.0,Finished Airing,False,2008-10-03T00:00:00+00:00,2009-03-27T00:00:00+00:00,24 min per ep,...,121,1229777,23084,"Young Ciel Phantomhive is known as ""the Queen'...",fall,2008.0,[A-1 Pictures],"[Action, Mystery, Supernatural]","[Historical, Mythology]",0
122,23283,Zankyou no Terror,Terror in Resonance,TV,11.0,Finished Airing,False,2014-07-11T00:00:00+00:00,2014-09-26T00:00:00+00:00,22 min per ep,...,122,1223835,22845,"Painted in red, the word ""VON"" is all that is ...",summer,2014.0,[MAPPA],"[Mystery, Suspense]","[Detective, Psychological]",0
123,19,Monster,Monster,TV,74.0,Finished Airing,False,2004-04-07T00:00:00+00:00,2005-09-28T00:00:00+00:00,24 min per ep,...,123,1221427,57244,"Dr. Kenzou Tenma, an elite neurosurgeon recent...",spring,2004.0,[Madhouse],"[Drama, Mystery, Suspense]","[Adult Cast, Psychological]",0


In [8]:
reviews_df

Unnamed: 0,anime_id,user_username,score,review
0,16498,Sorrowful,10,"Oh dear Shingeki no Kyojin, where do I even be..."
1,16498,Gladius650,10,I started to follow the manga after watching t...
2,16498,SonDavid,10,"In the 80's, Mobile Suit Gundam catapulted ani..."
3,16498,Kerma_,5,Shingeki no Kyojin... Where do I start? In sum...
4,16498,emberreviews,9,"Every once in a while, and even more frequentl..."
...,...,...,...,...
355,19815,ToastytheBaker,9,Blank can unite this burning land and bring an...
356,19815,ar2000,5,A lot of popular shows have plenty of problems...
357,19815,athetosis,6,"""Contender for anime of the season"" ""No doubt ..."
358,19815,_mahoushoujos_,7,Do you ever just have those series that you re...
