In [1]:
import pandas as pd
import duckdb
import seaborn as sns
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os

In [3]:
con = duckdb.connect(":memory:")

In [4]:
con.sql("load postgres;")

In [5]:
con.sql(
    f"ATTACH 'dbname={os.getenv('DB_NAME')} user={os.getenv('DB_USER')} host={os.getenv('DB_HOST')} password={os.getenv('DB_PASSWORD')} port={os.getenv('DB_PORT')}' AS supabase (TYPE postgres, SCHEMA 'public');"
)

In [7]:
con.sql("select * from supabase.anime limit 10;").df()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


In [None]:
genre_query = """
select
trim(lower(trim(UNNEST(STRING_SPLIT(trim(genre), ','))))) AS genres_cleaned,
from supabase.anime 
"""

con.sql(genre_query).df()

┌────────────────────────┐
│     genres_cleaned     │
│        varchar         │
├────────────────────────┤
│ drama                  │
│ romance                │
│ school                 │
│ supernatural           │
│ action                 │
│ adventure              │
│ drama                  │
│ fantasy                │
│ magic                  │
│ military               │
│   ·                    │
│   ·                    │
│   ·                    │
│ mecha                  │
│ military               │
│ romance                │
│ sci-fi                 │
│ shounen                │
│ action                 │
│ mecha                  │
│ military               │
│ romance                │
│ shoujo                 │
├────────────────────────┤
│         ? rows         │
│ (>9999 rows, 20 shown) │
└────────────────────────┘

In [9]:
genres = con.sql(genre_query).df()
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26684 entries, 0 to 26683
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   genres_cleaned  26684 non-null  object
dtypes: object(1)
memory usage: 208.6+ KB


In [19]:
genres_cleaned = (
    genres.groupby("genres_cleaned")["genres_cleaned"].count().reset_index(name="count")
)
genres_cleaned

Unnamed: 0,genres_cleaned,count
0,,10
1,action,2327
2,adventure,1804
3,cars,48
4,comedy,3383
5,dementia,67
6,demons,194
7,drama,1512
8,ecchi,571
9,fantasy,1647


In [20]:
genres_cleaned.sort_values(by="count", ascending=False, inplace=True)

In [21]:
genres_cleaned

Unnamed: 0,genres_cleaned,count
4,comedy,3383
1,action,2327
2,adventure,1804
28,sci-fi,1732
9,fantasy,1647
32,shounen,1526
7,drama,1512
25,romance,1282
27,school,1007
34,slice of life,895


In [23]:
len(genres_cleaned["genres_cleaned"].to_list())

41

In [25]:
genres_cleaned.dropna(inplace=True)
genres_cleaned.tail()

Unnamed: 0,genres_cleaned,count
33,shounen ai,59
31,shoujo ai,51
14,josei,50
3,cars,48
0,,10


In [26]:
genres_cleaned = genres_cleaned.iloc[:-1]
genres_cleaned.tail()

Unnamed: 0,genres_cleaned,count
5,dementia,67
33,shounen ai,59
31,shoujo ai,51
14,josei,50
3,cars,48
