In [20]:
import os
import json
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

# ==========================================================
#         TOP 12 ALL EPS (all_episodes_top12.json)
# ==========================================================

url = "https://en.wikipedia.org/wiki/Produce_48"
response = requests.get(url)
soup = BeautifulSoup(response.content, "lxml")

# Cari tabel Top 12
table = None
for t in soup.find_all("table", {"class": "wikitable"}):
    headers = [th.get_text(strip=True) for th in t.find_all("th")]
    if headers and headers[0] == "No." and "Ep.1" in headers:
        table = t
        break

if not table:
    raise Exception("❌ Tabel ranking tidak ditemukan.")

# Ambil isi tabel
rows = table.find_all("tr")[1:]
episode_headers = [th.get_text(strip=True) for th in table.find_all("th")][1:]

data = []

for row in rows:
    cols = row.find_all("td")
    if not cols:
        continue
    rank_no = cols[0].get_text(strip=True)
    for i, col in enumerate(cols[1:]):
        episode = episode_headers[i]
        # Ambil angka episode saja
        episode_num = int(re.sub(r"\D", "", episode))  # "Ep.12" -> 12
        name = col.get_text(strip=True)

        # cleaning nama
        name = name.replace("-", "")                  # hapus tanda "-"
        name = re.sub(r"\s*[↑↓+\-]\d+\s*$", "", name) # hapus simbol "↑1", "↓2", "+3", "-4", dsb
        name = re.split(r"\s*=\s*", name)[0]

        if not name:
            continue
        # Kolom pertama episode, lalu name, lalu rank
        data.append({
            "episode": episode_num,
            "name": name,
            "rank": rank_no
        })

# Simpan ke JSON
output_dir = "../data/raw"
output_file = "all_episodes_top12.json"
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print(f"✅ Sukses menyimpan ranking ke '{output_dir}/{output_file}'")


✅ Sukses menyimpan ranking ke '../data/raw/all_episodes_top12.json'


In [28]:
# CEK TABEL
import pandas as pd

tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_Produce_48_contestants", header=None)
print("Jumlah tabel:", len(tables))
for i, tbl in enumerate(tables):
    # Kecualikan tabel terlalu kecil barisnya
    if tbl.shape[0] > 3:
        print(i, tbl.iloc[2].tolist())

Jumlah tabel: 17
3 ['8D Creative (에잇디크리에이티브)', 'Kang Hyewon (강혜원)', np.int64(19), 'F', 'F', np.int64(38), np.int64(40), '41', '25', '222716', '3', '927362', '3', '4', '311212', '8', '248432', np.int64(8)]
4 [np.int64(1), 'GFriend', '"Love Whisper"', '칠전팔기 (Never Give Up!)', '334', 'Sub vocal 2', 'Son Eunchae', np.int64(88), np.float64(nan), np.float64(nan)]
5 ['Vocal & Rap', np.int64(1), 'Wanna One', '"Energetic"', 'Na Goeun', '382', np.int64(2), nan, np.float64(nan), np.float64(nan), np.float64(nan)]
6 [np.int64(1), 'Contemporary Girls Pop', 'oReO', '"1000%"', np.int64(138), 'Sub vocal 2', 'Moe Goto', np.int64(28), np.int64(19), np.float64(nan)]
7 [np.int64(1), 'Yasushi Akimoto', '"Suki ni Nacchau Darō? (반해버리잖아? / 好きになっちゃうだろう？)"', 'Sub vocal 2', 'Sakura Miyawaki']
8 ['China', 'Produce 101 China Chuang 2019 Chuang 2020 Chuang 2021']
10 ['48 (IZ*ONE)', 'Jang Won-young Sakura Miyawaki Jo Yu-ri Choi Ye-na An Yu-jin Nako Yabuki Kwon Eun-bi Kang Hye-won Hitomi Honda Kim Chaewon Kim Min-ju L

In [39]:
import pandas as pd
import json
import os

# Load semua tabel dari Wikipedia
url = "https://en.wikipedia.org/wiki/List_of_Produce_48_contestants"
tables = pd.read_html(url)

# Ambil tabel ke-4 (index ke-3)
df = tables[3]

# Ubah nama kolom, tambahkan koma yang hilang di ep1_rank dan ep2_rank
df.columns = [
    "company", "name", "age", "1st_grade", "last_grade", 
    "ep1_rank", "ep2_rank", "ep3_rank", "ep5_rank", 
    "ep5_votes", "ep8_rank", "ep8_votes", 
    "ep10_rank", "ep11_rank", "ep11_votes", 
    "ep12_rank", "total_votes", "final_rank"
]

# Pilih kolom yang relevan
data = df[[
    "name", "ep1_rank", "ep2_rank", "ep3_rank", "ep5_rank", 
    "ep8_rank", "ep10_rank", "ep11_rank", "ep12_rank"
]].copy()

# Ganti "-" dan NaN jadi None, dan handle "Eliminated"
for col in data.columns[1:]:
    data[col] = data[col].apply(lambda x: 
        "eliminated" if isinstance(x, str) and "eliminated" in x.lower() 
        else (None if x == "-" or pd.isna(x) else x)
    )

# Hapus baris kosong total
data = data.dropna(how="all")

# Buang duplikat berdasarkan 'name' dan 'final_rank'
data = data.drop_duplicates(subset=["name", "ep12_rank"])

# Ubah ke long format
rank_long = data.melt(id_vars='name', var_name='episode', value_name='rank')
rank_long = rank_long.dropna(subset=['rank'])

# Ekstrak nomor episode dari nama kolom, contoh: "ep1_rank" -> 1
rank_long['episode'] = rank_long['episode'].str.extract(r'ep(\d+)_rank')
rank_long['episode'] = pd.to_numeric(rank_long['episode'], errors='coerce')
rank_long = rank_long.dropna(subset=['episode'])

# Ubah tipe data episode ke int
rank_long['episode'] = rank_long['episode'].astype(int)

# Pastikan rank "eliminated" tetap string, selain itu ubah ke int
def clean_rank(val):
    if isinstance(val, str) and val.lower() == "eliminated":
        return "eliminated"
    try:
        return int(val)
    except:
        return val

rank_long['rank'] = rank_long['rank'].apply(clean_rank)

# Ubah ke dict
records = rank_long.to_dict(orient="records")

# Simpan
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)
output_file = "trainee_episode_rank.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=4, ensure_ascii=False)

print(f"Sukses simpan {len(records)} trainee per episode ke '{output_file}'")

Sukses simpan 768 trainee per episode ke 'trainee_episode_rank.json'


In [None]:
# ==============================================
#         DATA TRAINEE (trainee.json)
# ==============================================

import pandas as pd
import os
import json

# Load semua tabel
url = "https://produce101.fandom.com/wiki/Season_3_Contestants"
tables = pd.read_html(url)

# Ambil tabel pertama
df = tables[0]

# Rename kolom
df.columns = [
    "nationality", "company", "name", "korean_name", "japanese_name",
    "age", "1st_grade", "last_grade", "1st_rank", "final_rank"
]

# CLEANING
# ada beberapa kolom yg lebih baik bertipe integer
# kolom yg berisi "-", diganti ""
df = df.replace("-", "")
df['age'] = df['age'].astype('Int64')    
df['1st_rank'] = df['1st_rank'].astype('Int64')

# Hapus baris kosong total
df = df.dropna(how="all")
# Forward fill
df = df.fillna(method="ffill")

# Hapus baris duplikat berdasarkan kolom 'name' dan 'final_rank'
df = df.drop_duplicates(subset=["name", "final_rank"])


records = df.to_dict(orient="records")

# Simpan
output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "trainee.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=4, ensure_ascii=False)

print(f"Sukses simpan {len(records)} trainee ke '{output_file}'")


✅ Sukses simpan 96 trainee ke 'trainee.json'


  df = df.fillna(method="ffill")


<h3> DATA PERFORMANCE </h3>

In [90]:
# CEK TABEL
import pandas as pd

tables = pd.read_html("https://akb48.fandom.com/wiki/Produce_48", header=None)
print("Jumlah tabel:", len(tables))
for i, tbl in enumerate(tables):
    # Kecualikan tabel terlalu kecil barisnya
    if tbl.shape[0] > 3:
        print(i, tbl.iloc[2].tolist())


Jumlah tabel: 12
0 [np.int64(3), 'Jo Yuri', 'Stone Music Entertainment', np.int64(294734)]
1 [np.float64(nan), 'Eliminated in Episode 11']
2 ['AKB48', 'Team A', 'Shinozaki Ayana', 'F', 'F', np.float64(84.0), np.float64(87.0), '77', '91 (30,489)', 'Eliminated', 'Eliminated', 'Eliminated', 'Eliminated']
3 ['Starship Entertainment', 'Jang Wonyoung', np.int64(15), 'B', 'B', np.int64(3), np.int64(4), np.int64(4), '3 (539,596)', '1 (1,010,555)', np.int64(8), '7 (277,922)', '1 (338,366)']
4 [np.float64(nan), 'Center']
5 [np.int64(1), 'GFRIEND', '"Love Whisper"', 'ChilJeonPalgi (칠전팔기)', '334', 'Sub vocal 2', 'Son Eunchae', np.int64(88), np.float64(nan), np.float64(nan)]
6 [np.float64(nan), 'Center']
7 ['Vocal & Rap', np.int64(1), 'Wanna One', '"Energetic"', 'Na Goeun', np.float64(382.0), np.int64(2), nan, np.float64(nan), np.float64(nan), np.float64(nan)]
9 [np.int64(1), 'Contemporary Girls Pop', 'oReO', '"1000%"', np.int64(138), 'Sub vocal 2', 'Goto Moe', np.int64(28), np.int64(19), np.float6

In [None]:
# ==========================================================
#     INDIVIDUAL EVALUATION (individual_evaluation.json)
# ==========================================================

import pandas as pd
import os
import re
import json

# Fetch tables
tables = pd.read_html("https://produce101.fandom.com/wiki/Produce_48_Company_Evaluation")

# Ambil tabel posisi evaluasi (pastikan ini benar)
position_table = tables[0]  # konfirmasi berdasarkan struktur tabel

# Tampilkan struktur mentah untuk debugging (bisa di-comment kalau udah oke)
print("Raw table columns:", position_table.columns.tolist())
print("Sample rows:\n", position_table.head(3).to_string())

# Ambil header dari baris ke-2 dan data mulai baris ke-3
headers = position_table.iloc[1]
data = position_table.iloc[2:].copy()

# Atur kolom manual
data.columns = [
    "company",
    "name",
    "grade",
    "song",
    "link"
]

data = data[["company","name", "grade", "song"]]

# cleaning kolom song
def clean_quotes(song):
    if isinstance(song, str):
        # Hapus kutip ganda hanya di sekitar judul lagu
        return re.sub(r'^"(.*?)"\s*-\s*(.*)$', r'\1 - \2', song).strip()
    return song

data['song'] = data['song'].apply(clean_quotes)

# Hapus baris kosong total
data = data.dropna(how='all').reset_index(drop=True)

# Ganti semua NaN (kosong) menjadi string kosong ""
data = data.fillna("")

# Convert ke records (list of dict)
records = data.to_dict(orient='records')

# Simpan ke file JSON
output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "individual_evaluation.json"

with open(os.path.join(output_dir, output_file), 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Successfully saved {len(records)} records to {output_file}")



Raw table columns: [0, 1, 2, 3, 4]
Sample rows:
                  0                1                2                           3                4
0  Korean Trainees  Korean Trainees  Korean Trainees             Korean Trainees  Korean Trainees
1          Company         Trainees            Grade                        Song             Link
2      Independent    Park Seoyoung                C  "Roller Coaster" - Chungha       Video Link
✅ Successfully saved 98 records to individual_evaluation.json


In [162]:
import requests
from bs4 import BeautifulSoup
import json
import os

# ==========================================================
#     GROUP BATTLE EVALUATION (group_battle_evaluation.json)
# ==========================================================

# Fetch tables
tables = pd.read_html("https://akb48.fandom.com/wiki/Produce_48")

# Ambil tabel posisi evaluasi 
position_table = tables[5] 

# Tampilkan struktur mentah untuk debugging (bisa di-comment kalau udah oke)
print("Raw table columns:", position_table.columns.tolist())
print("Sample rows:\n", position_table.head(3).to_string())

# Ambil header dari baris ke-2 dan data mulai baris ke-1
headers = position_table.iloc[1]
data = position_table.iloc[0:].copy()

# Atur kolom manual
data.columns = [
    "id_perform",
    "original_artist", 
    "song", 
    "team_name", 
    "team_votes", 
    "trainee_position",
    "trainee_name", 
    "trainee_votes", 
    "trainee_bonus",
    "unused"
]

data = data[["id_perform", "original_artist", "song", "team_name", "team_votes", 
    "trainee_position", "trainee_name", "trainee_votes", "trainee_bonus"]]

# Hapus baris kosong total
data = data.dropna(how='all').reset_index(drop=True)

# cleaning kolom angka
def safe_int(val):
    if pd.isna(val) or val == "":
        return 0
    # Ambil digit dengan regex
    digits = re.sub(r"[^\d]", "", str(val))
    return int(digits) if digits else 0

data['trainee_bonus'] = data['trainee_bonus'].apply(safe_int)

# cleaning kolom song
def clean_quotes(text):
    if isinstance(text, str):
        text = text.strip()
        # Hilangkan hanya kutip ganda luar, tapi biarkan apostrof (kutip tunggal)
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
    return text

data['song'] = data['song'].apply(clean_quotes)

# Ganti semua NaN (kosong) menjadi string kosong ""
data = data.fillna("")

# Convert ke records (list of dict)
records = data.to_dict(orient='records')

# Simpan ke file JSON
output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "group_battle_evaluation.json"

with open(os.path.join(output_dir, output_file), 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Successfully saved {len(records)} records to {output_file}")


Raw table columns: [('Performance', '#'), ('Performance', 'Artist'), ('Performance', 'Song'), ('Team', 'Name'), ('Team', 'Votes (Average)'), ('Contestant', 'Position'), ('Contestant', 'Name'), ('Contestant', 'Votes'), ('Contestant', 'Bonus'), ('Contestant', 'Unnamed: 9_level_1')]
Sample rows:
   Performance                                           Team                   Contestant                                            
            #   Artist            Song                  Name Votes (Average)     Position         Name Votes Bonus Unnamed: 9_level_1
0           1  GFRIEND  "Love Whisper"  ChilJeonPalgi (칠전팔기)             334   Main vocal      Wang Ke    28   NaN                NaN
1           1  GFRIEND  "Love Whisper"  ChilJeonPalgi (칠전팔기)             334  Sub vocal 1    Muto Tomu    52   NaN                NaN
2           1  GFRIEND  "Love Whisper"  ChilJeonPalgi (칠전팔기)             334  Sub vocal 2  Son Eunchae    88   NaN                NaN
✅ Successfully saved 92 records to 

In [161]:
# ==========================================================
#     POSITION EVALUATION (position_evaluation.json)
# ==========================================================

import pandas as pd
import os
import re
import json

# Fetch tables
tables = pd.read_html("https://akb48.fandom.com/wiki/Produce_48")

# Ambil tabel posisi evaluasi (pastikan ini benar)
position_table = tables[7]  # konfirmasi berdasarkan struktur tabel

# Tampilkan struktur mentah untuk debugging (bisa di-comment kalau udah oke)
print("Raw table columns:", position_table.columns.tolist())
print("Sample rows:\n", position_table.head(3).to_string())

# Ambil header dari baris ke-2 dan data mulai baris ke-1
headers = position_table.iloc[0]
data = position_table.iloc[0:].copy()

# Atur kolom manual
data.columns = [
    "category", 
    "id_perform", 
    "original_artist", 
    "song", 
    "trainee_name", 
    "trainee_votes", 
    "rank_in_team", 
    "trainee_bonus",
    "results_8",
    "results_9",
    "results_10"
]

data = data[["category", "id_perform", "original_artist", "song", "trainee_name", "trainee_votes", "rank_in_team", "trainee_bonus",]]

# Hapus baris kosong total
data = data.dropna(how='all').reset_index(drop=True)

# cleaning kolom Bonus
def safe_int(val):
    if pd.isna(val) or val == "":
        return 0
    # Ambil digit dengan regex
    digits = re.sub(r"[^\d]", "", str(val))
    return int(digits) if digits else 0

data['trainee_bonus'] = data['trainee_bonus'].apply(safe_int)

# cleaning kolom song
def clean_quotes(text):
    if isinstance(text, str):
        text = text.strip()
        # Hilangkan hanya kutip ganda luar, tapi biarkan apostrof (kutip tunggal)
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
    return text

data['song'] = data['song'].apply(clean_quotes)

# Ganti semua NaN (kosong) menjadi string kosong ""
data = data.fillna("")

# Convert ke records (list of dict)
records = data.to_dict(orient='records')

# Simpan ke file JSON
output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "position_evaluation.json"

with open(os.path.join(output_dir, output_file), 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Successfully saved {len(records)} records to {output_file}")


Raw table columns: [('Performance', 'Position'), ('Performance', '#'), ('Performance', 'Artist'), ('Performance', 'Song'), ('Name', 'Name'), ('Results', 'Votes'), ('Results', 'Rank'), ('Results', 'Bonus'), ('Results', 'Unnamed: 8_level_1'), ('Results', 'Unnamed: 9_level_1'), ('Results', 'Unnamed: 10_level_1')]
Sample rows:
    Performance                                    Name Results                                                                      
      Position  #     Artist         Song         Name   Votes Rank  Bonus Unnamed: 8_level_1 Unnamed: 9_level_1 Unnamed: 10_level_1
0  Vocal & Rap  1  Wanna One  "Energetic"      Jo Yuri   511.0    1  +5000                NaN                NaN                 NaN
1  Vocal & Rap  1  Wanna One  "Energetic"  Kim Sihyeon   376.0    3    NaN                NaN                NaN                 NaN
2  Vocal & Rap  1  Wanna One  "Energetic"     Na Goeun   382.0    2    NaN                NaN                NaN                 NaN
✅ Success

In [13]:
# ==========================================================
#     CONCEPT EVALUATION (concept_evaluation.json)
# ==========================================================

import pandas as pd
import os
import re
import json

# Ambil semua tabel dari Fandom Wiki
tables = pd.read_html("https://akb48.fandom.com/wiki/Produce_48")

# Pilih tabel Concept Evaluation (tabel ke-9)
table = tables[9]

# Cek struktur awal (opsional)
print("Kolom awal:", table.columns.tolist())
print("Contoh baris:\n", table.head(3).to_string())

# Ambil data dari baris ke-3 ke bawah
data = table.iloc[0:].copy()

# Rename kolom secara manual
data.columns = [
    "id_perform",
    "concept",
    "producer",
    "original_single",
    "votes",
    "trainee_position",
    "trainee_name",
    "trainee_votes",
    "trainee_rank",
    "bonus"
]

# Drop baris yang kosong total
data = data.dropna(how='all').reset_index(drop=True)

print(data["bonus"].unique()[:10]) #ngecek

# cleaning kolom angka
def safe_int(val):
    if pd.isna(val) or val == "":
        return 0
    # Ambil digit dengan regex
    digits = re.sub(r"[^\d]", "", str(val))
    return int(digits) if digits else 0

# cleaning int
data['votes'] = data['votes'].apply(safe_int)
data['bonus'] = data['bonus'].apply(safe_int)
data['trainee_rank'] = data['trainee_rank'].apply(safe_int)
data['trainee_votes'] = data['trainee_votes'].apply(safe_int)

# cleaning kolom song
def clean_quotes(text):
    if isinstance(text, str):
        text = text.strip()
        # Hilangkan hanya kutip ganda luar, tapi biarkan apostrof (kutip tunggal)
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
    return text

data['original_single'] = data['original_single'].apply(clean_quotes)

# Ganti NaN lainnya dengan string kosong
data = data.fillna("")

# Konversi ke list of dict
records = data.to_dict(orient='records')

# Simpan ke file JSON
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)
output_file = "concept_evaluation.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Sukses simpan {len(records)} baris ke {output_file}")


Kolom awal: [('Performance', '#'), ('Performance', 'Concept'), ('Performance', 'Producer'), ('Performance', 'Song'), ('Performance', 'Votes'), ('Contestant', 'Position'), ('Contestant', 'Name'), ('Contestant', 'Votes'), ('Contestant', 'Rank'), ('Contestant', 'Bonus')]
Contoh baris:
   Performance                                                   Contestant                                
            #                 Concept Producer     Song Votes     Position           Name Votes Rank Bonus
0           1  Contemporary Girls Pop     oReO  "1000%"   138   Main vocal   Lee Chaeyeon    18   24   NaN
1           1  Contemporary Girls Pop     oReO  "1000%"   138  Sub vocal 1  Miyazaki Miho    42   13   NaN
2           1  Contemporary Girls Pop     oReO  "1000%"   138  Sub vocal 2       Goto Moe    28   19   NaN
[   nan 20000. 50000.]
✅ Sukses simpan 30 baris ke concept_evaluation.json


In [21]:
# ==========================================================
#     DEBUT EVALUATION (debut_evaluation.json)
# ==========================================================

# Load semua tabel
tables = pd.read_html("https://akb48.fandom.com/wiki/Produce_48")

# Ambil tabel ke-11
table = tables[11]

# Cek struktur awal (bisa di-comment kalau sudah yakin)
print("Kolom awal:", table.columns.tolist())
print("Contoh data:\n", table.head(3).to_string())

# Ambil data dari baris ke-3 (karena baris 1 & 2 biasanya header campuran)
data = table.iloc[2:].copy()

# Atur ulang nama kolom
data.columns = [
    "id_perform",
    "producer",
    "original_single",
    "trainee_position",
    "trainee_name"
]

# cleaning kolom song
def clean_quotes(text):
    if isinstance(text, str):
        text = text.strip()
        # Hilangkan hanya kutip ganda luar, tapi biarkan apostrof (kutip tunggal)
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
    return text

data['original_single'] = data['original_single'].apply(clean_quotes)

# Drop baris kosong total
data = data.dropna(how='all').reset_index(drop=True)

# Ganti NaN menjadi string kosong
data = data.fillna("")

# Ubah ke format list of dict
records = data.to_dict(orient="records")

# Simpan sebagai JSON
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)
output_file = "debut_evaluation.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Sukses simpan {len(records)} baris ke {output_file}")


Kolom awal: [('Performance', '#'), ('Performance', 'Producer'), ('Performance', 'Song'), ('Contestant', 'Position'), ('Contestant', 'Name')]
Contoh data:
   Performance                                                                       Contestant                 
            #         Producer                                               Song     Position             Name
0           1  Akimoto Yasushi  "Suki ni Nacchau Darou? (반해버리잖아? / 好きになっちゃうだろう？)"   Main vocal       Kwon Eunbi
1           1  Akimoto Yasushi  "Suki ni Nacchau Darou? (반해버리잖아? / 好きになっちゃうだろう？)"  Sub vocal 1        Choi Yena
2           1  Akimoto Yasushi  "Suki ni Nacchau Darou? (반해버리잖아? / 好きになっちゃうだろう？)"  Sub vocal 2  Miyawaki Sakura
✅ Sukses simpan 18 baris ke debut_evaluation.json


<h3>GENERAL INFO </h3>

In [121]:
# ==========================================================
#     OFFICIAL WINNER (winner.json)
# ==========================================================

# Load semua tabel
tables = pd.read_html("https://akb48.fandom.com/wiki/Produce_48")

# Ambil tabel pertama
table = tables[0]

# Cek struktur awal (bisa di-comment kalau sudah yakin)
print("Kolom awal:", table.columns.tolist())
print("Contoh data:\n", table.head(3).to_string())

# Ambil data dari baris ke-3 (karena baris 1 & 2 biasanya header campuran)
data = table.iloc[2:].copy()

# Atur ulang nama kolom
data.columns = [
    "rank",
    "name",
    "company",
    "votes"
]

# Drop baris kosong total
data = data.dropna(how='all').reset_index(drop=True)

# Ganti NaN menjadi string kosong
data = data.fillna("")

# Ubah ke format list of dict
records = data.to_dict(orient="records")

# Simpan sebagai JSON
output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "winner.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Sukses simpan {len(records)} baris ke {output_file}")


Kolom awal: ['#', 'Name', 'Agency / Group', 'Votes']
Contoh data:
    #             Name             Agency / Group   Votes
0  1    Jang Wonyoung     Starship Entertainment  338366
1  2  Miyawaki Sakura                      HKT48  316105
2  3          Jo Yuri  Stone Music Entertainment  294734
✅ Sukses simpan 10 baris ke winner.json


In [46]:
# CEK TABEL 2
import pandas as pd

tables = pd.read_html("https://en.wikipedia.org/wiki/Produce_48#Result", header=None)
print("Jumlah tabel:", len(tables))
for i, tbl in enumerate(tables):
    # Kecualikan tabel terlalu kecil barisnya
    if tbl.shape[0] > 3:
        print(i, tbl.iloc[2].tolist())


Jumlah tabel: 22
0 ['Created by', 'Kim Young-bum for Mnet']
1 [np.float64(nan), 'Contestants eliminated in the third elimination round']
2 ['Kim Min-ju (김민주)', 'Lee Chae-yeon (이채연)', 'Han Cho-won (한초원)', 'Lee Ga-eun (이가은)', 'Miho Miyazaki (宮崎美穂)']
3 ['2', '"Episode 2"', 'June\xa022,\xa02018', nan]
5 [np.int64(3), 'Jang Won-young', 'Kwon Eun-bi ↑22', 'Sakura Miyawaki ↑2', 'Jang Won-young ↑1', 'Kang Hye-won ↑22', 'Kang Hye-won =', 'Lee Chae-yeon ↑9', 'Jo Yu-ri ↑15']
6 [np.int64(3), 'Jo Yu-ri', np.int64(294734), 'Stone Music']
8 ['"To Reach You" (너에게 닿기를)', '2018', '57', '61', '—', '30 Girls 6 Concepts', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
9 ['3', 'June 29, 2018', '1.999%', '2.098%']
13 ['China', 'Produce 101 China Chuang 2019 Chuang 2020 Chuang 2021']
15 ['48 (IZ*ONE)', 'Jang Won-young Sakura Miyawaki Jo Yu-ri Choi Ye-na An Yu-jin Nako Yabuki Kwon Eun-bi Kang Hye-won Hitomi Honda Kim Chaewon Kim Min-ju Lee Chae-yeon']
16 ['48', 'Alex Christine Hong Ye-ji Huh Yunjin Jang Gyu

In [22]:
# ==========================================================
#     SINGLES PEAK CHART POSITION(single_peak_chart.json)
# ==========================================================

# Load semua tabel
tables = pd.read_html("https://en.wikipedia.org/wiki/Produce_48#Result")

# Ambil tabel ke-8
table = tables[8]

# Cek struktur (opsional)
print("Kolom awal:", table.columns.tolist())
print("Contoh data:\n", table.head(3).to_string())

# Ambil data dari baris ke-3 ke bawah
data = table.iloc[0:].copy()

# Drop semua kolom yang isinya kosong total (semua nilainya NaN)
data = data.dropna(axis=1, how='all')

# Cek ulang jumlah kolom yang tersisa
print("🧹 Kolom setelah dibersihkan:", data.shape[1])

# Rename kolom jika jumlahnya sudah tepat (harus 6)
data.columns = [
    "song_title",
    "year",
    "peak_kor",
    "kor_hot_100",
    "jpn_hot_100",
    "album",
    "Unused_1",
    "Unused_2",
    "Unused_3",
    "Unused_4",
    "Unused_5",
    "Unused_6",
    "Unused_7",
    "Unused_8",
    "Unused_9",
    "Unused_10"
]

data = data[["song_title", "year", "peak_kor", "kor_hot_100", "jpn_hot_100", "album"]]

# cleaning kolom song
def clean_quotes(text):
    if isinstance(text, str):
        text = text.strip()
        # Hilangkan hanya kutip ganda luar, tapi biarkan apostrof (kutip tunggal)
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
    return text

data['song_title'] = data['song_title'].apply(clean_quotes)

# cleaning angka: "-" dan NaN jadi 0, string angka jadi int
def safe_int(val):
    if pd.isna(val) or val == "-" or val == "":
        return 0
    try:
        return int(str(val).replace(",", "").strip())
    except:
        return 0

for col in ["year", "peak_kor", "kor_hot_100", "jpn_hot_100"]:
    data[col] = data[col].apply(safe_int)

# Cleaning sekalian baris kosong
data = data.dropna(how='all').reset_index(drop=True)

# Ganti NaN menjadi string kosong
data = data.fillna("")

# Ubah ke format list of dict
records = data.to_dict(orient="records")

# Simpan ke JSON
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)
output_file = "single_peak_chart.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"Sukses simpan {len(records)} baris ke {output_file}")


Kolom awal: [('Title', 'Title'), ('Year', 'Year'), ('Peak positions', 'KOR [26]'), ('Peak positions', 'KOR Hot 100 [27]'), ('Peak positions', 'JPN Hot 100'), ('Album', 'Album'), ('Unnamed: 6_level_0', 'Unnamed: 6_level_1'), ('Unnamed: 7_level_0', 'Unnamed: 7_level_1'), ('Unnamed: 8_level_0', 'Unnamed: 8_level_1'), ('Unnamed: 9_level_0', 'Unnamed: 9_level_1'), ('Unnamed: 10_level_0', 'Unnamed: 10_level_1'), ('Unnamed: 11_level_0', 'Unnamed: 11_level_1'), ('Unnamed: 12_level_0', 'Unnamed: 12_level_1'), ('Unnamed: 13_level_0', 'Unnamed: 13_level_1'), ('Unnamed: 14_level_0', 'Unnamed: 14_level_1'), ('Unnamed: 15_level_0', 'Unnamed: 15_level_1')]
Contoh data:
                       Title  Year Peak positions                                             Album Unnamed: 6_level_0 Unnamed: 7_level_0 Unnamed: 8_level_0 Unnamed: 9_level_0 Unnamed: 10_level_0 Unnamed: 11_level_0 Unnamed: 12_level_0 Unnamed: 13_level_0 Unnamed: 14_level_0 Unnamed: 15_level_0
                      Title  Year       K

In [176]:
# ==========================================================
#            RATING PER EPS (episode_rating.json)
# ==========================================================

import os
import json
import pandas as pd

# Load semua tabel
tables = pd.read_html("https://en.wikipedia.org/wiki/Produce_48#Result")

# Ambil tabel ke-10
table = tables[9]

# Flatten MultiIndex kolom (tuple menjadi string biasa)
table.columns = [' '.join(col).strip() if isinstance(col, tuple) else col for col in table.columns]

# Cek struktur (opsional)
print("Kolom awal:", table.columns.tolist())
print("Contoh data:\n", table.head(3).to_string())

# Buat salinan untuk cleaning
data = table.copy()

# Drop semua kolom yang isinya kosong total
data = data.dropna(axis=1, how='all')

# Cek ulang jumlah kolom
print("🧹 Kolom setelah dibersihkan:", data.shape[1])

# Rename kolom
data.columns = [
    "episode",
    "broadcast_date",
    "nationwide_rate",
    "seoul_rate"
]

# Fungsi cleaning angka
def clean_rate(rate):
    if pd.isna(rate) or rate in ["", "NR"]:
        return 0.0
    return float(rate.replace("%", "").strip())

data["nationwide_rate"] = data["nationwide_rate"].apply(clean_rate)
data["seoul_rate"] = data["seoul_rate"].apply(clean_rate)

# Hapus baris kosong total
data = data.dropna(how='all').reset_index(drop=True)

# Ganti NaN jadi string kosong
data = data.fillna("")

# Konversi ke list of dict
records = data.to_dict(orient="records")

# Simpan ke JSON
output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "episode_rating.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"✅ Sukses simpan {len(records)} baris ke {output_file}")


Kolom awal: ['Ep. Ep. Ep.', 'Broadcast date Broadcast date Broadcast date', 'Average audience share AGB Nielsen Nationwide[33]', 'Average audience share AGB Nielsen Seoul[34]']
Contoh data:
   Ep. Ep. Ep. Broadcast date Broadcast date Broadcast date Average audience share AGB Nielsen Nationwide[33] Average audience share AGB Nielsen Seoul[34]
0           1                                June 15, 2018                                            1.132%                                           NR
1           2                                June 22, 2018                                            1.913%                                       1.768%
2           3                                June 29, 2018                                            1.999%                                       2.098%
🧹 Kolom setelah dibersihkan: 4
✅ Sukses simpan 13 baris ke episode_rating.json


In [None]:
# CEK TABEL WEBPAGE 3
import pandas as pd

tables = pd.read_html("https://en.wikipedia.org/wiki/30_Girls_6_Concepts", header=None)
print("Jumlah tabel:", len(tables))
for i, tbl in enumerate(tables):
    # Kecualikan tabel terlalu kecil barisnya
    if tbl.shape[0] > 3:
        print(i, tbl.iloc[2].tolist())

Jumlah tabel: 10
0 ['Released', 'September\xa01,\xa02018', np.float64(nan)]
2 [np.float64(3.0), '"Yume o Miteiru Aida (Korean Version)" (꿈을 꾸는 동안 (夢を見ている間) Korean Ver.)', 'Yasushi Akimoto', 'Iggy (OREO)Youngbae (RBW)', 'Iggy (OREO)Youngbae (RBW)', '3:28']
3 ['China', 'Produce 101 China Chuang 2019 Chuang 2020 Chuang 2021']
5 ['48 (IZ*ONE)', 'Jang Won-young Sakura Miyawaki Jo Yu-ri Choi Ye-na An Yu-jin Nako Yabuki Kwon Eun-bi Kang Hye-won Hitomi Honda Kim Chaewon Kim Min-ju Lee Chae-yeon']
6 ['48', 'Alex Christine Hong Ye-ji Huh Yunjin Jang Gyu-ri Juri Takahashi Jurina Matsui Kim Do-ah Kim Si-hyeon Mako Kojima Miho Miyazaki Miru Shiroma Miyu Takeuchi Rena Hasegawa Sae Murase Shin Su-hyun Tomu Muto Yūka Kato']


In [18]:
# ==========================================================
#    SONGS (singles.json)
# ==========================================================

def load_and_clean_table(url, table_index):
    tables = pd.read_html(url)
    table = tables[table_index]

    data = table.iloc[0:].copy()

    data.columns = [
        "id_single",
        "title",
        "lyrics_writer",
        "music_producer",
        "arrangement",
        "duration"
    ]

    data = data.fillna("")

    return data.to_dict(orient="records")

# ===== LOAD DATASET 1 =====
songs_1 = load_and_clean_table(
    "https://en.wikipedia.org/wiki/30_Girls_6_Concepts", 
    table_index=2
)

# ===== LOAD DATASET 2 =====
songs_2 = load_and_clean_table(
    "https://en.wikipedia.org/wiki/Produce_48_%E2%80%93_Final", 
    table_index=2
)

# ===== GABUNGKAN =====
total_songs = songs_1 + songs_2

# clean judul lagu
def clean_title(title):
    title = title.strip()
    # "\"Rumor\" (H.I.N.P)" => "Rumor (H.I.N.P)"
    title = re.sub(r'^"([^"]+)"', r'\1', title)
    return title

for i, song in enumerate(total_songs, start=1):
    song["id_single"] = str(i)  # Atau f"{i:02d}" kalau mau leading zero
    song["title"] = clean_title(song["title"])

# ===== SIMPAN KE JSON =====
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)
output_file = "singles.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(total_songs, f, indent=2, ensure_ascii=False)

print(f"✅ Sukses gabung {len(songs_1)} + {len(songs_2)} = {len(total_songs)} lagu ke '{output_file}'")


✅ Sukses gabung 6 + 4 = 10 lagu ke 'singles.json'


In [186]:
# ==========================================================
#     ALBUM (album.json)
# ==========================================================

import os
import json
import pandas as pd
import re

# Ambil semua tabel dari Wikipedia
tables = pd.read_html("https://en.wikipedia.org/wiki/Produce_48#Result")

# Ambil tabel ke-7
table = tables[7]

# Tampilkan struktur awal (opsional)
print("Kolom awal:", table.columns.tolist())
print("Contoh data:\n", table.head(2).to_string())

# Ambil baris data aja
data = table.iloc[0:].reset_index(drop=True)

# Rename kolom
data.columns = [
    "title",
    "details",
    "jpn_hot",
    "jpn_digital",
    "us_world",
    "sales"
]

# =====================
# CLEANING FUNCTIONS
# =====================

def clean_chart(val):
    if val == "—" or val == "":
        return 0
    return int(val)

def clean_sales(val):
    if pd.isna(val) or val == "":
        return 0
    val = str(val)
    match = re.search(r"(\d[\d,]*)", val)
    if match:
        return int(match.group(1).replace(",", ""))
    return 0

def parse_details(details):
    release_date = ""
    label = ""
    formats = ""

    if isinstance(details, str):
        if "Released:" in details:
            release_date = details.split("Released:")[1].split("Label:")[0].strip()
        if "Label:" in details:
            label = details.split("Label:")[1].split("Formats:")[0].strip()
        if "Formats:" in details:
            formats = details.split("Formats:")[1].strip()

    return release_date, label, formats

# =====================
# APPLY CLEANING
# =====================

cleaned_records = []

for i, row in data.iterrows():
    release_date, label, formats = parse_details(row["details"])

    cleaned_records.append({
        "title": row["title"],
        "release_date": release_date,
        "label": label,
        "formats": formats,
        "jpn_hot": clean_chart(row["jpn_hot"]),
        "jpn_digital": clean_chart(row["jpn_digital"]),
        "us_world": clean_chart(row["us_world"]),
        "sales": clean_sales(row["sales"])
    })

# =====================
# SIMPAN KE JSON
# =====================

output_dir = "../../data"
os.makedirs(output_dir, exist_ok=True)
output_file = "album.json"

with open(os.path.join(output_dir, output_file), "w", encoding="utf-8") as f:
    json.dump(cleaned_records, f, indent=2, ensure_ascii=False)

print(f"✅ Sukses simpan {len(cleaned_records)} baris ke {output_file}")


Kolom awal: [('Title', 'Title'), ('Details', 'Details'), ('Peak chart positions', 'JPN Hot [22]'), ('Peak chart positions', 'JPN Dig [23]'), ('Peak chart positions', 'US World [24]'), ('Sales', 'Sales')]
Contoh data:
                  Title                                                                                 Details Peak chart positions                                      Sales
                 Title                                                                                 Details         JPN Hot [22] JPN Dig [23] US World [24]           Sales
0  30 Girls 6 Concepts    Released: August 18, 2018 Label: Stone Music Entertainment Formats: Digital download                   14            7             9  JPN: 2,473[25]
1   Produce 48 – Final  Released: September 1, 2018 Label: Stone Music Entertainment Formats: Digital download                   22            —             —             NaN
✅ Sukses simpan 2 baris ke album.json
