In [1]:
# Import Library
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Load the spreadsheet listing every series: its id, page URL and original title
# (the URL list was collected from the sitemap of the main site).
df = pd.read_excel(io=r"~/Documents/Datasets/kakao/kakao_webtoon.xlsx")
df.head(5)

Unnamed: 0,Id,Content,Title_Ori
0,1,https://id.kakaowebtoon.com/content/Dr.-Brain/1,Dr. Brain
1,2,https://id.kakaowebtoon.com/content/The-Tyrant...,The Tyrant's Tranquilizer
2,3,https://id.kakaowebtoon.com/content/Starting-f...,Starting from Today Im a Princess
3,4,https://id.kakaowebtoon.com/content/My-Life-as...,My Life as a Player
4,5,https://id.kakaowebtoon.com/content/Go-Youngsi...,Go Youngsins Trainee Life


In [12]:
# Accumulator for the scraping loop: one dict of scraped fields per series
by_titles = list()
# Number of series pages to visit (one per row of the "Content" URL column)
n = df["Content"].shape[0]

In [34]:
# Scrape every series page listed in df and collect one record (dict) per series.

# Start from an empty list so that re-running this cell does not append
# duplicates to the records collected by a previous run.
by_titles = []

# Give up on a request instead of hanging forever when the server stops responding.
REQUEST_TIMEOUT_SECONDS = 30

# Iterate over the columns directly rather than df.loc[i, ...] so the loop does
# not depend on df having a default 0..n-1 index.
for i, (the_id, the_url) in enumerate(zip(df["Id"], df["Content"])):
    try:
        result = requests.get(the_url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Turn HTTP error statuses (404, 5xx, ...) into a RequestException so an
        # error page is never parsed as if it were a series page.
        result.raise_for_status()

        soup = BeautifulSoup(result.content, 'html5lib')

        # Text of every <p> tag, skipping empty strings and bare "0" entries.
        paragraph_texts = [paragraph.text for paragraph in soup.find_all("p")]
        subresult_text = [text for text in paragraph_texts if text and text != "0"]

        # soup.find returns None when the tag is absent; report that through the
        # parsing-error handler below instead of crashing with a TypeError.
        desc_tag = soup.find("meta", attrs={'name': 'description'})
        if desc_tag is None:
            raise KeyError("meta description tag not found")

        # The first five remaining <p> texts are taken, in order, as
        # title, author group, genre, views and likes.
        by_title = {'Id': the_id,
                    'subresult_text': subresult_text[:],
                    'title': subresult_text[0],
                    'author_group': subresult_text[1],
                    'genre': subresult_text[2],
                    'views': subresult_text[3],
                    'likes': subresult_text[4],
                    'desc': desc_tag["content"]}

        by_titles.append(by_title)
    except requests.exceptions.RequestException as e:
        # Network failure, timeout or HTTP error status: stop the scrape.
        print(f"Error fetching URL for iteration {i}: {e}")
        break
    except (KeyError, IndexError) as e:
        # Page did not contain the expected tags/fields: stop the scrape.
        print(f"Error parsing data for iteration {i}: {e}")
        break

In [35]:
# Number of records collected by the scraping loop
len(by_titles)

628

In [24]:
# Number of series URLs in df["Content"]; compare with len(by_titles) to confirm every page was scraped
n

628

In [39]:
# Build a dataframe from the list of per-series record dicts (one row per dict).
# The plain constructor is the documented way to consume a list of dicts;
# DataFrame.from_dict is meant for a single dict of columns/rows.
df_detail = pd.DataFrame(by_titles)
df_detail.tail()

Unnamed: 0,Id,subresult_text,title,author_group,genre,views,likes,desc
623,642,"[The Beloved Fake Saint, jn, financier, Junye,...",The Beloved Fake Saint,"jn, financier, Junye",Romansa Fantasi,"2,2M","590,1K",Aku menjadi seorang Saintess palsu dalam sebua...
624,643,"[Dealing with the Secretive Male Second-Lead, ...",Dealing with the Secretive Male Second-Lead,"Insulaire, samo, Lee sinrok",Romansa Fantasi,999K,"284,3K",Saat menghadiri pesta tahun baru bersama tunan...
625,644,"[One Night Relationship, ZUOAN KAMAN, Romansa,...",One Night Relationship,ZUOAN KAMAN,Romansa,"356,7K","40,9K",Gara-gara dua kejadian yang terjadi dalam satu...
626,646,"[Adeline's Deep Night, MUA, HAPSTER,CHEEZENAAN...",Adeline's Deep Night,"MUA, HAPSTER,CHEEZENAAN, Lee hyeonsung",Romansa Fantasi,"1,5M","360,5K","Di malam yang gelap dan dingin, Adeline dibunu..."
627,648,"[Royal Marriage, HIYA, Kangryeol, Portofino​, ...",Royal Marriage,"HIYA, Kangryeol, Portofino​",Romansa Fantasi,"1,7M","408,8K","Tatiana Cartienne, wanita dengan kekayaan dan ..."


In [40]:
# Add n_author_group: the names in "author_group" are comma-separated,
# so the number of authors is the number of commas plus one.
df_detail['n_author_group'] = (
    df_detail['author_group']
    .str.count(",")
    .add(1)
)
df_detail.tail(5)

Unnamed: 0,Id,subresult_text,title,author_group,genre,views,likes,desc,n_author_group
623,642,"[The Beloved Fake Saint, jn, financier, Junye,...",The Beloved Fake Saint,"jn, financier, Junye",Romansa Fantasi,"2,2M","590,1K",Aku menjadi seorang Saintess palsu dalam sebua...,3
624,643,"[Dealing with the Secretive Male Second-Lead, ...",Dealing with the Secretive Male Second-Lead,"Insulaire, samo, Lee sinrok",Romansa Fantasi,999K,"284,3K",Saat menghadiri pesta tahun baru bersama tunan...,3
625,644,"[One Night Relationship, ZUOAN KAMAN, Romansa,...",One Night Relationship,ZUOAN KAMAN,Romansa,"356,7K","40,9K",Gara-gara dua kejadian yang terjadi dalam satu...,1
626,646,"[Adeline's Deep Night, MUA, HAPSTER,CHEEZENAAN...",Adeline's Deep Night,"MUA, HAPSTER,CHEEZENAAN, Lee hyeonsung",Romansa Fantasi,"1,5M","360,5K","Di malam yang gelap dan dingin, Adeline dibunu...",4
627,648,"[Royal Marriage, HIYA, Kangryeol, Portofino​, ...",Royal Marriage,"HIYA, Kangryeol, Portofino​",Romansa Fantasi,"1,7M","408,8K","Tatiana Cartienne, wanita dengan kekayaan dan ...",3


In [41]:
# Working frame for the cleaning steps: every column except the raw list of scraped <p> texts
df_main = df_detail.drop(labels="subresult_text", axis="columns")
df_main.tail(5)

Unnamed: 0,Id,title,author_group,genre,views,likes,desc,n_author_group
623,642,The Beloved Fake Saint,"jn, financier, Junye",Romansa Fantasi,"2,2M","590,1K",Aku menjadi seorang Saintess palsu dalam sebua...,3
624,643,Dealing with the Secretive Male Second-Lead,"Insulaire, samo, Lee sinrok",Romansa Fantasi,999K,"284,3K",Saat menghadiri pesta tahun baru bersama tunan...,3
625,644,One Night Relationship,ZUOAN KAMAN,Romansa,"356,7K","40,9K",Gara-gara dua kejadian yang terjadi dalam satu...,1
626,646,Adeline's Deep Night,"MUA, HAPSTER,CHEEZENAAN, Lee hyeonsung",Romansa Fantasi,"1,5M","360,5K","Di malam yang gelap dan dingin, Adeline dibunu...",4
627,648,Royal Marriage,"HIYA, Kangryeol, Portofino​",Romansa Fantasi,"1,7M","408,8K","Tatiana Cartienne, wanita dengan kekayaan dan ...",3


In [42]:
# Inspect the column dtypes of the working frame
df_main.dtypes

Id                 int64
title             object
author_group      object
genre             object
views             object
likes             object
desc              object
n_author_group     int64
dtype: object

In [43]:
# Numbers in views and likes (1,7M/999K) are stored as string - to transform
def string_to_num(text):
    """Convert an abbreviated count such as "999K" or "1.7M" to an integer.

    Args:
        text: The count as a string. A value containing "K" is multiplied by
            1,000 and one containing "M" by 1,000,000; any other value must
            be a plain integer literal. Both a decimal point ("1.7M") and a
            decimal comma ("1,7M") are accepted.

    Returns:
        int: The expanded count.

    Raises:
        ValueError: If the numeric part of ``text`` cannot be parsed.
    """
    # Normalise the decimal comma so callers may pass the raw scraped value.
    text = text.strip().replace(",", ".")

    # "K" is checked before "M", matching the original precedence.
    for suffix, multiplier in (("K", 1_000), ("M", 1_000_000)):
        if suffix in text:
            # round(), not int(): the float product can land just below the
            # true integer (1.001 * 1000 == 1000.9999999999999) and int()
            # would truncate it to one count too few.
            return round(float(text.split(suffix)[0]) * multiplier)

    return int(text)

In [44]:
# Convert the abbreviated "views" and "likes" strings (e.g. "1,7M", "999K") to integers.
# The raw strings are read from df_detail, not from df_main, so this cell can be
# re-run safely: once df_main holds integers, `.str` on it would raise.
for count_col in ("views", "likes"):
    df_main[count_col] = (
        df_detail[count_col]
        .str.replace(",", ".", regex=False)  # decimal comma -> decimal point, literal match
        .apply(string_to_num)
    )
df_main.tail()

Unnamed: 0,Id,title,author_group,genre,views,likes,desc,n_author_group
623,642,The Beloved Fake Saint,"jn, financier, Junye",Romansa Fantasi,2200000,590100,Aku menjadi seorang Saintess palsu dalam sebua...,3
624,643,Dealing with the Secretive Male Second-Lead,"Insulaire, samo, Lee sinrok",Romansa Fantasi,999000,284300,Saat menghadiri pesta tahun baru bersama tunan...,3
625,644,One Night Relationship,ZUOAN KAMAN,Romansa,356700,40900,Gara-gara dua kejadian yang terjadi dalam satu...,1
626,646,Adeline's Deep Night,"MUA, HAPSTER,CHEEZENAAN, Lee hyeonsung",Romansa Fantasi,1500000,360500,"Di malam yang gelap dan dingin, Adeline dibunu...",4
627,648,Royal Marriage,"HIYA, Kangryeol, Portofino​",Romansa Fantasi,1700000,408800,"Tatiana Cartienne, wanita dengan kekayaan dan ...",3


In [45]:
# Translate the Indonesian genre labels to English; labels that are not in the
# mapping are left as they are.
df_main["genre"] = df_main["genre"].replace({
    "Romansa Fantasi": "Romance Fantasy",
    "Romansa": "Romance",
    "Aksi": "Action",
})
df_main["genre"].value_counts()

Romance Fantasy    313
Romance            186
Action              71
Drama               58
Name: genre, dtype: int64

In [26]:
# Write the cleaned dataset to an Excel workbook
df_main.to_excel(excel_writer=r"~/Documents/kakao_dataset.xlsx")