<a href="https://colab.research.google.com/github/dchung1209/Webtoon-Data-Analysis/blob/main/Naver_Webtoon_Request.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
import requests

from bs4 import BeautifulSoup
from datetime import datetime
from google.colab import files

# Scrap Data

In [11]:
# Final Update 02/01/2024
class Queries():
  def __init__(self):
    self.target = "https://www.webtoons.com/en/dailySchedule"
    self.r = requests.get(self.target)
    self.raw = BeautifulSoup(self.r.content, 'html.parser')

  def raw_query(self):
    return self.raw

  # Query the titles
  def title_query(self):
    title = [p.text for p in self.raw.find_all('p', {'class': 'subj'})]
    return title

  # Query the authors
  def author_query(self):
    author = [p.text for p in self.raw.find_all('p', {'class': 'author'})]
    return author

  # Query the genres
  def genre_query(self):
    genre = [p.text for p in self.raw.find_all('p', {'class': 'genre'})]
    return genre

  # Query # likes
  def likes_query(self):
    likes = [p.text for p in self.raw.find_all('em', {'class': 'grade_num'})]
    return likes

  # Query
  def days_query(self):
    days = [p.text for p in self.raw.find_all('em', {'class': 'grade_num'})]
    return days

  def urls_query(self):
    urls = [p['href'] for p in self.raw.find_all('a', {'class': 'daily_card_item'})]
    return urls

  def description_query(self, url):
    r = requests.get(url)
    raw = BeautifulSoup(r.content, 'html.parser')
    description = raw.find('meta', {'property': 'og:description'})['content']
    return description

  def dict_query(self):
    dict = {'Title': self.title_query(), 'Author': self.author_query(), 'Genre': self.genre_query(), 'Like': self.likes_query(), 'URL' : self.urls_query()}
    return dict

  def df_query(self):
    df = pd.DataFrame(self.dict_query())
    return df

  def csv_query(self):
    df = self.df_query()
    df.to_csv(f"naver_webtoon_{datetime.today().strftime('%Y%m%d')}.csv")
    files.download(f"naver_webtoon_{datetime.today().strftime('%Y%m%d')}.csv")
    # return df


df = Queries().df_query()

In [12]:
df.head()

Unnamed: 0,Title,Author,Genre,Like,URL
0,Cursed Princess Club,LambCat,Comedy,22.8M,https://www.webtoons.com/en/comedy/cursed-prin...
1,Morgana and Oz,Miyuli,Fantasy,6.8M,https://www.webtoons.com/en/fantasy/morgana-an...
2,I’m the Queen in This Life,Themis / Omin,Fantasy,4.9M,https://www.webtoons.com/en/fantasy/im-the-que...
3,To The Stars and Back,Peglo,Slice of life,9.1M,https://www.webtoons.com/en/slice-of-life/to-t...
4,Ten Ways to Get Dumped by a Tyrant,danmyeong / mapzzil,Fantasy,405175,https://www.webtoons.com/en/fantasy/ten-ways-t...


# Clean Data

In [13]:
# Remove Duplicates

print("Before removing duplicates:",  df.shape)
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)

Before removing duplicates: (1142, 5)
After removing duplicates: (1118, 5)


In [14]:
# Unit Conversion

def convert(s):
  import re

  if "M" in s:
    return 1000000 * float(re.sub("[^0-9.\-]","", s))
  elif "," in s:
    return float(re.sub("[^0-9.\-]","", s))

if isinstance(df['Like'][0], str):
  df.loc['Like'] = df['Like'].apply(lambda s: convert(s))

In [15]:
df.head()

Unnamed: 0,Title,Author,Genre,Like,URL
0,Cursed Princess Club,LambCat,Comedy,22.8M,https://www.webtoons.com/en/comedy/cursed-prin...
1,Morgana and Oz,Miyuli,Fantasy,6.8M,https://www.webtoons.com/en/fantasy/morgana-an...
2,I’m the Queen in This Life,Themis / Omin,Fantasy,4.9M,https://www.webtoons.com/en/fantasy/im-the-que...
3,To The Stars and Back,Peglo,Slice of life,9.1M,https://www.webtoons.com/en/slice-of-life/to-t...
4,Ten Ways to Get Dumped by a Tyrant,danmyeong / mapzzil,Fantasy,405175,https://www.webtoons.com/en/fantasy/ten-ways-t...


# Download

In [16]:
# to_csv
Queries().csv_query()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>