# <b> Data collection </b>

## <b> Import libraries </b>

In [5]:
import datetime
import re
from pathlib import Path

import nest_asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

In [6]:
# nest_asyncio.apply()
session = HTMLSession()

# Ranking-page crawl settings. The loop steps the `limit` offset by PAGE_SIZE
# from 0 up to and including LAST_OFFSET (100 ranking pages in total).
TOP_MANGA_URL = 'https://myanimelist.net/topmanga.php'
PAGE_SIZE = 50
LAST_OFFSET = 4950
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

listUrl = []
for i in range(0, LAST_OFFSET + 1, PAGE_SIZE):
    url = f'{TOP_MANGA_URL}?limit={i}'
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Stop on HTTP errors: an error page contains no title cells, so it would
    # otherwise be skipped silently and the URL list would come up short.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    listItem = soup.find_all("td", {"class": "title al va-t clearfix word-break"})
    for item in listItem:
        # The first link inside each title cell is taken as the manga's detail page.
        listUrl.append(item.find('a').get('href'))
    print(f'Collecting {len(listUrl)} manga urls...', end='\r', flush=True)


Collecting 5000 manga urls...

In [7]:
# Timestamp of this collection run, printed so it can be quoted in the project report.
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("Time of data collection: ", now)

# Download the raw HTML of every collected manga page, in the same order as listUrl.
listHtml = []
for url in listUrl:
    listHtml.append(session.get(url).text)
    # '\r' rewrites the same output line so the counter updates in place.
    print(f'Collecting {len(listHtml)}/{len(listUrl)} manga html...', end='\r', flush=True)
print(len(listHtml))

Time of data collection:  2023-11-24 17:45:26
5000ecting 5000/5000 manga html...


In [8]:
def _node_text(node):
    """Return the text of a BeautifulSoup node, or '' when the node is missing."""
    return node.text if node is not None else ''


def _first_number(node):
    """Return the first run of digits in a node's text as a string, or '' if none."""
    digits = re.findall(r'\d+', _node_text(node))
    return digits[0] if digits else ''


def extract_info(htmlComic):
    """Parse the HTML of one manga page into a flat record.

    Parameters
    ----------
    htmlComic : str
        Raw HTML of a manga detail page.

    Returns
    -------
    dict or None
        ``None`` when the page has no ``<span itemprop="name">`` element
        (e.g. the download did not yield a manga page). Otherwise a dict with
        the keys "Title", "Score", "Vote", "Ranked", "Popularity", "Members",
        "Favorite", "Volumes", "Chapters", "Status", "Published", "Genres",
        "Themes", "Author", "Total Review" and "Type Review".

        Values are the strings found on the page, except "Genres"/"Themes"
        (lists of str) and "Type Review" (three ints: recommended,
        mixed-feelings, not-recommended). A field that cannot be found is
        reported as ``''`` (``[]`` for the lists, ``0`` inside "Type Review")
        instead of raising, so one incomplete page does not abort a batch.
    """
    soup = BeautifulSoup(htmlComic, "html.parser")

    title = soup.find('span', {'itemprop': 'name'})
    if title is None:
        return None
    title = title.text

    ratingValue = _node_text(soup.find('span', {'itemprop': 'ratingValue'}))
    ratingCount = _node_text(soup.find('span', {'itemprop': 'ratingCount'}))
    ranked = _first_number(soup.find('span', {'class': 'numbers ranked'}))
    popularity = _first_number(soup.find('span', {'class': 'numbers popularity'}))

    # Sidebar rows are read as "<label>: <value>". Matching the label exactly
    # (rather than searching the whole row text for a keyword) keeps a row
    # whose *value* happens to contain e.g. "Status" from being misfiled.
    textFields = {
        'Volumes': '', 'Chapters': '', 'Status': '', 'Published': '',
        'Authors': '', 'Favorites': '', 'Members': '',
    }
    tagFields = {'Genres': [], 'Themes': []}
    # NOTE(review): the site appears to use the singular label when a manga
    # has a single genre/theme - confirm against a live page.
    singularLabels = {'Genre': 'Genres', 'Theme': 'Themes'}

    for space in soup.find_all("div", {'class': 'spaceit_pad'}):
        # partition() never raises on a row without ':' and keeps any further
        # colons inside the value.
        label, colon, value = space.text.partition(':')
        if not colon:
            continue
        label = label.strip()
        label = singularLabels.get(label, label)
        if label in textFields:
            textFields[label] = value.strip()
        elif label in tagFields:
            tagFields[label] = [
                tag.text for tag in space.find_all('span', {'itemprop': 'genre'})
            ]

    infoReviews = soup.find('div', {'class': 'manga-info-review__header mal-navbar'})

    def reviewCount(cssClass):
        # First number shown in the given part of the review header, or ''.
        if infoReviews is None:
            return ''
        return _first_number(infoReviews.find('div', {'class': cssClass}))

    totalReviews = reviewCount('right')
    typeReview = [
        int(reviewCount(cssClass) or 0)
        for cssClass in ('recommended', 'mixed-feelings', 'not-recommended')
    ]

    return {
        "Title": title, "Score": ratingValue, "Vote": ratingCount,
        "Ranked": ranked, "Popularity": popularity, "Members": textFields['Members'],
        "Favorite": textFields['Favorites'], "Volumes": textFields['Volumes'],
        "Chapters": textFields['Chapters'], "Status": textFields['Status'],
        "Published": textFields['Published'], "Genres": tagFields['Genres'],
        "Themes": tagFields['Themes'], "Author": textFields['Authors'],
        "Total Review": totalReviews, "Type Review": typeReview
    }


# Parse each page exactly once (the filter and the value previously each
# called extract_info, doubling the parsing work). Pages for which
# extract_info returns None are dropped.
parsed_pages = (extract_info(htmlComic) for htmlComic in listHtml)
data_list = [info for info in parsed_pages if info is not None]
df = pd.DataFrame(data_list)

In [9]:
# Preview the first five parsed records as a sanity check.
df.head(n=5)

Unnamed: 0,Title,Score,Vote,Ranked,Popularity,Members,Favorite,Volumes,Chapters,Status,Published,Genres,Themes,Author,Total Review,Type Review
0,Berserk,9.47,330372,1,1,663681,122589,Unknown,Unknown,Publishing,"Aug 25, 1989 to ?","[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]","Miura, Kentarou (Story & Art), Studio Gaga (Art)",259,"[234, 15, 10]"
1,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.3,156011,2,26,255574,42791,24,96,Finished,"Jan 19, 2004 to Apr 19, 2011","[Action, Adventure, Mystery, Supernatural]",[],"Araki, Hirohiko (Story & Art)",128,"[120, 7, 1]"
2,Vagabond,9.24,135878,3,15,363767,40004,37,327,On Hiatus,"Sep 3, 1998 to May 21, 2015","[Action, Adventure, Award Winning]","[Historical, Samurai]","Inoue, Takehiko (Story & Art), Yoshikawa, Eiji...",95,"[87, 7, 1]"
3,One Piece,9.22,365947,4,3,598004,114319,Unknown,Unknown,Publishing,"Jul 22, 1997 to ?","[Action, Adventure, Fantasy]",[],"Oda, Eiichiro (Story & Art)",206,"[173, 17, 16]"
4,Monster,9.15,93668,5,29,235740,20468,18,162,Finished,"Dec 5, 1994 to Dec 20, 2001","[Award Winning, Drama, Mystery]","[Adult Cast, Psychological]","Urasawa, Naoki (Story & Art)",76,"[64, 7, 5]"


In [10]:
# Persist the scraped table. The target folder is created first so that a
# long scrape is not lost to a missing-directory error at the final step.
output_path = Path('./data/comic.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)