# <p style="text-align: center;"> <b> Data Collecting </b></p>
---

In [1]:
import requests
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import re
import nest_asyncio
import pandas as pd 
import datetime
import time

In [2]:
nest_asyncio.apply() 
session = HTMLSession()

### 👉 Crawl the first 5000 urls

In [3]:
listUrl1 = []

for i in range(0, 5000, 50):
    # Url of the website to scrap
    url = f'https://myanimelist.net/topmanga.php?limit={i}'

    # Get the html content
    html = requests.get(url).text

    # Parse the html content
    soup = BeautifulSoup(html, "html.parser")

    # Get the list of manga
    listItem = soup.find_all("td", {"class": "title al va-t clearfix word-break"})

    # Get the url of each manga
    for item in listItem:
        listUrl1.append(item.find('a').get('href'))

    # Print the number of manga urls collected
    print(f'{len(listUrl1)} urls collected', end='\r', flush=True)


5000 urls collected

### 👉 Crawl the remaining 5000 urls

- Similar to data collection above

In [7]:
listUrl2 = []

for i in range(5000,10000,50):
    # Url of the website to scrap
    url = f'https://myanimelist.net/topmanga.php?limit={i}'

    # Get the html content
    html = requests.get(url).text

    # Parse the html content
    soup = BeautifulSoup(html, "html.parser")

    # Get the list of manga
    listItem = soup.find_all("td", {"class": "title al va-t clearfix word-break"})

    # Get the url of each manga
    for item in listItem:
        listUrl2.append(item.find('a').get('href'))

    # Print the number of manga urls collected
    print(f'{len(listUrl2)} urls collected', end='\r', flush=True)
    
listUrl = listUrl1 + listUrl2
print(f'Total: {len(listUrl)} urls collected')
with open('listUrl_manga.txt', 'w') as f:
    for url in listUrl:
        f.write("%s\n" % url)

Total: 10000 urls collected


### 👉 Concatenate 2 list urls

In [6]:
listUrl1 = []

for i in range(0, 5000, 50):
    url = f'https://myanimelist.net/topanime.php?limit={i}'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")

    listItem = soup.find_all("td", {"class": "title al va-t word-break"})

    for item in listItem:
        listUrl1.append(item.find('a').get('href'))

    # Print the number of manga urls collected
    print(f'{len(listUrl1)} urls collected', end='\r', flush=True)


5000 urls collected

In [10]:
listUrl2 = []

for i in range(5000,10000,50):
    # Url of the website to scrap
    url = f'https://myanimelist.net/topanime.php?limit={i}'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")

    listItem = soup.find_all("td", {"class": "title al va-t word-break"})

    for item in listItem:
        listUrl2.append(item.find('a').get('href'))

    print(f'{len(listUrl2)} urls collected', end='\r', flush=True)
    
listUrl = listUrl1 + listUrl2
print(f'Total: {len(listUrl)} urls collected')
with open('listUrl_anime.txt', 'w') as f:
    for url in listUrl:
        f.write("%s\n" % url)

Total: 10000 urls collected


### 👉 Crawl HTML content from the first 5000 manga URLs

In [13]:
manga_url1 = []
with open('listUrl_manga.txt', 'r') as f:
    listUrl = f.read().splitlines()

for url in listUrl[:5000]:
    res = session.get(url)
    while len(res.text) < 4000:
        # Sleep for 10 minutes
        time.sleep(600)
        res = session.get(url)
        
    manga_url1.append(res.text)

    # Print the number of manga html collected
    print(f'{len(manga_url1)}/{len(listUrl)} manga html collected', end='\r', flush=True)

5000/10000 manga html collected

### 👉 Crawl HTML content from the remaining 5000 manga URLs

In [None]:
with open('manga_html1.txt', 'w') as f:
    for manga in manga_url1:
        f.write("%s\n" % manga)


In [16]:
manga_url2 = []

for url in listUrl[5000:]:
    res = session.get(url)
    while len(res.text) < 4000:
        # Sleep for 10 minutes
        time.sleep(600)
        res = session.get(url)
        
    manga_url2.append(res.text)

    # Print the number of manga html collected
    print(f'{len(manga_url2)+5000}/{len(listUrl)} manga html collected', end='\r', flush=True)

10000/10000 manga html collected

In [17]:
with open('manga_html2.txt', 'w') as f:
    for manga in manga_url2:
        f.write("%s\n" % manga)

In [11]:
anime_url1 = []
with open('listUrl_anime.txt', 'r') as f:
    listUrl = f.read().splitlines()

for url in listUrl[:5000]:
    res = session.get(url)
    while len(res.text) < 4000:
        # Sleep for 10 minutes
        time.sleep(600)
        res = session.get(url)
        
    anime_url1.append(res.text)

    # Print the number of manga html collected
    print(f'{len(anime_url1)}/{len(listUrl)} anime html collected', end='\r', flush=True)
    
with open('anime_html1.txt', 'w') as f:
    for anime in anime_url1:
        f.write("%s\n" % anime)
        f.write('========\n')


5000/10000 anime html collected

In [5]:
with open('listUrl_anime.txt', 'r') as f:
    listUrl = f.read().splitlines()

In [12]:
anime_url2 = []

for url in listUrl[:5000]:
    res = session.get(url)
    while len(res.text) < 4000:
        # Sleep for 10 minutes
        time.sleep(1000)
        res = session.get(url)
        
    anime_url2.append(res.text)

    # Print the number of manga html collected
    print(f'{len(anime_url2)+5000}/{len(listUrl)} anime html collected', end='\r', flush=True)
    
with open('anime_html2.txt', 'w') as f:
    for anime in anime_url2:
        f.write("%s\n" % anime)
        f.write("========\n")

10000/10000 anime html collected

In [11]:
# Extract time of data collection to report for the project
now = datetime.datetime.now()
now = now.strftime("%Y-%m-%d")
print("Time of data collection: ", now)

Time of data collection:  2023-12-05


### 👉 Concatenate 2 list htmls

In [20]:
manga_url = manga_url1 + manga_url2
print(f'Total: {len(manga_url)} manga html collected')


Total: 10000 manga html collected


In [13]:
anime_url = anime_url1 + anime_url2
print(f'Total: {len(anime_url)} anime html collected')

Total: 10000 anime html collected


### 👉 Extracting the detailed values of each comic website page

In [32]:
def extract_manga(htmlComic):
    soup = BeautifulSoup(htmlComic, "html.parser")

    title = soup.find('span', {'itemprop': 'name'})
    if title is None:
        return None
    else:
        title_text = title.text.strip()
        title_english_span = title.find('span', {'class': 'title-english'})

        if title_english_span is not None:
            title_english_text = title_english_span.text.strip()
            title_text = title_text.replace(title_english_text, '')
            title = f'{title_text} ({title_english_text})'
        else:
            title = title_text
    ratingValue = soup.find('span', {'itemprop': 'ratingValue'}).text
    ratingCount = soup.find('span', {'itemprop': 'ratingCount'}).text
    ranked = re.findall(r'\d+', soup.find('span', {'class': 'numbers ranked'}).text)[0]
    popularity = re.findall(r'\d+', soup.find('span', {'class': 'numbers popularity'}).text)[0]

    volumes, chapters, status, published = '', '', '', ''
    genres, themes,demographics, serialization,authors, favorites, members = [], [], [],'', '', '',''
    
    for space in soup.find_all("div", {'class': 'spaceit_pad'}):
        text = space.text
        if 'Volumes' in text:
            volumes = text.split(':')[1].strip()
        elif 'Chapters' in text:
            chapters = text.split(':')[1].strip()
        elif 'Status' in text:
            status = text.split(':')[1].strip()
        elif 'Published' in text:
            published = text.split(':')[1].strip()
        elif 'Genres' in text:
            genres = [gen.text for gen in space.find_all('span', {'itemprop': 'genre'})]
        elif 'Themes' in text:
            themes = [theme.text for theme in space.find_all('span', {'itemprop': 'genre'})]
        elif 'Demographic' in text:
            demographics = [demographic.text for demographic in space.find_all('span', {'itemprop': 'genre'})]
        elif 'Serialization' in text:
            serialization = text.split(':')[1].strip()
        elif 'Authors' in text:
            authors = text.split(':')[1].strip()
        elif 'Favorites' in text:
            favorites = text.split(':')[1].strip()
        elif 'Members' in text:
            members = text.split(':')[1].strip()
        


    return {
        "Title": title, "Score": ratingValue, "Vote": ratingCount,
        "Ranked": ranked, "Popularity": popularity, "Members": members,
        "Favorite": favorites, "Volumes": volumes, "Chapters": chapters,
        "Status": status, "Published": published, "Genres": genres,
        "Themes": themes, 'Demographics': demographics, 'Serialization': serialization,
        "Author": authors, 
    }

In [14]:
def extract_anime(htmlComic):
    soup = BeautifulSoup(htmlComic, "html.parser")

    title = soup.find('div', {'itemprop': 'name'})
    if title is None:
        return None
    else:
        title_text = title.text.strip()
        title_english_span = title.find('span', {'class': 'title-english'})

        if title_english_span is not None:
            title_english_text = title_english_span.text.strip()
            title_text = title_text.replace(title_english_text, '')
            title = f'{title_text} ({title_english_text})'
        else:
            title = title_text
    ratingValue = soup.find('span', {'itemprop': 'ratingValue'}).text
    ratingCount = soup.find('span', {'itemprop': 'ratingCount'}).text
    ranked = re.findall(r'\d+', soup.find('span', {'class': 'numbers ranked'}).text)[0]
    popularity = re.findall(r'\d+', soup.find('span', {'class': 'numbers popularity'}).text)[0]

    episodes, status, aired, premiered, producers, licensors, studios, source, duration, rating = '', '', '', '', '', '', '', '', '', ''

    for space in soup.find_all("div", {'class': 'spaceit_pad'}):
        text = space.text
        if 'Episodes' in text:
            episodes = text.split(':')[1].strip()
        elif 'Status' in text:
            status = text.split(':')[1].strip()
        elif 'Aired' in text:
            aired = text.split(':')[1].strip()
        elif 'Premiered' in text:
            premiered = text.split(':')[1].strip()
        elif 'Producers' in text:
            producers = [producer.text for producer in space.find_all('a')]
        elif 'Licensors' in text:
            licensors = text.split(':')[1].strip()
        elif 'Studios' in text:
            studios = text.split(':')[1].strip()
        elif 'Source' in text:
            source = text.split(':')[1].strip()
        elif 'Genres' in text:
            genres = [gen.text for gen in space.find_all('span', {'itemprop': 'genre'})]
        elif 'Demographic' in text:
            demographics = [demographic.text for demographic in space.find_all('span', {'itemprop': 'genre'})]
        elif 'Duration' in text:
            duration = text.split(':')[1].strip()
        elif 'Rating' in text:
            rating = text.split(':')[1].strip()



    return {
        "Title": title, "Score": ratingValue, "Vote": ratingCount,
        "Ranked": ranked, "Popularity": popularity, "Episodes": episodes,
        "Status": status, "Aired": aired, "Premiered": premiered,
        "Producers": producers, "Licensors": licensors, "Studios": studios,
        "Source": source, "Duration": duration, "Rating": rating
    }

In [33]:
manga_data = [extract_manga(html) for html in manga_url]
manga_df = pd.DataFrame(manga_data)

manga_df.to_csv('./data/manga.csv', index=False)

In [34]:
manga_df.sample(10)

Unnamed: 0,Title,Score,Vote,Ranked,Popularity,Members,Favorite,Volumes,Chapters,Status,Published,Genres,Themes,Demographics,Serialization,Author
848,Shinohayu: the Dawn of Age,7.95,360,849,10835,1508,36,Unknown,Unknown,Publishing,"Jul 25, 2013 to ?",[],[],[Shounen],Big Gangan,"Igarashi, Aguri (Art), Kobayashi, Ritz (Story)"
9199,Hatsukoi Kids Sitter (First Love Kids Sitter),6.96,101,9225,27954,299,0,1,7,Finished,"Jul 12, 2022 to Dec 13, 2022",[],[],[],B-BOY P!,"Kuroda, Kurota (Story & Art)"
1125,Saihate ni Madou,7.85,1337,1142,2662,7464,82,Unknown,Unknown,Publishing,"Jan 6, 2023 to ?",[],[],[Josei],Flat Hero's,"Momoyama, Hato (Story & Art)"
2594,Risouteki Boyfriend (The World Best Boyfriend),7.54,1801,2613,3945,5027,49,7,29,Finished,"Mar 31, 2016 to Jul 13, 2018",[],[],[Shoujo],Bessatsu Margaret,"Ayase, Umi (Story & Art)"
5259,Kurogane,7.25,1686,5313,6280,2987,5,Unknown,1,Finished,"Dec 20, 2010","[Action, Supernatural]","[Martial Arts, School]",[Shounen],Shounen Jump (Weekly),"Ikezawa, Haruto (Story & Art)"
2153,Youchien Wars (Kindergarten Wars),7.61,2005,2176,2469,7965,73,Unknown,Unknown,Publishing,"Sep 15, 2022 to ?","[Action, Comedy]",[],[Shounen],Shounen Jump+,"Chiba, Yuu (Story & Art)"
7501,Arakure Ojousama wa Monmon Shiteiru,7.08,3387,7527,1606,11997,66,Unknown,Unknown,Publishing,"Nov 6, 2018 to ?","[Comedy, Ecchi]","[Romantic Subtext, School]",[Seinen],Young Magazine the 3rd,"Kinoshita, Yuuichi (Story & Art)"
4479,Nadeshiko Club,7.32,885,4479,7994,2241,4,7,35,Finished,2000 to 2003,"[Comedy, Romance]","[Reverse Harem, School]",[Shoujo],Hana to Yume,"Sakamoto, Miku (Story & Art)"
3318,Genjitsu Shugi Yuusha no Oukoku Saikenki (How ...,7.45,6043,3251,897,19604,147,Unknown,Unknown,Publishing,"Jul 10, 2017 to ?","[Action, Fantasy]","[Isekai, Military]",[],Comic Gardo,"Ueda, Satoshi (Art), Dozeumaru (Story)"
3908,Zoku - Kindan no Koi wo Shiyou,7.38,1623,3849,6475,2895,16,1,4,Finished,2001,"[Fantasy, Romance]",[],[Josei],Petit Comic,"Ohmi, Tomu (Story & Art)"


In [15]:
anime_data = [extract_anime(html) for html in anime_url]
anime_df = pd.DataFrame(anime_data)
anime_df.head()
anime_df.to_csv('./data/anime.csv', index=False)

In [17]:
anime_df.head()

Unnamed: 0,Title,Score,Vote,Ranked,Popularity,Episodes,Status,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating
0,Sousou no FrierenFrieren: Beyond Journey's End,9.14,128768,1,508,28,Currently Airing,"Sep 29, 2023 to Mar 2024",Fall 2023,"[Aniplex, Dentsu, Shogakukan-Shueisha Producti...","None found, add some",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older
1,Fullmetal Alchemist: Brotherhood,9.09,2080863,2,3,64,Finished Airing,"Apr 5, 2009 to Jul 4, 2010",Spring 2009,"[Aniplex, Square Enix, Mainichi Broadcasting S...","Funimation, Aniplex of America",Bones,Manga,24 min. per ep.,R - 17+ (violence & profanity)
2,Steins;Gate,9.07,1375512,3,13,24,Finished Airing,"Apr 6, 2011 to Sep 14, 2011",Spring 2011,"[Frontier Works, Media Factory, Kadokawa Shote...",Funimation,White Fox,Visual novel,24 min. per ep.,PG-13 - Teens 13 or older
3,Gintama°Gintama Season 4,9.06,246431,4,337,51,Finished Airing,"Apr 8, 2015 to Mar 30, 2016",Spring 2015,"[TV Tokyo, Aniplex, Dentsu]","Funimation, Crunchyroll",Bandai Namco Pictures,Manga,24 min. per ep.,PG-13 - Teens 13 or older
4,Shingeki no Kyojin Season 3 Part 2Attack on Ti...,9.05,1545108,5,21,10,Finished Airing,"Apr 29, 2019 to Jul 1, 2019",Spring 2019,"[Production I.G, Dentsu, Mainichi Broadcasting...",Funimation,Wit Studio,Manga,23 min. per ep.,R - 17+ (violence & profanity)
