# 🚀 Imports and Configurations

## Import libraries

In [1]:
from bs4 import BeautifulSoup as soup  # HTML data structure
from urllib.request import urlopen as uReq  # Web client
import pandas as pd
import re
from tqdm import tqdm

## Configurations

In [2]:
# Scraper-wide settings, looked up by key in the cells below.
configs = {
    # Site root; relative links found on the pages are appended to it.
    "HOME_URL": "http://shereno.com/",
    # File name configured for the exported dataset.
    "FILE_NAME": "poetry_dataset.csv",
}

# 🍲 Preparing the Soup

In [3]:
def url_request(url):
    """Download *url* and return its body parsed with BeautifulSoup.

    Args:
        url: Address to fetch; passed unchanged to ``urlopen``.

    Returns:
        The response body parsed with the built-in ``"html.parser"`` backend.

    Raises:
        Whatever ``urlopen`` / ``read`` raise on network failure; errors are
        not swallowed here.
    """
    # The context manager closes the connection even when read() or parsing
    # raises, which the previous explicit close() call did not guarantee.
    with uReq(url) as uClient:
        return soup(uClient.read(), "html.parser")

In [19]:
def main_page(url):
    """Crawl the home page and collect the poems of every listed poet.

    Args:
        url: Address of the page that lists the poets.

    Returns:
        A DataFrame with the columns ``Poem``, ``Poet``, ``Title`` and
        ``Book``, extended by ``poet_page`` once per poet link.
    """
    df = pd.DataFrame(columns=['Poem','Poet','Title','Book'])
    home_soup = url_request(url)
    # The poet links are taken from the second <ul class="side_list3 over">.
    side_lists = home_soup.findAll("ul", {"class": "side_list3 over"})
    poet_list = side_lists[1]
    for anchor in poet_list.findAll("a"):
        poet = anchor.text
        # First run of digits in the href (leading character dropped); it is
        # handed to poet_page as its ``url`` argument.
        href_tail = anchor['href'][1:]
        poet_ref = re.findall(r'\d+', href_tail)[0]
        print(f'Poet:{poet}')
        df = poet_page(poet_ref, poet, df)
    return df

In [25]:
def poet_page(url, poet, df):
    """Visit a poet's page and scrape every book linked from it.

    Args:
        url: Path fragment appended to ``configs['HOME_URL']``.
        poet: Poet name recorded on every scraped row.
        df: DataFrame accumulated so far.

    Returns:
        The DataFrame returned by the last ``poetry_notebook`` call, or *df*
        unchanged when the first table contains no links.
    """
    poet_soup = url_request(configs['HOME_URL'] + url)
    # Book links are read from the first <table> on the page.
    first_table = poet_soup.findAll("table")[0]
    book_links = first_table.findAll("a")
    total = len(book_links)
    for position, book_link in enumerate(book_links, start=1):
        book = book_link.text
        print(f'Book:{book} ({position} of {total})')
        # Drop the first two characters of the href before joining it to the
        # site root inside poetry_notebook.
        book_url = book_link['href'][2:]
        df = poetry_notebook(book_url, poet, book, df)
    return df

In [26]:
def poetry_notebook(url, poet, book, df):
    """Scrape every poem linked from a book page and add the rows to *df*.

    Args:
        url: Path fragment appended to ``configs['HOME_URL']``.
        poet: Poet name recorded on every row.
        book: Book name recorded on every row.
        df: DataFrame accumulated so far.

    Returns:
        A new DataFrame holding the rows of *df* followed by one row per poem
        link, re-indexed from 0; *df* itself when the page has no poem links.
    """
    page_soup = url_request(configs['HOME_URL'] + url)
    # Poem links are read from the first <table> on the page.
    containers = page_soup.findAll("table")[0]

    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call; collect the rows first and concatenate once instead.
    rows = []
    for container in tqdm(containers.findAll("a")):
        title = container.text
        poem_url = container['href'][2:]
        rows.append(poetry_page(poem_url, poet, book, title))

    if not rows:
        return df
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [27]:
def poetry_page(url, poet, book, title):
    """Fetch one poem page and build its dataset row.

    Args:
        url: Path fragment appended to ``configs['HOME_URL']``.
        poet: Poet name stored under ``'Poet'``.
        book: Book name stored under ``'Book'``.
        title: Poem title stored under ``'Title'``.

    Returns:
        A dict with the keys ``'Poet'``, ``'Poem'``, ``'Title'`` and
        ``'Book'``.
    """
    poem_soup = url_request(configs['HOME_URL'] + url)
    # The poem text is the first <span> inside the first <div id="areap">.
    poem_div = poem_soup.findAll("div", {"id": "areap"})[0]
    poem_text = poem_div.span.text
    return {'Poet': poet, 'Poem': poem_text, 'Title': title, 'Book': book}

# 🥣 Serving the Soup

In [28]:
# Run the full crawl from the configured home page; main_page fetches pages
# over the network and prints progress for each poet and book.
data = main_page(configs['HOME_URL'])

Poet:نیما یوشیج
Book:مجموعه اشعار (1 of 1)


100%|██████████| 38/38 [00:23<00:00,  1.59it/s]


# 💾 Export Data

In [33]:
# Write the scraped rows to a UTF-8 CSV file, leaving out the index column.
# NOTE(review): configs['FILE_NAME'] is defined but not used here; the output
# name is hard-coded. Confirm which file name is intended.
data.to_csv('shereno.csv', index=False, encoding='utf-8')