# Lab4 data collection
## By Evgeny Melnikov

The code below extracts Vladimir Mayakovsky's poems from https://www.culture.ru and writes them to a JSON Lines file (one JSON object per poem, one per line) together with each poem's link, title, author, and tags.

In addition, the texts of all the poems are concatenated into a single text file, which will be used for training.

In [221]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

from time import sleep
# import tqdm
from tqdm.notebook import tqdm
import json
import time
import re
import unicodedata
import requests
from bs4 import BeautifulSoup
from html2text import html2text

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

In [222]:
# The function extracts poems from https://www.culture.ru/literature/poems
# and writes them in JSON format to out_filename
# The extracted data is: 
# - poem link
# - poem title 
# - poem author
# - poem tags 
# - poem text
# The number of pages the poems are extracted from is specified by page_num
# The author is specified by author_name 

def _fetch_soup(url, timeout):
    """Download *url* and return its HTML parsed with the lxml parser."""
    # verify=False skips TLS certificate verification, as the original code did.
    # NOTE(review): confirm that disabling certificate checks is still required.
    response = requests.get(url, verify=False, timeout=timeout)
    return BeautifulSoup(response.text, "lxml")


def _clean_text(node, separator=' '):
    """Return the stripped, NFKD-normalized text of *node* ('' if it is None)."""
    if node is None:
        return ''
    # NFKD normalization replaces '\xa0' (non-breaking space) with a plain space.
    return unicodedata.normalize("NFKD", node.getText(separator).strip())


def _extract_poem(soup_poem, poem_link):
    """Build the output record (link, title, author, text, tags) for one poem page."""
    # NOTE(review): the CSS class names below look auto-generated and will
    # presumably change when the site is redeployed — verify before re-running.
    text_div = soup_poem.find("div", {"class": "xiryu"})  # div containing the poem itself
    paragraphs = text_div.find_all("p") if text_div is not None else []
    poem_text = '\n'.join(paragraph.getText('\n') for paragraph in paragraphs)
    poem_text = unicodedata.normalize("NFKD", poem_text).strip()

    return {
        'link': poem_link,
        'title': _clean_text(soup_poem.find("div", {"class": "kNLyi"})),
        'author': _clean_text(soup_poem.find("div", {"class": "_5bAYe"})),
        'text': poem_text,
        'tags': [_clean_text(tag) for tag in soup_poem.find_all("div", {"class": "zUOzO"})],
    }


def parse_cultureru_poems(author_name, page_num, out_filename, progress_bar=True,
                          request_timeout=30):
    """Scrape an author's poems from culture.ru into a JSON Lines file.

    Listing pages 1..page_num of the author are downloaded; every poem linked
    from a listing page is downloaded in turn and written to *out_filename* as
    one JSON object per line with the keys 'link', 'title', 'author', 'text'
    and 'tags'. The output file is truncated first, so it exists (empty) even
    when no poem is found.

    Args:
        author_name: Author name as used in the site URL, e.g.
            "Vladimir Mayakovskii"; spaces become dashes and it is lowercased.
        page_num: Number of listing pages to walk through.
        out_filename: Path of the JSON Lines file to (over)write.
        progress_bar: Show a tqdm progress bar over the listing pages.
        request_timeout: Timeout in seconds passed to every HTTP request, so a
            stalled connection cannot hang the scrape forever.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded.
    """
    domain_name = "https://www.culture.ru"

    # convert the author name to the form used in links
    author_slug = author_name.replace(" ", "-").lower()

    pages = range(1, page_num + 1)
    if progress_bar:
        # Iterating through tqdm advances the bar and closes it when the loop ends.
        pages = tqdm(pages, desc='Page progress', leave=False)

    # The file is opened once; each record is flushed right away so that an
    # interrupted run still leaves every poem scraped so far on disk.
    with open(out_filename, 'w', encoding="utf-8") as file:
        for page in pages:
            previews_link = f"{domain_name}/literature/poems/author-{author_slug}?page={page}"
            soup = _fetch_soup(previews_link, request_timeout)

            # divs containing a preview of a poem
            for poem_preview in soup.find_all("div", {"class": "eWEZx"}):
                link_tag = poem_preview.find("a", {"class": "C0Urp"})
                if link_tag is None or not link_tag.get('href'):
                    # a preview card without a poem link cannot be followed
                    continue

                poem_link = domain_name + link_tag['href']  # absolute link
                soup_poem = _fetch_soup(poem_link, request_timeout)
                record = _extract_poem(soup_poem, poem_link)

                file.write(json.dumps(record, ensure_ascii=False) + '\n')
                file.flush()

In [223]:
# Scrape listing pages 1-29 of Mayakovsky's poems into "Poems2.json"
# (one JSON object per poem, one per line).
parse_cultureru_poems(author_name="Vladimir Mayakovskii", page_num=29, out_filename="Poems2.json")

Page progress:   0%|          | 0/29 [00:00<?, ?it/s]

In [224]:
def concatenate(json_filename, out_filename):
    """Write the texts of all poems from a JSON Lines file into one text file.

    Each non-blank line of *json_filename* is parsed as a JSON object and its
    'text' value is written to *out_filename* followed by a blank line. Poems
    are written back to back: no extra separator is inserted, so no poem
    starts with a stray space. An empty input file yields an empty output file.

    Args:
        json_filename: Path of the JSON Lines file (one poem object per line).
        out_filename: Path of the text file to (over)write.

    Raises:
        KeyError: If a record has no 'text' key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(json_filename, encoding="utf-8") as source, \
            open(out_filename, 'w', encoding="utf-8") as target:
        for line in source:
            if not line.strip():
                continue  # tolerate blank lines between records
            target.write(json.loads(line)['text'] + '\n\n')

In [225]:
# Merge the texts of all scraped poems into one plain-text file for training.
# NOTE(review): the scraping call earlier in this notebook writes "Poems2.json",
# but this call reads "Poems.json" — confirm which input file is intended.
concatenate("Poems.json", "all_poems2.txt")