In [1]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup as bs
import csv

In [2]:
MAIN_URL = "https://www.strana.news"

def get_page(nr):
    """Scrape one page of the news feed and return its article links.

    Args:
        nr: Page number substituted into ``{MAIN_URL}/news/page-{nr}.html``.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts, one per feed entry, in
        page order. ``url`` is the raw ``href`` value of the title link and
        ``title`` is the link text with surrounding whitespace removed.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    html = requests.get(f'{MAIN_URL}/news/page-{nr}.html', timeout=30).text
    soup = bs(html, 'html.parser')
    page_articles = []
    for entry in soup.select('.lenta-news.clearfix'):
        # Look the title link up once; select_one returns None when absent.
        link = entry.select_one('.title .article')
        if link is None or link.get('href') is None:
            # Skip feed entries without a usable title link instead of
            # aborting the whole page with an IndexError/KeyError.
            continue
        page_articles.append({'url': link['href'], 'title': link.getText().strip()})
    return page_articles

def get_article(url : str) -> dict :
    """Download one article page and extract its fields.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys ``title``, ``url``, ``image``, ``caption``,
        ``content`` and ``datetime``. ``image`` and ``caption`` are empty
        strings when the page has no such element. Returns ``None`` when the
        page cannot be downloaded or lacks one of the required elements
        (article container, title, date, article text); callers treat ``None``
        as "could not parse this page".
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network failures are reported the same way as unparsable pages.
        return None
    soup = bs(response.text, 'html.parser')
    # raw_meta = soup.select("script[type='application/ld+json']")[0].get_text().replace("\n", "")
    try:
        article_main = soup.select('.articles')[0]
        title = article_main.select('.article-title.article-edit .article')[0].getText()
        published = article_main.select('.date span.strana-adate')[0]['data-time']
        article_text = article_main.select('.article-text')[0]
    except (IndexError, KeyError):
        # A required element or attribute is missing: the page does not follow
        # the layout this parser expects.
        return None

    # Optional fields fall back to an empty string.
    image = article_text.select_one('.article-image img')
    image_url = image.get('src', "") if image is not None else ""
    caption_node = article_text.select_one('.caption')
    caption = caption_node.get_text().strip() if caption_node is not None else ""

    # Every paragraph is followed by a single space (including the last one).
    paragraphs = article_text.select('#article-body p')
    content = ''.join(p.get_text().strip() + " " for p in paragraphs)
    return {'title': title, 'url': url, 'image': image_url, 'caption': caption, 'content': content, 'datetime': published}

In [3]:
# Shortened (errored) article

def get_shortened_article(url):
    """Download and parse an article page that uses the alternative layout.

    Unlike ``get_article`` this looks for the ``.article`` container and
    builds ``datetime`` from two parts: the ``data-time`` attribute of the
    date span and the text before the first comma of the ``.date`` element.
    When that text starts with a newline, "11:00" is used as the time.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A dict with the keys ``title``, ``url``, ``image``, ``caption``,
        ``content`` and ``datetime``. ``image`` and ``caption`` are empty
        strings when the page has no such element.

    Raises:
        requests.RequestException: if the request fails or times out.
        IndexError, KeyError: if a required element or attribute is missing.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    html = requests.get(url, timeout=30).text
    soup = bs(html, 'html.parser')
    article_main = soup.select('.article')[0]
    title = article_main.select('.article-title.article-edit')[0].getText().strip()

    art_date = article_main.select('.date span.strana-adate')[0]['data-time']
    art_time = article_main.select('.date')[0].getText().split(',')[0]
    if art_time.startswith('\n'):
        # No usable time text in front of the comma: fall back to a fixed time.
        art_time = "11:00"
    published = art_date + " " +  art_time + ':00'
    article_text = article_main.select('.article-text')[0]

    # Optional fields fall back to an empty string.
    image = article_text.select_one('.article-image img')
    image_url = image.get('src', "") if image is not None else ""
    caption_node = article_text.select_one('.caption')
    caption = caption_node.get_text().strip() if caption_node is not None else ""

    # Every paragraph is followed by a single space (including the last one).
    paragraphs = article_text.select('#article-body p')
    content = ''.join(p.get_text().strip() + " " for p in paragraphs)
    return {'title': title, 'url': url, 'image': image_url, 'caption': caption, 'content': content, 'datetime': published}

# get_shortened_article('https://strana.news/news/145928-zlata-ohnevich-na-chm-2018-budet-bolet-za-ispaniju-a-irina-jusupova-za-ehipet.html')
# get_shortened_article('https://strana.news/news/453886-itohi-671-dnja-vojny-v-ukraine.html')

In [3]:
# OPTIONAL
# Collects article links and titles from feed pages PAGE_FROM..1 and appends
# them to pages.csv.
# NOTE(review): the original note mentions pages2.csv as a "protection" file
# name - presumably to avoid appending to an existing pages.csv; confirm.

PAGE_FROM = 113

# newline='' is what the csv module expects; utf-8 is set explicitly because
# pandas reads the file back as utf-8 by default.
with open('pages.csv', 'a', encoding='utf-8', newline='') as f:
    page_writer = csv.writer(f, lineterminator="\n")
    # Append mode: write the header only into a new/empty file. A repeated
    # header would otherwise end up as a data row whose url is "url".
    if f.tell() == 0:
        page_writer.writerow(["url", "title"])
    for pagenr in range(PAGE_FROM, 0, -1):
        for a in get_page(pagenr):
            # Commas are still stripped from titles so new rows compare equal
            # to rows collected by earlier runs; csv.writer handles quoting.
            page_writer.writerow([f"{MAIN_URL}{a['url']}", a['title'].replace(",", "")])
        if pagenr % 100 == 0:
            print("Page " + str(pagenr) + " written")
        time.sleep(0.2)  # be polite to the server

Page 100 written


In [4]:
# OPTIONAL
# Drop fully duplicated rows from pages.csv. The file is rewritten only when
# at least one row was removed.

ds = pd.read_csv('./pages.csv')
len1 = len(ds)
ds = ds.drop_duplicates()
len2 = len(ds)
removed = len1 - len2
print (f'Difference is {removed} articles')
if removed > 0:
    ds.to_csv('./pages.csv', index=False)
    print ('File updated')

Difference is 0 articles


In [7]:
# Fetches at most ITERS article urls listed in pages.csv and appends the
# parsed articles to FILE. The CSV header is written only when FILE is
# new/empty, so the cell can be re-run against an existing file.
#
# Urls that cannot be fetched or parsed are appended to ERRORS_FILE (with a
# url,title header when that file is new) so they can be re-processed later.
# To process the error pages instead, swap the commented alternatives below
# (links, ERRORS_FILE, and the getter used in the loop).

ITERS = 10000

FILE = './strana_articles.csv'

header = ['title','url', 'image', 'caption', 'content', 'datetime']
links = pd.read_csv('pages.csv') # PROCESS ORIGINAL PAGES
#links = pd.read_csv('./strana_errors.csv') # PROCESS ERROR PAGES

ERRORS_FILE = './strana_errors.csv' # IF ORIGINAL FILE PROCESSED
#ERRORS_FILE = './strana_errors2.csv' # IF ERROR FILE PROCESSED

row = None  # stays None when there is nothing to process
# Both files are managed by the with-statement, so they are closed even when
# the loop is interrupted. newline='' is what the csv module expects.
with open(FILE, 'a', encoding='utf-8', newline='') as csvfile, \
        open(ERRORS_FILE, 'a', encoding='utf-8', newline='') as error_links:
    writer = csv.DictWriter(csvfile, fieldnames=header)
    if csvfile.tell() == 0:
        writer.writeheader()
    error_writer = csv.writer(error_links, lineterminator="\n")
    if error_links.tell() == 0:
        # The error file is read back with pd.read_csv and accessed by
        # 'url'/'title', so it needs a header row.
        error_writer.writerow(['url', 'title'])

    # head(ITERS) limits the run to exactly ITERS rows.
    for ind, row in links.head(ITERS).iterrows():
        try:
            art = get_article(row['url']) # ORIGINAL ARTICLES
            #art = get_shortened_article(row['url']) # ERROR ARTICLES PROCESSED
        except (requests.RequestException, IndexError, KeyError):
            # Download or parse failure raised by the getter: record the url
            # instead of aborting the whole run.
            art = None

        if art is None:
            error_writer.writerow([row['url'], row['title']])
        else:
            writer.writerow({field: art[field] for field in header})

        # Progress: the index every 1000 rows, "|" every 200, "." every 20.
        if ind != 0:
            if ind % 1000 == 0:
                print (str(ind), end='\n')
            elif ind % 200 == 0:
                print("|", end='')
            elif ind % 20 == 0:
                print(".", end='')
        time.sleep(0.5) # set half a second instead of 0.25

if row is not None:
    print (f"\nLast item is :{row}")

.....
Last item is :url      https://www.strana.news/news/458439-chto-prois...
title    730-й день войны в Украине. Что происходит 23 ...
Name: 108, dtype: object


In [6]:
# Close the error-log handle opened by the fetch cell, e.g. after that cell
# was interrupted before reaching its own error_links.close().
error_links.close()

In [None]:
# download missing content to pages.csv
# parse it into strana_articles_temp.csv
# inspect duplicated items and move them to articles.csv
# consider separating the dataset into multiple files
# consider processing the data

In [11]:
import pandas as pd


In [12]:
# Load the article dataset stored in ../data/strana_2018.csv.
df = pd.read_csv('../data/strana_2018.csv')

In [13]:
# Number of rows loaded.
len(df)

22735

In [None]:
# RECOGNIZE EMPTY & ERRATIC SHIT
counter = 0
for item in range(len(df)):
    if len(df.iloc[item]['datetime'].split()) < 2 :
            counter += 1
            # print (df.iloc[item]['title'])

In [None]:
# Show how many rows have an empty or malformed 'datetime'.
print (counter)

In [None]:
# Spot-check: title of the row at position 9.
df.iloc[9]['title']

In [None]:
# Keep only the first row for each distinct title; df itself is left unchanged.
df2 = df.drop_duplicates(subset=['title'])

In [None]:
# Number of rows left after dropping duplicated titles.
len(df2)