# Read before running

If you haven't uploaded web pages to Google Drive, set upload_web_page_to_drive to True. If you haven't uploaded json files to Google Drive, set upload_json_to_drive to True.

In [None]:
# Stage switches; both upload stages are skipped while these stay False.
# True -> fetch every URL in `urls` and store the page text on Google Drive.
upload_web_page_to_drive = False
# True -> parse the stored pages into Episode JSON and store it on Google Drive.
upload_json_to_drive = False

Specify the URLs of the web pages, the IDs and names of the Google Drive folders, and the prefix of the HTML/text file titles. The folders must be created in advance inside the 'My Drive' root directory.



In [None]:
# Every episode page lives at base_url followed by its episode number.
base_url = 'https://echofm.online/programs/status/status-s-ekaterinoj-shulman-'
# Inclusive range of episode numbers to fetch (edit these instead of the range call).
first_episode = 1
last_episode = 72
urls = [f'{base_url}{episode_number}'
        for episode_number in range(first_episode, last_episode + 1)]

# Drive folder that receives the raw pages: its id (used for uploads) and its
# name (used to build the mounted path when the pages are read back).
pages_folder_id = '179qr6SD9Y2A8jrKsiH9RKnAplW0G4ST9'
pages_folder_name = 'Episodes'
# Title prefix of every stored page; the episode number is appended to it.
page_file_title = 'episode_'

# Drive folder that receives the parsed episodes as JSON.
json_folder_id = '17s7434iNxvPDyn4wTOLDzyDfX-4XMAEV'
json_folder_name = 'Episodes json'

Before running the 'Handling uploaded web pages' step, mount Google Drive from the left sidebar (Files -> Mount Drive).

# Installing Python modules and libraries

In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install dataclasses_json

Collecting dataclasses_json
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses_json)
  Downloading marshmallow-3.20.2-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses_json)
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses_json)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, marshmallow, typing-inspect, dataclasses_json
Successfully installed dataclasses_json-0.6.3 marshmallow-3.20.2 mypy-extensions-1.0.0 typing-inspect-0.9.0


# Authentication in Google Drive

In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Sign in through Colab first, then hand the application-default credentials
# to PyDrive2. `drive` is the client that upload_to_folder uses for every upload.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Defining the function for uploading files to Google Drive

In [None]:
def upload_to_folder(folder_id, file_title, mimeType, text, number='') -> None:
  """Create a file inside a Google Drive folder and upload `text` as its content.

  Uses the module-level `drive` client.

  Args:
    folder_id: Id of the Drive folder that becomes the file's parent.
    file_title: First part of the file title.
    mimeType: MIME type stored in the file metadata (formatted into a string).
    text: String content of the file.
    number: Optional suffix appended directly to `file_title`.
  """
  drive_file = drive.CreateFile(metadata={
      'parents': [{"id": folder_id}],
      'title': f'{file_title}{number}',
      'mimeType': f'{mimeType}',
  })
  drive_file.SetContentString(text)
  drive_file.Upload()

# Uploading web pages to Google Drive

In [None]:
import requests
import time
from tqdm import tqdm

In [None]:
if upload_web_page_to_drive:
  for url in tqdm(urls):
    # The episode number is whatever follows the shared URL prefix.
    number = url.removeprefix(base_url)
    try:
      # A timeout keeps one stalled connection from hanging the whole run.
      r = requests.get(url, timeout=30)
      # Do not store 4xx/5xx error pages as if they were episodes.
      r.raise_for_status()
      text = r.text
      upload_to_folder(pages_folder_id, page_file_title, 'text/plain', text, number)
    except Exception as e:
      # Still best-effort (one bad page must not stop the rest), but the
      # failure is reported. The previous `finally: continue` discarded every
      # exception silently, KeyboardInterrupt included.
      print(f'Episode {number}: could not fetch/upload {url}: {e}')
    finally:
      # Pause between requests whether or not this one succeeded.
      time.sleep(1)

100%|██████████| 72/72 [04:42<00:00,  3.93s/it]


# Handling uploaded web pages

In [None]:
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup

In [None]:
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json

@dataclass_json
@dataclass
class Data:
  """One utterance: the text of a transcript paragraph and who said it."""
  # Index of the speaker's label in the owning Episode's `speakers` list.
  speaker: int
  # Paragraph text; create_episode passes it through remove_speaker_name first.
  text: str

@dataclass_json
@dataclass
class Episode:
  """Parsed transcript of one episode: metadata, speakers and their utterances."""
  # Text of the page's <h1> heading.
  title: str
  # Date exactly as it is written on the page (kept as a string).
  date: str
  # Episode number; create_episode fills it with the file name minus its prefix.
  number: int|str
  # Speaker labels in order of first appearance; Data.speaker indexes this list.
  speakers: list[str] = field(default_factory=list)
  # Utterances in page order.
  data: list[Data] = field(default_factory=list)

In [None]:
from sys import base_prefix
from dataclasses_json import dataclass_json
import re

# Speaker label as it opens an utterance: initial, dot, upper-case surname,
# colon and one space, e.g. "М.КУРНИКОВ: ".
_SPEAKER_PREFIX = re.compile(r'[А-Я][.][А-Я]+[:][ ]')

def remove_speaker_name(text: str) -> str:
  """Strip a leading speaker label from an utterance.

  Only a label at the very start of `text` is removed. A label that appears
  later in the text is part of what was said and is left untouched; the
  previous split-and-filter approach could return the label itself when it
  ended the text.

  Args:
    text: Paragraph text, possibly starting with a label such as "М.КУРНИКОВ: ".

  Returns:
    The text after the leading label, or `text` unchanged when it does not
    start with one. A paragraph consisting of the label alone yields ''.
  """
  prefix = _SPEAKER_PREFIX.match(text)
  if prefix is None:
    return text
  return text[prefix.end():]

def create_episode(file: str) -> Episode:
  """Parse one stored episode page into an Episode.

  The page is read from /content/drive/MyDrive/<pages_folder_name>/<file>,
  the same mount path the other Drive cells use. A paragraph whose <strong>
  (or, failing that, <b>) text starts with a speaker label such as
  "М.КУРНИКОВ" switches the current speaker. Every paragraph from the first
  recognised label onwards becomes one utterance of the current speaker.
  Bold text that is not a speaker label no longer aborts the episode: the
  paragraph simply stays with the current speaker.

  Args:
    file: Name of the page file inside the pages folder. The part after
      `page_file_title` is stored as the episode number.

  Returns:
    Episode holding the title, date, number, speaker labels (in order of first
    appearance) and utterances.

  Raises:
    AttributeError: If an expected element (<body>, <article>, <h1> or the date
      containers) is missing; the message names the element and the file.
    OSError: If the page file cannot be opened.
  """
  def require(node, what):
    # Same exception type the unguarded attribute access used to produce, so
    # existing `except AttributeError` handlers keep working, but the message
    # now says which element of which file is missing.
    if node is None:
      raise AttributeError(f'{what} not found in {file}')
    return node

  number = file.removeprefix(page_file_title)
  with open(f'/content/drive/MyDrive/{pages_folder_name}/{file}', 'r',
            encoding='utf-8') as f:
    text = f.read()

  parsed_html = BeautifulSoup(text)
  body = require(parsed_html.body, '<body>')
  article = require(body.find('article', attrs={'class':'wp-embed-responsive'}),
                    "<article class='wp-embed-responsive'>")

  title = require(article.find('h1'), '<h1> title').text

  # NOTE(review): these look like build-generated class names and may differ
  # between pages - confirm against the pages that fail here.
  div1 = require(article.find('div', {'class':'sc-1f63cf03-6 cJndFP'}),
                 "date container <div class='sc-1f63cf03-6 cJndFP'>")
  div2 = require(div1.find('div', {'class':'sc-1f63cf03-7 gBakUU'}),
                 "date container <div class='sc-1f63cf03-7 gBakUU'>")
  date = require(div2.find('span'), 'date <span>').text

  episode = Episode(title, date, number)

  speaker_id = -1  # no speaker seen yet
  prog = re.compile('([А-Я][.][А-Я]+)')

  for paragraph in article.find_all('p'):
    # if speaker is defined in this paragraph set it as a current speaker
    label = paragraph.find('strong')
    if label is None:
      label = paragraph.find('b')
    if label is not None:
      match = prog.match(label.text)
      # Look at the match only after checking it: bold text that is not a
      # speaker label leaves the current speaker unchanged.
      if match:
        speaker = match.group(0)
        if speaker not in episode.speakers:
          episode.speakers.append(speaker)
        speaker_id = episode.speakers.index(speaker)

    # if there is no current speaker go to the next iteration
    if speaker_id < 0:
      continue

    # add the speaker's utterance to the episode's content
    utterance = Data(speaker_id, remove_speaker_name(paragraph.text))
    episode.data.append(utterance)

  return episode

# Uploading json files to Google Drive

In [None]:
if upload_json_to_drive:
  import os
  from tqdm import tqdm
  import json
  path = f'/content/drive/MyDrive/{pages_folder_name}'
  files = os.listdir(path)
  skipped_files = []
  for file in tqdm(files):
    try:
      episode = create_episode(file)
      # ensure_ascii=False keeps the Cyrillic text readable in the stored JSON.
      json_episode = episode.to_json(ensure_ascii=False)
      upload_to_folder(json_folder_id,
                       file,
                       'text/plain',
                       json_episode)
    except AttributeError as e:
      # A page that does not have the expected structure is skipped; name the
      # file so the failure can be traced back to a concrete episode.
      skipped_files.append(file)
      print(f'Skipped {file}: {e}')
  if skipped_files:
    print(f'{len(skipped_files)} of {len(files)} pages could not be parsed: {skipped_files}')

  0%|          | 0/72 [00:00<?, ?it/s]

'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'find'


  4%|▍         | 3/72 [00:00<00:05, 13.56it/s]

'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'group'
'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'group'


 12%|█▎        | 9/72 [00:01<00:14,  4.49it/s]

'NoneType' object has no attribute 'find'


 15%|█▌        | 11/72 [00:03<00:21,  2.85it/s]

'NoneType' object has no attribute 'find'


 18%|█▊        | 13/72 [00:04<00:30,  1.91it/s]

'NoneType' object has no attribute 'group'


 22%|██▏       | 16/72 [00:07<00:38,  1.46it/s]

'NoneType' object has no attribute 'group'


 25%|██▌       | 18/72 [00:09<00:35,  1.54it/s]

'NoneType' object has no attribute 'find'


 28%|██▊       | 20/72 [00:10<00:32,  1.58it/s]

'NoneType' object has no attribute 'find'


 31%|███       | 22/72 [00:12<00:33,  1.50it/s]

'NoneType' object has no attribute 'find'


 33%|███▎      | 24/72 [00:13<00:30,  1.59it/s]

'NoneType' object has no attribute 'find'


 36%|███▌      | 26/72 [00:15<00:30,  1.53it/s]

'NoneType' object has no attribute 'find'


 39%|███▉      | 28/72 [00:16<00:27,  1.62it/s]

'NoneType' object has no attribute 'find'


 42%|████▏     | 30/72 [00:17<00:24,  1.69it/s]

'NoneType' object has no attribute 'find'


 44%|████▍     | 32/72 [00:19<00:26,  1.50it/s]

'NoneType' object has no attribute 'find'


 49%|████▊     | 35/72 [00:21<00:20,  1.80it/s]

'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'find'


 53%|█████▎    | 38/72 [00:24<00:25,  1.32it/s]

'NoneType' object has no attribute 'find'


 57%|█████▋    | 41/72 [00:26<00:16,  1.93it/s]

'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'find'


 60%|█████▉    | 43/72 [00:27<00:16,  1.72it/s]

'NoneType' object has no attribute 'find'


 62%|██████▎   | 45/72 [00:29<00:16,  1.63it/s]

'NoneType' object has no attribute 'find'


 65%|██████▌   | 47/72 [00:30<00:16,  1.48it/s]

'NoneType' object has no attribute 'find'


 71%|███████   | 51/72 [00:35<00:18,  1.11it/s]

'NoneType' object has no attribute 'find'


 74%|███████▎  | 53/72 [00:37<00:15,  1.22it/s]

'NoneType' object has no attribute 'find'


 76%|███████▋  | 55/72 [00:38<00:12,  1.33it/s]

'NoneType' object has no attribute 'find'


 81%|████████  | 58/72 [00:41<00:10,  1.35it/s]

'NoneType' object has no attribute 'find'


 83%|████████▎ | 60/72 [00:42<00:08,  1.39it/s]

'NoneType' object has no attribute 'find'


 85%|████████▍ | 61/72 [00:43<00:06,  1.74it/s]

'NoneType' object has no attribute 'group'


 88%|████████▊ | 63/72 [00:44<00:05,  1.65it/s]

'NoneType' object has no attribute 'find'


 99%|█████████▊| 71/72 [00:53<00:00,  1.14it/s]

'NoneType' object has no attribute 'find'


100%|██████████| 72/72 [00:55<00:00,  1.31it/s]


# Creating bar charts

In [None]:
import os
from tqdm import tqdm
import json
import re
from dataclasses_json import dataclass_json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Per-episode word counts: total and per speaker, read from the stored JSON.
path = f'/content/drive/MyDrive/{json_folder_name}'
files = os.listdir(path)
all_episodes_stat = list()
for file in tqdm(files):
  try:
    with open(f'{path}/{file}', 'r', encoding='utf-8') as f:
      dict_with_data = json.load(f)
    episode = Episode.from_dict(dict_with_data)
    stat = dict()
    stat['episode_number'] = episode.number
    stat['episode_date'] = episode.date
    stat['speakers'] = dict()
    stat['all'] = 0
    for utterance in episode.data:
      speaker = episode.speakers[utterance.speaker]
      # Words are whitespace-separated tokens of the utterance text.
      word_count = len(utterance.text.split())
      stat['all'] += word_count
      stat['speakers'][speaker] = stat['speakers'].get(speaker, 0) + word_count
    all_episodes_stat.append(stat)
  except Exception as e:
    # Still best-effort (an unreadable file is skipped), but say which file
    # was dropped and why instead of hiding it behind a bare `except: pass`.
    print(f'Skipped {file}: {e!r}')
print(all_episodes_stat)

100%|██████████| 37/37 [00:08<00:00,  4.22it/s]

[{'episode_number': '8', 'episode_date': '25 октября 2022', 'speakers': {'М.КУРНИКОВ': 642, 'Е.ШУЛЬМАН': 6912}, 'all': 7554}, {'episode_number': '10', 'episode_date': '1 ноября 2022', 'speakers': {'М.КУРНИКОВ': 767, 'Е.ШУЛЬМАН': 7117}, 'all': 7884}, {'episode_number': '12', 'episode_date': '13 декабря 2022', 'speakers': {'М.КУРНИКОВ': 933, 'Е.ШУЛЬМАН': 7029}, 'all': 7962}, {'episode_number': '14', 'episode_date': '10 января 2023', 'speakers': {'М.КУРНИКОВ': 1029, 'Е.ШУЛЬМАН': 6592}, 'all': 7621}, {'episode_number': '15', 'episode_date': '17 января 2023', 'speakers': {'М.КУРНИКОВ': 663, 'Е.ШУЛЬМАН': 7360}, 'all': 8023}, {'episode_number': '17', 'episode_date': '14 февраля 2023', 'speakers': {'М.КУРНИКОВ': 1028, 'Е.ШУЛЬМАН': 6934}, 'all': 7962}, {'episode_number': '19', 'episode_date': '21 февраля 2023', 'speakers': {'М.КУРНИКОВ': 672, 'Е.ШУЛЬМАН': 7945}, 'all': 8617}, {'episode_number': '21', 'episode_date': '28 февраля 2023', 'speakers': {'М.КУРНИКОВ': 621, 'Е.ШУЛЬМАН': 7386}, 'all': 8




In [None]:
# Number of episodes whose statistics were collected by the loop above.
print(len(all_episodes_stat))

37


In [None]:
def legend_without_duplicate_labels(ax):
    """Draw a legend on `ax` with one entry per distinct label.

    For a label that occurs several times, the handle of its first occurrence
    is the one that is kept; entry order follows first appearance.
    """
    all_handles, all_labels = ax.get_legend_handles_labels()
    seen_labels = set()
    kept = []
    for handle, label in zip(all_handles, all_labels):
        if label in seen_labels:
            continue
        seen_labels.add(label)
        kept.append((handle, label))
    ax.legend(*zip(*kept), fontsize="8", loc ="upper left")

def create_bar_chart(all_stat):
  """Plot each speaker's share of the words per episode as grouped bars.

  Episodes without any speaker are left out. Every speaker keeps one colour
  across all episodes, so the de-duplicated legend is valid for every bar
  group. Previously colours followed the speaker's position inside each
  episode and could swap between episodes.

  Args:
    all_stat: List of dicts with the keys 'episode_date', 'speakers'
      (speaker label -> word count) and 'all' (total word count).
  """
  all_stat = [x for x in all_stat if len(x['speakers'].keys())>0]

  palette = ['purple', 'blue', 'lightblue', 'red', 'black']
  width = 0.2  # the width of the bars

  # Fix the speaker order (first appearance over all episodes) once; both the
  # colour and the position inside a bar group are derived from it.
  speaker_order = []
  for stat in all_stat:
    for speaker in stat['speakers']:
      if speaker not in speaker_order:
        speaker_order.append(speaker)
  # The palette is reused cyclically if there are more speakers than colours.
  speaker_color = {speaker: palette[i % len(palette)]
                   for i, speaker in enumerate(speaker_order)}

  X = [f"от {episode['episode_date']}" for episode in all_stat]
  X_axis_range = np.arange(len(X))
  fig, ax = plt.subplots(layout='constrained')
  for x_position, stat in zip(X_axis_range, all_stat):
    speakers = sorted(stat['speakers'], key=speaker_order.index)
    # A total of 0 means every count is 0 too; avoid dividing by zero.
    total = stat['all'] or 1
    bar_height = [round(stat['speakers'][speaker]/total*100, 1) for speaker in speakers]
    offsets = [(x_position + width*num) for num in range(len(speakers))]
    ax.bar(offsets, bar_height, width,
           color=[speaker_color[speaker] for speaker in speakers],
           label=speakers)

  ax.set_ylabel("Доля (%)")
  ax.set_xlabel("Эпизоды")
  ax.set_title("Процентное отношение высказываний спикеров программы 'Статус'")
  ax.set_xticks(X_axis_range+width/2, X)
  ax.tick_params(axis='both', which='major', labelsize=6)
  ax.tick_params(axis='both', which='minor', labelsize=4)
  # Light frame and horizontal grid drawn behind the bars.
  for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_color('#DDDDDD')
  ax.tick_params(bottom=False, left=False)
  ax.set_axisbelow(True)
  ax.yaxis.grid(True, color='#EEEEEE')
  ax.xaxis.grid(False)
  # Write the percentage above every bar, in the bar's own colour.
  for bar in ax.patches:
    bar_value = bar.get_height()
    text = f'{bar_value}%'
    text_x = bar.get_x() + bar.get_width() / 2
    text_y = bar.get_y() + bar_value
    bar_color = bar.get_facecolor()
    ax.text(text_x, text_y, text, ha='center', va='bottom', color=bar_color,
          size=6)
  legend_without_duplicate_labels(ax)
  plt.show()

In [None]:
# Draw the matplotlib bar chart from the per-episode statistics.
create_bar_chart(all_episodes_stat)

In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Episodes without any speaker carry no data for the chart.
all_stat = [x for x in all_episodes_stat if len(x['speakers'].keys())>0]
# dict.fromkeys de-duplicates while keeping first-appearance order. The
# previous set() made the speaker order differ from run to run.
speakers = list(dict.fromkeys(
    speaker for stat in all_stat for speaker in stat['speakers']))
# json_normalize flattens the nested counts into "speakers.<label>" columns;
# rename them to the bare label so they can be used directly as y columns.
df = pd.json_normalize(all_stat).rename(
    columns={f"speakers.{speaker}": speaker for speaker in speakers})

In [None]:
# One bar per episode date with one trace per speaker column of `df`;
# the layout asks for percent-normalised bars without gaps.
fig = px.bar(
    df,
    x="episode_date",
    y=speakers,
    title="Процентное отношение высказываний спикеров программы 'Статус'",
    labels={"value": "Доля (%)", "variable": "Спикер", "episode_date": "Эпизоды"},
    template="simple_white",
).update_layout(
    showlegend=True,
    barnorm="percent",
    autosize=True,
    bargap=0,
    font_family="arial",
    font_size=10,
)
fig.show()