# Telegram channel message data analysis

In [4]:
import json
import pandas as pd
import re
import requests

from bs4 import BeautifulSoup
from glob import glob

## Load message data from json files

Let's loop through all available files and concatenate
their values into a single dataframe.

In [5]:
json_files = glob("*-messages.json")

In [6]:
def grab_wanted_object_properties(parsed_json):
    wanted_props = {
        "message_id": parsed_json['id'],
        "date": parsed_json['date'],
    }
    wanted_props["message"] = parsed_json['message'] if "message" in parsed_json else None
    return wanted_props

In [7]:
formatted_list = []
for file in json_files:
    with open(file, 'r') as json_file:
        loaded_file = json.load(json_file)
    for obj in loaded_file:
        formatted_list.append(grab_wanted_object_properties(obj))

In [8]:
df = pd.DataFrame(formatted_list)

In [9]:
df.head()

Unnamed: 0,message_id,date,message
0,66147,2023-02-22 12:28:36+00:00,guest_42: Kraken
1,66146,2023-02-22 12:26:42+00:00,guest_42: OCRopus?
2,66145,2023-02-22 12:13:17+00:00,Não entendi o que é que tu quer que a bibliote...
3,66144,2023-02-22 11:46:27+00:00,Bom dia. Existe alguma biblioteca em python pr...
4,66143,2023-02-20 21:01:41+00:00,fogo na babilonha


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   message_id  1800 non-null   int64 
 1   date        1800 non-null   object
 2   message     1784 non-null   object
dtypes: int64(1), object(2)
memory usage: 42.3+ KB


In [11]:
df.dtypes

message_id     int64
date          object
message       object
dtype: object

## Finding messages with youtube URLs

This can be done by using a regular expression.

In [12]:
def find_youtube_url(message):
    if message is None:
        return None
    rgx = re.search(
        "(?:https?:\/\/)?(?:m\.)?(?:www\.)?youtu\.?be(?:\.com)?\/?\S*(?:watch|embed)?(?:\S*v=|v\/|\/)([\w\-]+)(?:[\&\?]?([\w\-]+)?=?([\w\%\-]+)?)+",
        message
    )
    return rgx.group(0) if rgx is not None else None

In [13]:
test_1 = find_youtube_url("Essa música aqui que é boa: https://www.youtube.com/watch?v=SOJSM46nWwo")
print(f"Encontrado: {test_1}")
test_2 = find_youtube_url("Essa mensagem não tem nenhuma url do youtube!")
print(f"Encontrado: {test_2}")

Encontrado: https://www.youtube.com/watch?v=SOJSM46nWwo
Encontrado: None


### Creating a new DataFrame column

This new column will contain only the youtube url parsed from the message.

In [14]:
df['youtube_url'] = df['message'].apply(find_youtube_url)

In [15]:
df_ytb_urls = df.dropna()
df_ytb_urls.head()

Unnamed: 0,message_id,date,message,youtube_url
6,66141,2023-02-20 16:04:20+00:00,http://www.youtube.com/watch?v=TEtZTNvco0U,http://www.youtube.com/watch?v=TEtZTNvco0U
7,66140,2023-02-20 16:01:45+00:00,https://www.youtube.com/watch?v=I1Aisbeuo2o,https://www.youtube.com/watch?v=I1Aisbeuo2o
8,66139,2023-02-20 15:56:30+00:00,https://www.youtube.com/watch?v=bX1w9BZ5svw,https://www.youtube.com/watch?v=bX1w9BZ5svw
11,66136,2023-02-20 15:06:54+00:00,https://www.youtube.com/watch?v=-UJwSUxf0YE,https://www.youtube.com/watch?v=-UJwSUxf0YE
14,66133,2023-02-19 19:40:53+00:00,https://youtu.be/NkTqBmjbnpU,https://youtu.be/NkTqBmjbnpU


In [16]:
len(df_ytb_urls)

242

# Getting youtube video information from the oembed API

Youtube has an open endpoint that can be used to validate URLs and get general information about a video.

In [17]:
video_url = "https://www.youtube.com/watch?v=I1Aisbeuo2o"
oembed_url = f"https://www.youtube.com/oembed?url={video_url}&format=json"
response = requests.get(oembed_url)
print(response.status_code)
print(response.json())

200
{'title': 'Raiz Ancestral', 'author_name': 'Gaia Piá - Topic', 'author_url': 'https://www.youtube.com/channel/UCf8omZ55YHyrfijCxzX5mow', 'type': 'video', 'height': 150, 'width': 200, 'version': '1.0', 'provider_name': 'YouTube', 'provider_url': 'https://www.youtube.com/', 'thumbnail_height': 360, 'thumbnail_width': 480, 'thumbnail_url': 'https://i.ytimg.com/vi/I1Aisbeuo2o/hqdefault.jpg', 'html': '<iframe width="200" height="150" src="https://www.youtube.com/embed/I1Aisbeuo2o?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen title="Raiz Ancestral"></iframe>'}


## Videos have a "author_name" field

In the following format: `'author_name': 'Gaia Piá - Topic'`, the "Topic" might represent that it is from a band.

In [18]:
video_url = "https://www.youtube.com/watch?v=MlzTET_8SQg"
oembed_url = f"https://www.youtube.com/oembed?url={video_url}&format=json"
response = requests.get(oembed_url)
print(f"Response status: {response.status_code}")
ytb_json = response.json()
print(f"Youtube oembed json: {ytb_json}")

Response status: 200
Youtube oembed json: {'title': 'Motörhead – Overkill (Official Video)', 'author_name': 'Motörhead Official', 'author_url': 'https://www.youtube.com/@motorhead', 'type': 'video', 'height': 150, 'width': 200, 'version': '1.0', 'provider_name': 'YouTube', 'provider_url': 'https://www.youtube.com/', 'thumbnail_height': 360, 'thumbnail_width': 480, 'thumbnail_url': 'https://i.ytimg.com/vi/MlzTET_8SQg/hqdefault.jpg', 'html': '<iframe width="200" height="150" src="https://www.youtube.com/embed/MlzTET_8SQg?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen title="Motörhead – Overkill (Official Video)"></iframe>'}


## Getting band/author information

We have a few options on how to use a given author name to grab genre information.

- Spotify API: <https://developer.spotify.com/documentation/web-api/>
- EveryNoise: <https://everynoise.com/>

The everynoise website doesn't have an actual API endpoint, but we can leverage it's lookup url and parse the HTML.

In [19]:
ytb_author = ytb_json['author_name']
ytb_title = ytb_json['title']

print(f"Received author from youtube oembed: {ytb_author}")
print(f"Received title from youtube oembed: {ytb_title}")

Received author from youtube oembed: Motörhead Official
Received title from youtube oembed: Motörhead – Overkill (Official Video)


## Using the everynoise search URL

Let's start by sending the `ytb_author` string directly and trying to parse the output

In [20]:
everynoise_url = "https://everynoise.com/lookup.cgi"

In [21]:
everynoise_request = requests.get(everynoise_url, params={"who": ytb_author})

In [24]:
everynoise_request.text

'<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html;charset=utf-8">\n<style type="text/css">\nbody {font: 14px "Gill Sans", "Gill Sans MT", "Trebuchet MS", sans-serif; margin: 0px; color: gray}\nform {display: inline}\na {color: teal; text-decoration: none}\na:hover {color: red; text-decoration: underline}\n</style>\n<body>\n\n<form action="lookup.cgi" method="GET">find artist <input type=text size=52 name=who value="MotÃ¶rhead Official"><input type=hidden name=mode value="map"></form>\n<div>\nSorry, that one doesn\'t seem to be on any of these maps yet.\n</div></body></html>\n'

Since this requests returns valid html, we can parse it with beautiful soup.

In [26]:
soup = BeautifulSoup(everynoise_request.text, 'html.parser')
soup.prettify()


'<html>\n <head>\n  <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>\n  <style type="text/css">\n   body {font: 14px "Gill Sans", "Gill Sans MT", "Trebuchet MS", sans-serif; margin: 0px; color: gray}\nform {display: inline}\na {color: teal; text-decoration: none}\na:hover {color: red; text-decoration: underline}\n  </style>\n  <body>\n   <form action="lookup.cgi" method="GET">\n    find artist\n    <input name="who" size="52" type="text" value="MotÃ¶rhead Official"/>\n    <input name="mode" type="hidden" value="map"/>\n   </form>\n   <div>\n    Sorry, that one doesn\'t seem to be on any of these maps yet.\n   </div>\n  </body>\n </head>\n</html>\n'

In [28]:
soup.find_all('div')

[<div>
 Sorry, that one doesn't seem to be on any of these maps yet.
 </div>]

In [27]:
soup.find_all('a')

[]

We can try splitting the youtube author by words, and sending them individually

In [30]:
results = []
for word in ytb_author.split(" "):
    everynoise_request = requests.get(everynoise_url, params={"who": word})
    soup = BeautifulSoup(everynoise_request.text, 'html.parser')
    results.append(soup.find_all('a'))
results

[[<a href="engenremap-metal.html" target="_parent">metal</a>,
  <a href="engenremap-hardrock.html" target="_parent">hard rock</a>,
  <a href="engenremap-speedmetal.html" target="_parent">speed metal</a>,
  <a href="engenremap-rock.html" target="_parent">rock</a>,
  <a href="engenremap-albumrock.html" target="_parent">album rock</a>,
  <a href="everynoise1d.cgi?scope=all&amp;root=metal&amp;root=hard%20rock&amp;root=speed%20metal&amp;root=rock&amp;root=album%20rock" target="_parent" title="go to the list view for these genres">â·</a>,
  <a href="artistprofile.cgi?id=1DFr97A9HnbV3SKTJFu62M" target="_parent" title="go to the profile for this artist">â</a>],
 []]

In [31]:
for result in results:
    for link in result:
        print(link.text)

metal
hard rock
speed metal
rock
album rock
â·
â
