In [18]:
import pathlib
import requests
import urllib.parse
import json
from openai import OpenAI

In [19]:
# Directory constants, all derived from the parent of the current working directory.
NBS_DIR = pathlib.Path(".").resolve().parent
BASE_DIR = NBS_DIR
DATASET_DIR = BASE_DIR.joinpath("dataset")

In [20]:
def url_encode(params):
    """Serialize ``params`` into a URL query string.

    Keys and values are escaped with ``urllib.parse.quote_plus``, so spaces
    are emitted as ``+``.
    """
    query_string = urllib.parse.urlencode(
        params,
        quote_via=urllib.parse.quote_plus,
    )
    return query_string

def get_url(search_term="Python programming", limit=10):
    """Build an ``itunes.apple.com/search`` URL that looks up podcast episodes.

    Args:
        search_term: Free-text term sent as the ``term`` query parameter.
        limit: Value sent as the ``limit`` query parameter. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        The complete search URL as a string, with every parameter URL-encoded.
    """
    params = {
        'lang': 'en_us',
        'media': 'podcast',
        'entity': 'podcastEpisode',
        'limit': limit,
        'term': search_term,
    }
    return f"https://itunes.apple.com/search?{url_encode(params)}"

In [26]:
# Build the search URL for the term "systemd" and echo it as the cell output.
url = get_url("systemd")
url

'https://itunes.apple.com/search?lang=en_us&media=podcast&entity=podcastEpisode&limit=10&term=systemd'

In [27]:
import requests

# Fetch the search results. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors before the
# body is parsed as JSON.
r = requests.get(url, headers={"Content-Type": "application/json"}, timeout=30)
r.raise_for_status()

data = r.json()

# Fall back to an empty list so a payload without 'results' does not break sorted().
results = data.get('results') or []

# Newest first. releaseDate values are compared as strings; records without one
# sort as the empty string instead of raising KeyError.
results = sorted(results, key=lambda x: x.get('releaseDate') or '', reverse=True)

for idx, result in enumerate(results):
    kind = result.get('kind')
    if kind != "podcast-episode":
        continue
    # .get() throughout: a record missing a field prints None instead of
    # aborting the whole listing.
    releaseDate = result.get('releaseDate')
    podcastName = result.get('collectionName')
    title = result.get('trackName')
    episodeUrl = result.get('episodeUrl')
    # idx is the position in the sorted result list, so numbers skip filtered entries.
    print(idx+1, title, podcastName, releaseDate, episodeUrl)
    print("\n")

1 228: Fedora Asahi, GNOME Tiling, KDE Plasma 6, systemd, Zorin, Inkscape & more Linux news! This Week in Linux 2023-08-06T18:32:16Z https://media.blubrry.com/tuxdigital_thisweekinlinux_mp3/aphid.fireside.fm/d/1437767933/2389be04-5c79-485e-b1ca-3a5b2cebb006/ad5270a5-e830-41c6-bc57-219a04b528af.mp3


2 Rorschach, QNAP, We Got Hacked, SystemD, UTF-8, & Grub2 Music - PSW #779 Paul's Security Weekly (Video-Only) 2023-04-06T15:37:03Z https://dts.podtrac.com/redirect.mp3/traffic.libsyn.com/secure/pswvideoonly/PSW_779_Seg_2_converted_sd.mp4?dest-id=388584


3 systemd su WSL e PNG che contengono il loro stesso MD5 Buongiorno da Edo 2022-09-27T08:25:50Z https://anchor.fm/s/b1bf48a0/podcast/play/58193690/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2022-8-27%2F287900717-44100-2-14b62f86d8c1d.mp3


4 ATA 293 Aventuras y desventuras con Systemd Atareao con Linux 2021-06-28T05:12:52Z https://anchor.fm/s/5a5b39c/podcast/play/36305521/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2

In [13]:
def get_openai_client():
    """Create an ``OpenAI`` client pointed at ``http://localhost:11434/v1``."""
    client_options = {
        "base_url": 'http://localhost:11434/v1',
        "api_key": 'ollama',  # required, but unused
    }
    return OpenAI(**client_options)

In [28]:
def guess_language(content="", client=None, raw=None, model="llama2"):
    """Ask a chat model to guess the human language of ``content``.

    Args:
        content: Text whose language should be identified.
        client: Object exposing ``chat.completions.create``. When ``None``, a
            client from ``get_openai_client()`` is used.
        raw: When truthy, return the unprocessed completion response.
        model: Model name passed to the completion call. Defaults to
            ``"llama2"``, the value that used to be hard-coded.

    Returns:
        The raw response object if ``raw`` is truthy. Otherwise a
        ``(payload, is_json)`` tuple: ``(parsed_json, True)`` when the reply
        parses as JSON, else ``(reply_text, False)``.
    """
    # Only build a default client when none was supplied, so any compatible
    # client passed by the caller is actually used.
    if client is None:
        client = get_openai_client()
    system_prompt = "You are an expert at deciphering the type of language of text."
    # Joined with a space so the two sentences are not glued together.
    prompt_start = " ".join([
        "Respond only with your best guess of what the language is of the input text. Use real human languages.",
        "Use the following:",
    ])
    prompt_end = "Using format of \"{'language': <generated-answer>}\" return a response with json"
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"{prompt_start} {content} {prompt_end}",
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        response_format={"type": "json_object"},
    )
    if raw:
        return response
    reply = response.choices[0].message.content
    try:
        return json.loads(reply), True
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers a None reply.
        return reply, False

In [29]:
# Try the language guesser on a single episode title and show the guess
# only when the reply parsed as JSON.
sample_title = "Binärgewitter Talk #320: Für die Liebe zu systemd Binärgewitter "
pred, is_json = guess_language(sample_title)
if is_json:
    print('language', pred.get('language'))

language German


In [31]:
url = get_url(search_term="systemd")
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors before the body is parsed as JSON.
r = requests.get(url, headers={"Content-Type": "application/json"}, timeout=30)
r.raise_for_status()

data = r.json()

# Fall back to an empty list so a payload without 'results' does not break sorted().
results = data.get('results') or []

# Newest first. Records without a releaseDate sort as the empty string
# instead of raising KeyError.
results = sorted(results, key=lambda x: x.get('releaseDate') or '', reverse=True)
# Lower-cased names of languages whose episodes are skipped.
ignore_langs = [x.lower() for x in ['German', 'Russian', 'Japanese', 'Chinese', "Spanish"]]

for idx, result in enumerate(results):
    kind = result.get('kind')
    if kind != "podcast-episode":
        continue
    releaseDate = result.get('releaseDate')
    podcastName = result.get('collectionName')
    # Empty-string default keeps the text handed to guess_language a str.
    title = result.get('trackName', '')
    pred_lang, is_json = guess_language(title)
    lang = None
    # Only a JSON object has .get(); any other parsed JSON value is treated as "unknown".
    if is_json and isinstance(pred_lang, dict):
        lang = pred_lang.get("language")
    # f-string conversion lets a None or non-string guess be compared safely.
    if f"{lang}".lower() in ignore_langs:
        continue
    episodeUrl = result.get('episodeUrl')
    # idx is the position in the sorted result list, so numbers skip filtered entries.
    print(idx+1, lang, title, podcastName, releaseDate, episodeUrl)

1 English 228: Fedora Asahi, GNOME Tiling, KDE Plasma 6, systemd, Zorin, Inkscape & more Linux news! This Week in Linux 2023-08-06T18:32:16Z https://media.blubrry.com/tuxdigital_thisweekinlinux_mp3/aphid.fireside.fm/d/1437767933/2389be04-5c79-485e-b1ca-3a5b2cebb006/ad5270a5-e830-41c6-bc57-219a04b528af.mp3
2 Rorschach Rorschach, QNAP, We Got Hacked, SystemD, UTF-8, & Grub2 Music - PSW #779 Paul's Security Weekly (Video-Only) 2023-04-06T15:37:03Z https://dts.podtrac.com/redirect.mp3/traffic.libsyn.com/secure/pswvideoonly/PSW_779_Seg_2_converted_sd.mp4?dest-id=388584
3 C++ systemd su WSL e PNG che contengono il loro stesso MD5 Buongiorno da Edo 2022-09-27T08:25:50Z https://anchor.fm/s/b1bf48a0/podcast/play/58193690/https%3A%2F%2Fd3ctxlq1ktw2nl.cloudfront.net%2Fstaging%2F2022-8-27%2F287900717-44100-2-14b62f86d8c1d.mp3
6 English Supply Chain Integrity, Format Strings, Systemd Bug, Instagram Bounty, & Refactoring - ASW #155 Security Weekly Podcast Network (Video) 2021-06-22T09:00:00Z https:/