## Dependencies

### Mac

#### Install Homebrew if you don't have it

#### Get ffmpeg

In [None]:
# ! brew install ffmpeg

#### Get yt-dlp
https://github.com/yt-dlp/yt-dlp/wiki/Installation

In [None]:
# ! curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o ~/.local/bin/yt-dlp
# ! chmod a+rx ~/.local/bin/yt-dlp  # Make executable

#### Whisper.cpp

https://github.com/ggerganov/whisper.cpp?tab=readme-ov-file#core-ml-support

In [None]:
# ! git clone https://github.com/ggerganov/whisper.cpp.git
# ! pip install ane_transformers
# ! pip install openai-whisper
# ! pip install coremltools

## Imports

Fill out the environment variables in `.env-example` and change the file name to `.env`:
```bash
mv .env-example .env
```

In [1]:
import os
import ast
import subprocess
import shutil
import tempfile
import re
import json
import requests

### 3rd party
# ! pip install -r requirements.txt -qqq
from bs4 import BeautifulSoup 
from slack_sdk import WebClient
import pandas as pd
from IPython.core.display import HTML
from dotenv import load_dotenv
load_dotenv()

True

## Data

In [2]:
# Absolute path of the `data` folder under the current working directory.
DATA_ROOT = os.path.abspath('data')

## Slack auth

In [3]:
# Read the user OAuth token from the environment (populated by load_dotenv()).
slack_token = os.environ.get("SLACK_USER_OAUTH_TOKEN")
if not slack_token:
    # Fail here with an actionable message instead of building a client with
    # token=None and letting every later Slack call fail.
    raise RuntimeError(
        "SLACK_USER_OAUTH_TOKEN is not set; fill it in .env (see .env-example) and re-run."
    )
client = WebClient(token=slack_token)

## Sample query

In [4]:
# Print each channel's name and id, one per line, followed by a separator line.
for channel in client.conversations_list()["channels"]:
    print(channel["name"], channel["id"], '---', sep='\n')

making-nft-collections-based-on-novel-binning
C02ERCRHNUX
---
coding
C02ERCUMZFZ
---
papers
C02ES9Q120P
---
general
C02EZBMJ2BY
---
market-chatter
C02F0KUPS5U
---
challenges
C02F67D8RK5
---
random
C02FCAL01T6
---
machine-learning
C02P824JWDV
---
data
C02PZ3YSML7
---
alignment
C04081BDY8K
---
alignment-applications
C04GNUDFXHT
---
ai-images
C04MG56HLDU
---
as-project-planning
C04TX6J0QAE
---
as-user-facing
C051JTPP51R
---
qpl
C0588QJUZS9
---
public-calls-for-alignment
C05EVHWKKC6
---
2024-research-foundation
C069EBAQCB0
---


## Extracting messages in a Slack channel

In [5]:
# Id of the channel to export ("public-calls-for-alignment" in the listing above).
CHANNEL_ID = "C05EVHWKKC6"
# Passed as `limit` to conversations_history.
PAGE_SIZE = 100
# CSV file the raw message table is written to and later read back from.
SLACK_MESSAGE_DF = 'public_calls_raw.csv'

In [6]:
# Fetch the channel's metadata and keep its `created` field.
# NOTE(review): "conversastion" is a misspelling of "conversation"; the names are
# left unchanged in case other cells reference them.
# NOTE(review): `created` is presumably the channel's creation timestamp — confirm
# against the Slack API reference.
conversastion_info_res = client.conversations_info(channel=CHANNEL_ID)
conversastion_info = conversastion_info_res.data
conversation_created_at = conversastion_info["channel"]["created"]

In [7]:
# Request the channel's message history, asking for up to PAGE_SIZE messages in this call.
history_res = client.conversations_history(channel=CHANNEL_ID, limit=PAGE_SIZE)

In [8]:
# Start from the page held in `history_res`, then follow Slack's pagination
# cursor until the history is exhausted. One conversations_history call is
# limited to PAGE_SIZE messages, so reading only the first page would silently
# drop the rest of a longer channel.
message_history = list(history_res.data['messages'])
next_cursor = (history_res.data.get('response_metadata') or {}).get('next_cursor')
while next_cursor:
    page = client.conversations_history(
        channel=CHANNEL_ID, limit=PAGE_SIZE, cursor=next_cursor
    )
    message_history.extend(page.data['messages'])
    # An absent or empty cursor means there are no further pages.
    next_cursor = (page.data.get('response_metadata') or {}).get('next_cursor')

In [9]:
# Column-oriented accumulator: one list per exported message field.
data = {
    'user': [],
    'type': [],
    'ts': [],
    'client_msg_id': [],
    'text': [],
    'team': [],
    'attachments': [],
    'blocks': []
}

for m in message_history:
    # Messages carrying a 'subtype' are skipped; only plain messages are exported.
    if 'subtype' in m:
        continue
    data['user'].append(m['user'])
    data['type'].append(m['type'])
    data['ts'].append(m['ts'])
    data['text'].append(m['text'])
    # These keys are not present on every message. dict.get() records None for
    # a missing key without a bare `except:` (which would also swallow
    # unrelated errors such as KeyboardInterrupt), and keeps all columns the
    # same length.
    data['client_msg_id'].append(m.get('client_msg_id'))
    data['team'].append(m.get('team'))
    data['attachments'].append(m.get('attachments'))
    data['blocks'].append(m.get('blocks'))

df = pd.DataFrame(data)

# Persist the raw export so later cells can reload it without calling Slack again.
df.to_csv(SLACK_MESSAGE_DF, index=False)

In [10]:
def parse_ast_cols(row):
    """Turn a cell holding a Python-literal string back into the object it describes.

    Args:
        row: A single cell value: a string such as "[{'a': 1}]", or a missing
            value (None/NaN).

    Returns:
        The result of `ast.literal_eval` on the string, or None for a missing value.
    """
    return None if pd.isna(row) else ast.literal_eval(row)

def find_url(row):
    """Return the link carried by a message's first attachment.

    Args:
        row: A parsed `attachments` cell: a list of attachment dicts, or a
            missing value (None/NaN) when the message has no attachments.

    Returns:
        The first attachment's 'original_url', falling back to its 'from_url';
        None when there is no attachment or neither key is present.
    """
    # Check the container type instead of calling pd.isna(): on a list pd.isna
    # returns an array, and `if` on that array raises ValueError as soon as a
    # message carries more than one attachment.
    if not isinstance(row, (list, tuple)) or len(row) == 0:
        return None
    first = row[0]  # NOTE: only the first attachment is considered
    if not isinstance(first, dict):
        return None
    return first.get('original_url') or first.get('from_url')

def find_platform(row):
    """Label a URL with the platform it points to.

    Args:
        row: A URL string, or a missing value (None/NaN).

    Returns:
        'youtube', 'apple podcast' or 'apple news' when the URL contains one of
        the known markers (checked in that order); otherwise None.
    """
    if pd.isna(row):
        return None
    markers_by_platform = (
        ('youtube', ('youtube', 'youtu.be')),
        ('apple podcast', ('podcasts.apple',)),
        ('apple news', ('apple.news',)),
    )
    for platform, markers in markers_by_platform:
        if any(marker in row for marker in markers):
            return platform
    return None

In [11]:
# Reload the raw export from disk. The list/dict columns come back as strings,
# so they are parsed back into Python objects before deriving `url` and `platform`.
df = pd.read_csv(SLACK_MESSAGE_DF)
df['attachments'] = df['attachments'].apply(parse_ast_cols)
df['blocks'] = df['blocks'].apply(parse_ast_cols)
df['url'] = df['attachments'].apply(find_url)
df['platform'] = df['url'].apply(find_platform)

In [12]:
# Peek at the first two rows to check the derived `url` and `platform` columns.
df.head(2)

Unnamed: 0,user,type,ts,client_msg_id,text,team,attachments,blocks,url,platform
0,U02F33WP5FX,message,1716043000.0,CC964B6A-76C4-42CF-B796-460D256963FC,<https://youtu.be/lrebGkYy7pE?si=0lbIkizxWO8Me...,T02FVNL7X6U,[{'from_url': 'https://youtu.be/lrebGkYy7pE?si...,"[{'type': 'rich_text', 'block_id': 'Hy5ZJ', 'e...",https://youtu.be/lrebGkYy7pE?si=0lbIkizxWO8MefXk,youtube
1,U02F33WP5FX,message,1714913000.0,FCDF3A69-0A12-4B14-B4E8-364BE71BF14C,<https://podcasts.apple.com/us/podcast/theorie...,T02FVNL7X6U,[{'image_url': 'https://is1-ssl.mzstatic.com/i...,"[{'type': 'rich_text', 'block_id': '8jcbA', 'e...",https://podcasts.apple.com/us/podcast/theories...,apple podcast


## Extracting audio from podcasts and YouTube

In [34]:
# Folder that receives the extracted .wav files.
AUDIO_OUTPUT = os.path.join(os.getcwd(), DATA_ROOT, 'audio')

def ytdlp_extract_audio(
    url,
    out_dir=AUDIO_OUTPUT,
    filename=None,
    ffmpeg_location='/opt/homebrew/bin/ffmpeg',
    verbosity=1
):
    """Download a URL's audio with yt-dlp and save it as a 16 kHz mono 16-bit WAV.

    Args:
        url: Media URL handed to yt-dlp.
        out_dir: Directory that receives the final .wav file; created if missing.
        filename: Name of the output file inside `out_dir`. When None, the name
            yt-dlp produced (`<upload_date>_<title>`) with a .wav extension is used.
        ffmpeg_location: Path to the ffmpeg binary. If nothing exists at this
            path, an `ffmpeg` found on PATH is used instead.
        verbosity: Progress messages are printed when >= 1.

    Returns:
        Full path of the .wav file written to `out_dir`.

    Raises:
        RuntimeError: If yt-dlp produced no file, or ffmpeg produced no .wav.
    """
    # The default is a hard-coded Homebrew install path; when it does not exist
    # on this machine, fall back to whatever ffmpeg is on PATH.
    if not os.path.exists(ffmpeg_location):
        ffmpeg_location = shutil.which('ffmpeg') or ffmpeg_location

    with tempfile.TemporaryDirectory() as tmpdir:
        if verbosity >= 1:
            print("Extracting audio from {}...".format(url), end=' ')
        subprocess.run(
            [
                "yt-dlp",
                url,
                "-o",
                f"{tmpdir}/%(upload_date)s_%(title)s.%(ext)s",
                "--extract-audio",
                "--audio-format", "mp3",
                "--ffmpeg-location", ffmpeg_location
            ],
            stdout = subprocess.DEVNULL,
            stderr = subprocess.DEVNULL
        )
        # yt-dlp's output is discarded above, so an empty temp dir is the only
        # sign of a failed download; report it instead of raising IndexError.
        downloaded = sorted(os.listdir(tmpdir))
        if not downloaded:
            raise RuntimeError(f"yt-dlp did not produce any audio for {url}")
        tmp_filename_mp3 = downloaded[0]
        # Swap only the extension; str.replace would also rewrite a '.mp3'
        # appearing inside the title.
        tmp_filename_wav = os.path.splitext(tmp_filename_mp3)[0] + '.wav'
        tmp_wav_path = os.path.join(tmpdir, tmp_filename_wav)
        if verbosity >= 1:
            print("Converting .mp3 to .wav 16-bit.")
        subprocess.run(
            [
                ffmpeg_location,
                '-i', os.path.join(tmpdir, tmp_filename_mp3),
                '-ar', '16000',
                '-ac', '1',
                '-c:a', 'pcm_s16le',
                tmp_wav_path
            ],
            stdout = subprocess.DEVNULL,
            stderr = subprocess.DEVNULL
        )
        if not os.path.exists(tmp_wav_path):
            raise RuntimeError(f"ffmpeg did not produce a .wav file for {url}")
        os.makedirs(out_dir, exist_ok=True)
        # Honour the caller-supplied file name (it was previously ignored).
        target_name = tmp_filename_wav if filename is None else filename
        audio_filename = os.path.join(out_dir, target_name)
        shutil.move(tmp_wav_path, audio_filename)
    return audio_filename

In [35]:
# Download audio for every message that links to YouTube or an Apple podcast;
# every other row gets audio_path=None. To try a single item first, run e.g.
#   ytdlp_extract_audio(df[df.platform == 'youtube'].sample(1).url.values[0])
rows = []
for idx, row in df.iterrows():
    has_audio = row.platform in ('youtube', 'apple podcast')
    row['audio_path'] = ytdlp_extract_audio(row.url) if has_audio else None
    rows.append(row)

Extracting audio from https://youtu.be/lrebGkYy7pE?si=0lbIkizxWO8MefXk... Converting .mp3 to .wav 16-bit.
Extracting audio from https://podcasts.apple.com/us/podcast/theories-of-everything-with-curt-jaimungal/id1521758802?i=1000654451317... Converting .mp3 to .wav 16-bit.
Extracting audio from https://podcasts.apple.com/us/podcast/making-sense-with-sam-harris/id733163012?i=1000652525999... Converting .mp3 to .wav 16-bit.
Extracting audio from https://youtube.com/shorts/fUgmUwnOwws?si=VcO0emCr0cAn1g_G... Converting .mp3 to .wav 16-bit.
Extracting audio from https://youtube.com/shorts/1cHKhvJe2UY?si=ohHztvJqU46HYAn8... Converting .mp3 to .wav 16-bit.
Extracting audio from https://youtu.be/xQJq9nVSa4E?si=eUfG19BIVn8NVOR3... Converting .mp3 to .wav 16-bit.
Extracting audio from https://youtu.be/TkCo1vHgIQM?si=2KfcJXMrlv3nlFQv... Converting .mp3 to .wav 16-bit.
Extracting audio from https://youtu.be/5xgbRxA_7DI?si=mZuRrjEcn3UYiY9e... Converting .mp3 to .wav 16-bit.
Extracting audio from htt

In [15]:
# Assemble the per-message rows (each carrying an `audio_path` entry) into one table.
audio_df = pd.DataFrame(rows)

## Transcribing audio

In [76]:
# Folder that receives the transcript .txt files.
TRANSCRIPTION_OUTPUT = os.path.join(os.getcwd(), DATA_ROOT, 'processed')
# Model file, relative to the whisper.cpp checkout; it must exist before running.
MODEL_BIN = 'models/ggml-base.en.bin'

def transcribe(
    wav_filepath_full,
    path_to_whisper_cpp = os.path.join(os.getcwd(), 'whisper.cpp'),
    out_dir = TRANSCRIPTION_OUTPUT
):
    """Transcribe a .wav file with whisper.cpp and move the transcript to `out_dir`.

    Args:
        wav_filepath_full: Full path of the .wav file to transcribe.
        path_to_whisper_cpp: Directory holding the whisper.cpp `main` binary
            and the model referenced by MODEL_BIN.
        out_dir: Directory that receives the .txt transcript; created if missing.

    Returns:
        Full path of the transcript inside `out_dir`
        (`<wav name without extension>.txt`).

    Raises:
        FileNotFoundError: If no transcript file was produced.
    """
    print(f'Transcribing {wav_filepath_full}...')
    subprocess.run(
        [
            os.path.join(path_to_whisper_cpp, 'main'),
            "-m", os.path.join(path_to_whisper_cpp, MODEL_BIN),
            "-f", wav_filepath_full,
            "--output-txt"
        ],
        stdout = subprocess.DEVNULL,
        stderr = subprocess.DEVNULL
    )
    # The transcript is expected next to the input as "<input>.txt". Appending
    # the suffix (rather than str.replace) leaves any other '.wav' in the path
    # untouched.
    tmp_txt_filepath_full = wav_filepath_full + '.txt'
    if not os.path.exists(tmp_txt_filepath_full):
        # The subprocess output is discarded, so say explicitly what went wrong.
        raise FileNotFoundError(
            f"No transcript was produced for {wav_filepath_full}; "
            f"check the whisper.cpp binary and model under {path_to_whisper_cpp}"
        )
    # basename/splitext handle any path separator and change only the extension.
    wav_file_name = os.path.basename(wav_filepath_full)
    txt_file_name = os.path.splitext(wav_file_name)[0] + '.txt'
    os.makedirs(out_dir, exist_ok=True)
    out_txt_filepath_full = os.path.join(out_dir, txt_file_name)
    shutil.move(tmp_txt_filepath_full, out_txt_filepath_full)
    return out_txt_filepath_full

In [64]:
# Pick one random row that has extracted audio and show the path of its .wav file.
audio_row = audio_df[audio_df.audio_path.notna()].sample(1)
wav_file = audio_row['audio_path'].iloc[0]
wav_file

'/Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240305_Why Jeff Bezos Prefers Group Inventions.wav'

In [84]:
# Rebuild the transcript columns from the .txt files already on disk, without
# running whisper.cpp again.
rows = []
for idx, row in audio_df.iterrows():
    # pd.notna() rejects NaN as well as None. A missing path can be NaN (for
    # example after a CSV round-trip), which `is not None` would let through
    # and then crash on the string handling below.
    if pd.notna(row.audio_path):
        # Same naming as transcribe(): "<wav name without extension>.txt"
        # inside TRANSCRIPTION_OUTPUT.
        wav_name = os.path.basename(row.audio_path)
        txt_path = os.path.join(
            TRANSCRIPTION_OUTPUT, os.path.splitext(wav_name)[0] + '.txt'
        )
        with open(txt_path, 'r', encoding='utf-8') as f:
            transcription = f.read().strip()
    else:
        txt_path = None
        transcription = None
    row['txt_path'] = txt_path
    row['transcription'] = transcription
    rows.append(row)

In [78]:
# To try a single file first:
# txt_file = transcribe(wav_file)
# with open(txt_file, 'r') as f:
#     transcription = f.read().strip()
# print(transcription)

# Transcribe every row that has audio. A failure is reported and recorded as
# None for that row, and the loop continues: stopping at the first error would
# silently drop the failing row and every row after it from `rows`.
rows = []
for idx, row in audio_df.iterrows():
    txt_path = None
    transcription = None
    # pd.notna() also rejects NaN, which `is not None` would let through.
    if pd.notna(row.audio_path):
        try:
            txt_path = transcribe(row.audio_path)
            with open(txt_path, 'r', encoding='utf-8') as f:
                transcription = f.read().strip()
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt can
            # still stop this long-running cell; show the reason for the failure.
            print(f"Error transcribing {row.audio_path}: {exc}")
            txt_path = None
            transcription = None
    row['txt_path'] = txt_path
    row['transcription'] = transcription
    rows.append(row)

Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240430_These 12 Lawmakers Agree： It's Time to Leave Congress ｜ NYT Opinion.wav...
Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240503_Consciousness, Free Will, The Subconscious, Quantum Mechanics ｜ George Musser.wav...
Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240415_#363 — Knowledge Work.wav...
Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240305_Why Jeff Bezos Prefers Group Inventions.wav...
Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240118_Jeff Bezos on Why Companies Fail to Make the Right Decisions.wav...
Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240211_Race & Reason： A Conversation with Coleman Hughes (Episode #353).wav...
Transcribing /Users/eddie/Dev/Alignment/public-calls-bot/data/audio/20240130_Sam Harris： Debating conspiracy theorists, wokeness, Trump, collapse of Intellectual 

In [85]:
# Collect the rows (now carrying `txt_path` and `transcription`) into one table and preview it.
transcription_df = pd.DataFrame(rows)
transcription_df.head(3)

Unnamed: 0,user,type,ts,client_msg_id,text,team,attachments,blocks,url,platform,audio_path,txt_path,transcription
0,U02F33WP5FX,message,1716043000.0,CC964B6A-76C4-42CF-B796-460D256963FC,<https://youtu.be/lrebGkYy7pE?si=0lbIkizxWO8Me...,T02FVNL7X6U,[{'from_url': 'https://youtu.be/lrebGkYy7pE?si...,"[{'type': 'rich_text', 'block_id': 'Hy5ZJ', 'e...",https://youtu.be/lrebGkYy7pE?si=0lbIkizxWO8MefXk,youtube,/Users/eddie/Dev/Alignment/public-calls-bot/da...,/Users/eddie/Dev/Alignment/public-calls-bot/da...,[MUSIC]\n >> And okay.\n >> I knew you couldn'...
1,U02F33WP5FX,message,1714913000.0,FCDF3A69-0A12-4B14-B4E8-364BE71BF14C,<https://podcasts.apple.com/us/podcast/theorie...,T02FVNL7X6U,[{'image_url': 'https://is1-ssl.mzstatic.com/i...,"[{'type': 'rich_text', 'block_id': '8jcbA', 'e...",https://podcasts.apple.com/us/podcast/theories...,apple podcast,/Users/eddie/Dev/Alignment/public-calls-bot/da...,/Users/eddie/Dev/Alignment/public-calls-bot/da...,This podcast is sponsored by Monarch Money.\n ...
2,U02F33WP5FX,message,1713236000.0,,One of the best “public calls for alignment” I...,T02FVNL7X6U,[{'from_url': 'https://podcasts.apple.com/us/p...,"[{'type': 'rich_text', 'block_id': '/pN', 'ele...",https://podcasts.apple.com/us/podcast/making-s...,apple podcast,/Users/eddie/Dev/Alignment/public-calls-bot/da...,/Users/eddie/Dev/Alignment/public-calls-bot/da...,[MUSIC]\n Welcome to the Making Sense Podcast....


In [87]:
# Checkpoint the table, including transcripts, to CSV (the index is not written).
transcription_df.to_csv('with_transcriptions.csv', index=False)

## Scrape news articles

Ideally, we should avoid linking to articles that are not on the public web. If we do, we can manually "scrape" them.

In [132]:
# Draw one Apple News message at random and show its link.
row = df.loc[df['platform'] == 'apple news'].sample(1)
row['url']

9    https://apple.news/ArrrFBhCqSfGrwDkGluirWg
Name: url, dtype: object

In [133]:
# Take the article id from the selected row's URL (its last path segment, with
# any query string dropped) so it always matches `row`, which is drawn at random.
uuid = row.url.values[0].split('?')[0].rstrip('/').split('/')[-1]

In [134]:
# Ask Apple's News API for the article. The timeout keeps the cell from hanging
# indefinitely if the server never answers.
res = requests.get(f'https://news-api.apple.com/articles/{uuid}', timeout=30)

In [135]:
res.reason # :( the API rejects this request, which is sent without any credentials

'Unauthorized'

[Here](https://www.washingtonpost.com/technology/2024/01/22/ai-deepfake-elections-politicians/) is the link to the original Washington Post article.

In [136]:
# Fetch the original article page and parse its HTML.
og_url = 'https://www.washingtonpost.com/technology/2024/01/22/ai-deepfake-elections-politicians/'
# The timeout keeps the cell from hanging if the server never answers.
res = requests.get(og_url, timeout=30)
# Stop on an HTTP error so an error page is not parsed as if it were the article.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')

In [137]:
# The page's <title> text serves as the article title.
title = soup.title.get_text()
title

'Politicians around the world are blaming AI to swat away allegations - The Washington Post'

In [138]:
# Selector specific to this Washington Post article's HTML: author links are
# <a> tags carrying exactly these attributes.
# NOTE(review): the `class` value looks auto-generated and may change when the
# site's markup changes — verify before reusing on other articles.
attrs = {
    'class': 'wpds-c-cNdzuP wpds-c-cNdzuP-ejzZdU-isLink-true',
    'data-qa': 'author-name',
    'rel': 'author'
}
a_tags = soup.find_all('a', attrs=attrs)
# Keep only the visible text of each matching link.
authors = [t.text for t in a_tags]
authors

['Pranshu Verma', 'Gerrit De Vynck']

In [139]:
# The article body is embedded in the page as JSON objects of type "text".
# Find each such object, decode it, and join the `content` fields into one
# HTML string separated by blank lines.
pattern = r'\{"_id":"[A-Z0-9]+","additional_properties":\{\},"content":".*?","type":"text","originalIdx":\d+\}'
json_blocks = re.compile(pattern, re.DOTALL).findall(str(soup))
paragraphs = []
for block in json_blocks:
    paragraphs.append(json.loads(block)['content'])
content = "<br><br>".join(paragraphs)
display(HTML(content))

In [144]:
# Save the scraped article text under data/processed and attach it to `row`.
processed_dir = os.path.join(DATA_ROOT, 'processed')
# Create the folder if an earlier step has not done so already.
os.makedirs(processed_dir, exist_ok=True)
# A page title may contain path separators, which would point the file name
# into a (possibly non-existent) sub-directory; replace them.
safe_title = re.sub(r'[\\/]', '-', title)
txt_path = os.path.join(processed_dir, f'{safe_title}.txt')
transcription = content
row['audio_path'] = None
row['txt_path'] = txt_path
row['transcription'] = transcription
# Write as UTF-8 explicitly so non-ASCII characters do not depend on the platform default.
with open(txt_path, 'w', encoding='utf-8') as f:
    f.write(transcription)

In [142]:
# `row.index` holds index labels, so select with label-based .loc. Positional
# .iloc only hits the same row while labels happen to equal positions.
transcription_df.loc[row.index, :] = row.values

In [143]:
# Show the updated row; `row.index` holds labels, hence .loc rather than .iloc.
transcription_df.loc[row.index]

Unnamed: 0,user,type,ts,client_msg_id,text,team,attachments,blocks,url,platform,audio_path,txt_path,transcription
9,U02F33WP5FX,message,1705973000.0,,\n<https://apple.news/ArrrFBhCqSfGrwDkGluirWg>,T02FVNL7X6U,[{'from_url': 'https://apple.news/ArrrFBhCqSfG...,"[{'type': 'rich_text', 'block_id': 'MFhK', 'el...",https://apple.news/ArrrFBhCqSfGrwDkGluirWg,apple news,,/Users/eddie/Dev/Alignment/public-calls-bot/da...,Experts in artificial intelligence have long w...


In [145]:
# Path of the final CSV export (despite the name, OUT_DF is a file name, not a DataFrame).
OUT_DF = 'public_alignment_text.csv'
# Write the full table, including the scraped article text, without the index.
transcription_df.to_csv(OUT_DF, index=False)