In [1]:
import pandas as pd
import numpy as np
import json
import requests
import xmltodict
import time
import pickle
import os
import sqlalchemy
import urllib.request
from mutagen.mp3 import MP3

In [2]:
# Load the project settings (the database credentials are read from the 'db' section below).
# The context manager closes the file handle as soon as parsing is done.
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

In [3]:
# Pull the individual connection settings out of the 'db' section of the config.
db_settings = config['db']
user, password = db_settings['username'], db_settings['password']
host, port = db_settings['host'], db_settings['port']
dbname = db_settings['dbname']

In [4]:
# SQLAlchemy URL for MySQL via the PyMySQL driver; placeholders are filled in by name.
# NOTE(review): the values are inserted verbatim, so a username/password containing
# characters such as '@' or '/' would need to be URL-escaped in the config -- confirm.
template = "mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}"
url_fields = {
    'user': user,
    'password': password,
    'host': host,
    'port': port,
    'dbname': dbname,
}
connection_string = template.format(**url_fields)

In [5]:
# Despite its name, `conn` is a SQLAlchemy Engine (the return value of create_engine),
# configured with a pool size of one.
engine_options = {'pool_size': 1}
conn = sqlalchemy.create_engine(connection_string, **engine_options)

In [6]:
# Local file used as the on-disk copy of the downloaded RSS feed.
fname = 'feed.xml'

In [7]:
# Drop any previously downloaded copy of the feed so the next cell fetches a fresh one.
stale_feed_present = os.path.isfile(fname)
if stale_feed_present:
    os.unlink(fname)

In [8]:
# Location of the podcast RSS feed. (An earlier assignment to
# 'http://dataskeptic.com/feed.rss' was overwritten before use and has been dropped.)
url = 'http://dataskeptic.libsyn.com/rss'

# Download the feed only when there is no local copy yet.
if not os.path.isfile(fname):
    print('fetching')
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r = requests.get(url, timeout=60)
    # Fail here on an HTTP error instead of caching an error page as the feed.
    r.raise_for_status()
    with open(fname, 'wb') as f:
        f.write(r.text.encode('utf-8'))

# The cache is written as UTF-8 above, so read it back as UTF-8 rather than with the
# platform's default encoding (which is not UTF-8 everywhere).
with open(fname, encoding='utf-8') as fd:
    xml = xmltodict.parse(fd.read())

fetching


In [9]:
# Build `emap`: episode GUID -> enclosure (audio file) URL, from the parsed feed.
episodes = xml['rss']['channel']['item']
# xmltodict returns a single mapping instead of a list when the channel has exactly one <item>.
if isinstance(episodes, dict):
    episodes = [episodes]

emap = {}
for episode in episodes:
    guid_node = episode['guid']
    # A <guid> carrying attributes is parsed as a mapping with the value under '#text';
    # without attributes it is parsed as a plain string.
    guid = guid_node['#text'] if isinstance(guid_node, dict) else guid_node
    # Assign directly instead of going through a `url` variable, so the feed URL
    # stored in the global `url` is not overwritten by an enclosure URL.
    emap[guid] = episode['enclosure']['@url']

In [10]:
# Number of <item> entries parsed from the feed.
len(episodes)

220

In [13]:
# Find up to 10 blog rows whose prettyname starts with '/episodes/' and that lack an
# 'mp3' and/or a 'homepage-image' row in related_content (detected through LEFT JOINs
# that come back NULL).
# NOTE(review): the doubled '%%' in the LIKE pattern is presumably an escape for the
# driver's %-style parameter handling -- confirm before changing it.
q = """
SELECT t1.blog_id, prettyname, guid
 , t2.content_id as mp3_content, t3.content_id as homepage_content
 , t1.title, t1.abstract
FROM blog t1
LEFT JOIN related_content t2
 on t1.blog_id = t2.blog_id
 and t2.type='mp3'
LEFT JOIN related_content t3
 on t1.blog_id = t3.blog_id
 and t3.type='homepage-image'
WHERE t1.prettyname like '/episodes/%%'
AND (t2.blog_id is NULL or t3.blog_id is null)
LIMIT 10
"""
# Run the query through the engine and keep the result as a DataFrame.
df = pd.read_sql(q, conn)

In [14]:
# (rows, columns) of the query result.
df.shape

(1, 7)

In [15]:
# Display the rows returned by the query (it is capped with LIMIT 10).
df

Unnamed: 0,blog_id,prettyname,guid,mp3_content,homepage_content,title,abstract
0,393,/episodes/2018/discovering-blind-spots-in-rein...,a8feaf4664e54744b24e9950b9da50a5,,,Discovering Blind Spots in Reinforcement Learning,An intelligent agent trained in a simulated en...


In [16]:
# Template for one INSERT into related_content. The {placeholders} are filled with
# str.format, so values are spliced into the SQL text as-is: string values must
# already be escaped by whoever formats the template.
tpl = """
INSERT INTO related_content (blog_id, dest, type, title, body, duration) VALUES 
({blog_id}, '{dest}', '{type}', '{title}', '{body}', {duration})
"""

In [17]:
# Backfill the missing related_content rows (mp3 and/or homepage image) for every
# episode in `df` that also appears in the RSS feed (`emap`).

# Cover image attached to every backfilled episode.
HOMEPAGE_IMAGE_URL = 'https://s3.amazonaws.com/dataskeptic.com/img/2018/ai-cover.png'
# Scratch file for the downloaded audio. It has its own name so the global `fname`
# (the feed cache path) is not overwritten by this cell.
MP3_TMP_PATH = 'temp.mp3'

# Bound parameters let the driver do the quoting, instead of formatting values into
# `tpl` with hand-escaped apostrophes (which breaks on backslashes, '%', or NULLs).
insert_stmt = sqlalchemy.text("""
INSERT INTO related_content (blog_id, dest, type, title, body, duration) VALUES
(:blog_id, :dest, :type, :title, :body, :duration)
""")

for _, row in df.iterrows():
    guid = row['guid']
    if guid not in emap:
        # No feed entry for this episode, hence no enclosure URL to link to.
        continue

    shared = {
        # Cast to a plain int: numpy integer types are not understood by every DB driver.
        'blog_id': int(row['blog_id']),
        # Missing text is sent as SQL NULL rather than crashing or storing 'nan'.
        'title': None if pd.isnull(row['title']) else row['title'],
        'body': None if pd.isnull(row['abstract']) else row['abstract'],
    }
    new_rows = []

    # The query returns episodes missing *either* piece, so add only what is absent;
    # inserting both unconditionally would duplicate the piece that already exists.
    if pd.isnull(row['mp3_content']):
        dest = emap[guid]
        try:
            # Download the audio to disk so mutagen can read its length.
            with urllib.request.urlopen(dest) as response, open(MP3_TMP_PATH, 'wb') as mp3_file:
                mp3_file.write(response.read())
            duration = int(MP3(MP3_TMP_PATH).info.length)  # whole seconds
        finally:
            # Never leave the (large) scratch file behind, even on failure.
            if os.path.isfile(MP3_TMP_PATH):
                os.remove(MP3_TMP_PATH)
        new_rows.append(dict(shared, dest=dest, type='mp3', duration=duration))

    if pd.isnull(row['homepage_content']):
        # duration = -1 is the value this notebook stores for non-audio content.
        new_rows.append(dict(shared, dest=HOMEPAGE_IMAGE_URL, type='homepage-image', duration=-1))

    if new_rows:
        # One transaction per episode: either all of its rows are written or none.
        with conn.begin() as connection:
            for params in new_rows:
                connection.execute(insert_stmt, params)

In [18]:
# Disabled on purpose: if uncommented, this deletes every related_content row for blog_id 382.
#conn.execute("delete from related_content where blog_id=382")

In [23]:
# Fetch the two blog rows with the highest blog_id and show the prettyname of the first one.
latest_blogs = pd.read_sql("SELECT prettyname from blog order by blog_id desc limit 2", conn)
latest_blogs['prettyname'].iloc[0]

'/episodes/2018/discovering-blind-spots-in-reinforcement-learning'

In [19]:
# Show the ten related_content rows with the highest content_id to check the inserts.
recent_related_content = pd.read_sql(
    "SELECT * from related_content order by content_id desc limit 10",
    conn,
)
recent_related_content

Unnamed: 0,content_id,blog_id,dest,type,title,body,created_date,blog_id2,duration
0,684,393,https://s3.amazonaws.com/dataskeptic.com/img/2...,homepage-image,Discovering Blind Spots in Reinforcement Learning,An intelligent agent trained in a simulated en...,2018-06-29 15:36:18,,-1.0
1,683,393,http://traffic.libsyn.com/dataskeptic/blind-sp...,mp3,Discovering Blind Spots in Reinforcement Learning,An intelligent agent trained in a simulated en...,2018-06-29 15:36:18,,1654.0
2,682,390,https://s3.amazonaws.com/dataskeptic.com/guest...,person,Gokula Krishnan Santhanam,Gokula Krishnan Santhanam is currently a maste...,2018-06-22 16:39:54,-1.0,
3,681,390,https://s3.amazonaws.com/dataskeptic.com/img/2...,homepage-image,Defending Against Adversarial Attacks,"In this weeks episode, our host Kyle intervie...",2018-06-22 16:29:59,,-1.0
4,680,390,http://traffic.libsyn.com/dataskeptic/defendin...,mp3,Defending Against Adversarial Attacks,"In this weeks episode, our host Kyle intervie...",2018-06-22 16:29:59,,1888.0
5,679,392,https://s3.amazonaws.com/dataskeptic.com/img/2...,homepage-image,Transfer Learning,"On a long car ride, Linhda and Kyle record a s...",2018-06-15 15:24:03,,-1.0
6,678,392,http://traffic.libsyn.com/dataskeptic/transfer...,mp3,Transfer Learning,"On a long car ride, Linhda and Kyle record a s...",2018-06-15 15:24:02,,1083.0
7,677,391,https://s3.amazonaws.com/dataskeptic.com/img/2...,homepage-image,Medical Imaging Training Techniques,Medical imaging is a highly effective tool use...,2018-06-08 17:39:12,,-1.0
8,676,391,http://traffic.libsyn.com/dataskeptic/medical-...,mp3,Medical Imaging Training Techniques,Medical imaging is a highly effective tool use...,2018-06-08 17:39:12,,1520.0
9,675,391,https://s3.amazonaws.com/dataskeptic.com/guest...,person,Gabriel Maicas,Gabriel Maicas is currently a Ph.D. candidate ...,2018-06-08 16:37:21,-1.0,
