# Case Study - The Current - Part 2

* The Current is an alternative radio station
* We will pull information about the play list.

# Step 0 - Insert current progress

Copy over all the relevant code from part 1 of the lab.

http://www.thecurrent.org/playlist/2014-01-01/01

In [1]:
# Import modules here
import requests
from bs4 import BeautifulSoup

In [2]:
from composable import pipeable
from composable.strict import map, filter
from composablesoup import find, find_all, get_text, has_attr
from composablesoup.soup import find_parent, parents, children, find_previous_sibling, find_previous_siblings, find_next_sibling, find_next_siblings, find_previous_sibling
from composable.sequence import to_list, head
from composable.string import strip
from composable import from_toolz as tlz

In [3]:
# Read in the page here
s = requests.Session()
r = s.get('https://www.thecurrent.org/playlist/2014-01-01/01')
current = BeautifulSoup(r.content, "html.parser")

# Pull off the song start time

1. Inspect the element
    1. This one is tricky
    2. Time tag does not have a tag, but
    3. The surrounding div does have a class
2. Identify the html tag and class
3. Use `find_all` to make a list of all relevant tags
4. Pull off an example case
5. Write a function that extracts the start time.
6. Write a single pipe to extract the start time.
7. Confirm you have the right number of times.
8. Package your code in a function called `get_start_time`

in "time" tag, contained in "a" tag, contained in "div" tag with  class="two columns songTime"

In [4]:
current.find_all("div", class_='two columns songTime')

[<div class="two columns songTime">
 <a href="#song226645">
 <time>  1:59 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song196069">
 <time>  1:54 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song229900">
 <time>  1:51 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song235779">
 <time>  1:46 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song132616">
 <time>  1:44 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song224268">
 <time>  1:38 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song236492">
 <time>  1:34 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song237794">
 <time>  1:31 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song234211">
 <time>  1:27 </time>
 </a>
 </div>,
 <div class="two columns songTime">
 <a href="#song235959">
 <time>  1:23 </time>
 </a>
 </div>,
 <div class="two columns songT

In [5]:
ex_time = current.find_all("div", class_='two columns songTime')[0]
ex_time

<div class="two columns songTime">
<a href="#song226645">
<time>  1:59 </time>
</a>
</div>

In [6]:
ex_time.a.time

<time>  1:59 </time>

In [7]:
for tag in current.find_all("div", class_="two columns songTime"):
    print(tag.a.time.text.strip())

1:59
1:54
1:51
1:46
1:44
1:38
1:34
1:31
1:27
1:23
1:19
1:13
1:09
1:05
1:03
1:01


In [8]:
# alternative approach
for tag in current.find_all("div", class_="two columns songTime"):
    print(tag.find("time").text.strip())

1:59
1:54
1:51
1:46
1:44
1:38
1:34
1:31
1:27
1:23
1:19
1:13
1:09
1:05
1:03
1:01


In [9]:
current.find_all("div", class_="two columns songTime")[0]
current.find_all("div", class_="two columns songTime")[0].a.time.text.strip()

'1:59'

In [10]:
def get_times(soup: BeautifulSoup) -> list:
    return [t.a.time.text.strip() for t in soup.find_all("div", class_="two columns songTime")]

In [11]:
get_times(current)

['1:59',
 '1:54',
 '1:51',
 '1:46',
 '1:44',
 '1:38',
 '1:34',
 '1:31',
 '1:27',
 '1:23',
 '1:19',
 '1:13',
 '1:09',
 '1:05',
 '1:03',
 '1:01']

In [12]:
times = (current
     >> find_all("div", class_="two columns songTime")
     >> map(find("time"))
     >> map(get_text)
     >> map(strip)
)

In [13]:
len(times) == len(get_times(current)) == 16

True

# Alternate (simpler) approach

In [14]:
current.find_all("time", attrs={"datetime":False}) # would also work - lot cleaner too

[<time>  1:59 </time>,
 <time>  1:54 </time>,
 <time>  1:51 </time>,
 <time>  1:46 </time>,
 <time>  1:44 </time>,
 <time>  1:38 </time>,
 <time>  1:34 </time>,
 <time>  1:31 </time>,
 <time>  1:27 </time>,
 <time>  1:23 </time>,
 <time>  1:19 </time>,
 <time>  1:13 </time>,
 <time>  1:09 </time>,
 <time>  1:05 </time>,
 <time>  1:03 </time>,
 <time>  1:01 </time>]

# Pull address of the album art image address

Follow a similar process to pull off the web address of the album cover image. 


img tag, class contains = "album-art"

In [15]:
import re
album_art = re.compile(r'album-art')
current.find_all("img", attrs={"class": album_art})

[<img alt="We The Common" class="album-art lazyload" data-src="https://albumart.publicradio.org/mb/e2/e2749c25-c2b6-493e-a2bb-10898152bd2d_5158.jpg" src="" title="Thao and The Get Down Stay Down - We The Common"/>,
 <img alt="default album cover image" class="album-art" src="/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png"/>,
 <img alt="default album cover image" class="album-art" src="/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png"/>,
 <img alt="Wildewoman" class="album-art lazyload" data-src="https://albumart.publicradio.org/mb/5e/5e5c8b95-d04c-432f-8cd2-c1c8d99e6e5a_3556.jpg" src="" title="Lucius - Wildewoman"/>,
 <img alt="Frosting on the Beater" class="album-art lazyload" data-src="https://albumart.publicradio.org/mb/48/48445b64-d965-369a-af3c-8193de389fd8_3ff4.jpg" src="" title="The Posies - Frosting on the Beater"/>,
 <img alt="default album cover image" class="album-art" src="/ass

In [16]:
ex_art = current.find_all("img", attrs={"class": album_art})[0]
ex_art

<img alt="We The Common" class="album-art lazyload" data-src="https://albumart.publicradio.org/mb/e2/e2749c25-c2b6-493e-a2bb-10898152bd2d_5158.jpg" src="" title="Thao and The Get Down Stay Down - We The Common"/>

In [17]:
ex_art["src"] if ex_art["src"] else ex_art["data-src"]

'https://albumart.publicradio.org/mb/e2/e2749c25-c2b6-493e-a2bb-10898152bd2d_5158.jpg'

In [18]:
def get_art_links(soup):
    return [tag["src"] if tag["src"] else tag["data-src"] for tag in soup.find_all("img", attrs={"class": album_art})]

In [19]:
get_art_links(current)

['https://albumart.publicradio.org/mb/e2/e2749c25-c2b6-493e-a2bb-10898152bd2d_5158.jpg',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 'https://albumart.publicradio.org/mb/5e/5e5c8b95-d04c-432f-8cd2-c1c8d99e6e5a_3556.jpg',
 'https://albumart.publicradio.org/mb/48/48445b64-d965-369a-af3c-8193de389fd8_3ff4.jpg',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 'https://albumart.publicradio.org/mb/e9/e999c049-c65b-4c5e-ad12-5596998679c7_92f9.jpg',
 'https://albumart.publicradio.org/mb/d6/d62320e2-20c4-4589-aa76-2f8ac28447dd_e03b.jpg',
 'https://albumart.publicradio.org/mb/02/028b8602-3bde-495a-a7da-15594fc4f786_351a.jpg',
 'https://albumart.publicradio.org/mb/c9/c92f73ee-527f-42ed-a5

In [20]:
get_src_or_data_src = lambda tag: tag["src"] if tag["src"] else tag["data-src"]
art_links = (current
             >> find_all("img", attrs={"class": album_art})
             >> map(get_src_or_data_src)
            )
art_links

['https://albumart.publicradio.org/mb/e2/e2749c25-c2b6-493e-a2bb-10898152bd2d_5158.jpg',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 'https://albumart.publicradio.org/mb/5e/5e5c8b95-d04c-432f-8cd2-c1c8d99e6e5a_3556.jpg',
 'https://albumart.publicradio.org/mb/48/48445b64-d965-369a-af3c-8193de389fd8_3ff4.jpg',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png',
 'https://albumart.publicradio.org/mb/e9/e999c049-c65b-4c5e-ad12-5596998679c7_92f9.jpg',
 'https://albumart.publicradio.org/mb/d6/d62320e2-20c4-4589-aa76-2f8ac28447dd_e03b.jpg',
 'https://albumart.publicradio.org/mb/02/028b8602-3bde-495a-a7da-15594fc4f786_351a.jpg',
 'https://albumart.publicradio.org/mb/c9/c92f73ee-527f-42ed-a5

In [21]:
len(art_links) == len(get_art_links(current)) == 16

True

Via parent approach (skipped): Find figure tag, figure tag contains "a" tag, "a" tag contains img tag of interest

# Putting it all together

* Make a function for each of the previous steps
* Make an overall function
    * input is a soup
    * output is a list of lists

**Hint:** You should use `zip` to put all the information together.

In [22]:
# Functions from pt 1:

def get_titles(soup: BeautifulSoup) -> list:
    return [tag.text for tag in soup.find_all("h5", class_="title")]

def get_artists(soup: BeautifulSoup) -> list:
    return [tag.text for tag in soup.find_all("h5", class_="artist")]

# Functions for this part defined above

In [23]:
def get_songs_data(soup):
    titles = get_titles(soup)
    artists = get_artists(soup)
    times = get_times(soup)
    arts = get_art_links(soup)
    return [data for data in zip(titles, artists, times, arts)] # convert to list of lists rather than list of tuples

In [24]:
get_songs_data(current)

[('Holy Roller',
  'Thao and The Get Down Stay Down',
  '1:59',
  'https://albumart.publicradio.org/mb/e2/e2749c25-c2b6-493e-a2bb-10898152bd2d_5158.jpg'),
 ('Kingdom of Rust',
  'Doves',
  '1:54',
  '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png'),
 ('Black Dog',
  'Frankie Lee',
  '1:51',
  '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png'),
 ('Turn It Around',
  'Lucius',
  '1:46',
  'https://albumart.publicradio.org/mb/5e/5e5c8b95-d04c-432f-8cd2-c1c8d99e6e5a_3556.jpg'),
 ('Flavor of the Month',
  'The Posies',
  '1:44',
  'https://albumart.publicradio.org/mb/48/48445b64-d965-369a-af3c-8193de389fd8_3ff4.jpg'),
 ('Potential Wife',
  'Strange Names',
  '1:38',
  '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb00055e9a8ac1d.png'),
 ('24 Hours',
  'Sky Ferreira',
  '1:34',
  '/assets/album-cover-default-32217dc68a771f3a44aa2b7a640cf91133b61bd1f2ae68c9ddb0005