# School Board Minutes

Scrape all of the school board minutes from http://www.mineral.k12.nv.us/pages/School_Board_Minutes

Save a CSV called `minutes.csv` with the date and the URL to the file. The date should be formatted as YYYY-MM-DD.

**Bonus:** Download the PDF files

**Bonus 2:** Use [PDF OCR X](https://solutions.weblite.ca/pdfocrx/index.php) on one of the PDF files and see if it can be converted into text successfully.

* **Hint:** If you're just looking for links, there are a lot of other links on that page! Can you look at a link to tell whether it links to minutes or not? You'll want to use an "if" statement.
* **Hint:** You could also filter out bad links later on using pandas instead of when scraping
* **Hint:** If you get a weird error that you can't really figure out, you can always tell Python to just ignore it using `try` and `except`, like below. Python will try to do the stuff inside of 'try', but if it hits an error it will skip right out.
* **Hint:** Remember the codes at http://strftime.org
* **Hint:** If you have a date that you've parsed, you can use `.dt.strftime` to turn it into a specially-formatted string. You use the same codes (like %B etc) that you use for converting strings into dates.

```python
try:
  blah blah your code
  your code
  your code
except:
  pass
```

In [25]:
import requests
import re
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# Download the school board minutes index page and parse it for the cells below.
url = "http://www.mineral.k12.nv.us/pages/School_Board_Minutes"
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server; raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_html = response.content
soup_doc = BeautifulSoup(raw_html, "html.parser")
print(type(soup_doc))

<class 'bs4.BeautifulSoup'>


In [3]:
### TEST VIEWS OF DATA
# raw_html
# print(soup_doc)
# print(soup_doc.prettify())

In [None]:
### CODE BROKE BECAUSE OF MISORDERED TAGS IN 2018
# Collect every <a> element found in the parsed page. Everything below this
# statement is commented-out scratch work: probes of a single link
# (index 25) and an early attempt at printing only links whose href
# contains 'pdf'.
links = soup_doc.find_all('a')
# links[25]
# links[25]['href']
# links[25].string
# links[25].text
# links[25]['span']
# links[25]['color']

# for link in links:
#     if 'pdf' in link['href']:
#         print(link.string)
#         print(link['href'])
#         print(link['span'])
#         print('====================================================================')

In [None]:
### CONTINUED TESTING AROUND HREF
# Walk every <a> element on the page, report whether its href mentions 'pdf',
# and print the element's parent so the surrounding markup can be inspected.
links = soup_doc.find_all('a')
# links[25]
# links[25]['href']
# links[25].string
# links[25].text
# links[25]['span']
# links[25]['color']

for link in links:
    # Indexing with link['href'] raises KeyError for an anchor that has no
    # href attribute; .get() lets such anchors fall into the 'no pdf' branch.
    href = link.get('href') or ''
    if 'pdf' in href:
        print('GOOD GOOD GOOD')
    else:
        print('no pdf')
    # The parent is printed in both cases, right after the label.
    print(link.parent)
    print('====================================================================')

In [None]:
### INDIVIDUAL LINE TESTS
# string = string.replace(u'\xa0', u'')
# Keep only the <p> elements at positions 19 through 77 of the parsed page.
# NOTE(review): the slice bounds are hard-coded; presumably they bracket the
# minutes listing on the page as it was when scraped -- confirm before reuse.
paras = soup_doc.find_all('p')[19:78]
# links
# Display a single paragraph from the slice to inspect its structure.
paras[1]
# paras[1].contents
# paras[1].find('a')
# paras[1].a['href']

In [13]:
### WORKING CODE WITH DIRTY RESPONSES
# Build one [label, url] pair for each paragraph in the hard-coded slice of
# <p> elements (skipping the first one). Nothing is filtered at this stage,
# so blank and heading paragraphs end up in the list as well.
minutes_list = []
paras = soup_doc.find_all('p')[19:78]
for para in paras[1:]:
    # Visible text of the paragraph with non-breaking spaces removed.
    link_label = para.text.replace(u'\xa0', u'')
    # A paragraph without an anchor gets the 'http://' placeholder.
    anchor = para.find('a')
    link_url = anchor['href'] if anchor else 'http://'
    minutes = [link_label, link_url]
    minutes_list.append(minutes)
minutes_list

[['June 4, 2019', '/files/6.4.19_minutes.pdf'],
 ['May 28, 2019', '/files/5.28.19_minutes.pdf'],
 ['May 21, 2019 CANCELLED', 'http://'],
 ['May 7, 2019', '/files/5.7.19_minutes.pdf'],
 ['April 23, 2019', '/files/4.23.19_minutes.pdf'],
 ['April 8, 2019', '/files/4.8.19_minutes.pdf'],
 ['March 19, 2019', '/files/3.5.19_minutes.pdf'],
 ['March 5, 2019', '/files/3.5.19.pdf'],
 ['February 26, 2019', '/files/2.26.19_minutes.pdf'],
 ['February 5, 2019', '/files/2.5.19_minutes.pdf'],
 ['January 22, 2019', '/files/January_22_minutes.pdf'],
 ['January 8, 2019', '/files/January_8_minutes.pdf'],
 ['', 'http://'],
 ['2018 Board Meeting Minutes', 'http://'],
 ['December 20, 2018', '/files/12.20.18_minutes.pdf'],
 ['December 4, 2018', '/files/12.4.18_minutes.pdf'],
 ['November 20, 2018', '/files/11.20.18.pdf'],
 ['November 7, 2018', 'http://'],
 ['October 16, 2018', 'http://'],
 ['September 25, 2018', '/files/9.25.18_minutes.pdf'],
 ['September 13, 2018', '/files/9.13.18_minutes.pdf'],
 ['September 4

In [53]:
### WORKING WITH DELETED ROWS
# Same scrape as the unfiltered version, except that paragraphs whose cleaned
# text is empty or contains 'Meeting' are left out of the result.
minutes_list = []
paras = soup_doc.find_all('p')[19:78]
for para in paras[1:]:
    # Visible text of the paragraph with non-breaking spaces removed.
    link_label_raw = para.text.replace(u'\xa0', u'')
    if not link_label_raw or 'Meeting' in link_label_raw:
        continue
    # A paragraph without an anchor gets the 'http://' placeholder.
    anchor = para.find('a')
    link_url = anchor['href'] if anchor else 'http://'
    minutes = [link_label_raw, link_url]
    minutes_list.append(minutes)
# Put the header row first. It names three columns, while each data row built
# above holds only two values (label and url).
minutes_list.insert(0, ['link_label_raw', 'link_url', 'comment'])
minutes_list

[['link_label_raw', 'link_url', 'comment'],
 ['June 4, 2019', '/files/6.4.19_minutes.pdf'],
 ['May 28, 2019', '/files/5.28.19_minutes.pdf'],
 ['May 21, 2019 CANCELLED', 'http://'],
 ['May 7, 2019', '/files/5.7.19_minutes.pdf'],
 ['April 23, 2019', '/files/4.23.19_minutes.pdf'],
 ['April 8, 2019', '/files/4.8.19_minutes.pdf'],
 ['March 19, 2019', '/files/3.5.19_minutes.pdf'],
 ['March 5, 2019', '/files/3.5.19.pdf'],
 ['February 26, 2019', '/files/2.26.19_minutes.pdf'],
 ['February 5, 2019', '/files/2.5.19_minutes.pdf'],
 ['January 22, 2019', '/files/January_22_minutes.pdf'],
 ['January 8, 2019', '/files/January_8_minutes.pdf'],
 ['December 20, 2018', '/files/12.20.18_minutes.pdf'],
 ['December 4, 2018', '/files/12.4.18_minutes.pdf'],
 ['November 20, 2018', '/files/11.20.18.pdf'],
 ['November 7, 2018', 'http://'],
 ['October 16, 2018', 'http://'],
 ['September 25, 2018', '/files/9.25.18_minutes.pdf'],
 ['September 13, 2018', '/files/9.13.18_minutes.pdf'],
 ['September 4, 2018', '/files/9

In [52]:
### ABANDONED STRIP ATTEMPT, SWITCHING TO PANDAS FOR CLEANUP
# minutes_list = []
# paras = soup_doc.find_all('p')[19:78]
# for para in paras[1:]:
#     minutes = []
#     link_label_raw = para.text.replace(u'\xa0', u'')
#     if not link_label_raw: 
#         pass
#     elif 'Meeting' in link_label_raw:
#         pass
#     else:
#         comment = re.findall(r"\d\d\d\d(.*)", link_label_raw)
#         link_label = re.findall(r"(.* \d\d\d\d)", link_label_raw)
# #         link_label = datetime.strptime(link_label_raw, '%B %d, %Y')
#         if para.find('a'):
#             link_url = para.a['href']
#         else:
#             comment.append('missing link')
#         minutes.append(link_url)
#         minutes.append(link_label)
#         minutes.append(comment)
#         minutes_list.append(minutes)
# minutes_list.insert(0,['link_label','link_url','comment'])
# minutes_list
### *************************************************************************************************************

In [21]:
# Inspect the Python type of the second field (the link URL) of row 13.
type(minutes_list[13][1])

str