In [23]:
import requests
from bs4 import BeautifulSoup
import re

## 1 requests
### 1. Basic exceptions in requests
#### 1) ConnectionError
Internet connection error, such as DNS error.
#### 2) HTTPError
#### 3) URLRequired
Incomplete URL
#### 4) TooManyRedirects
#### 5) ConnectTimeout
Connect to remote server timeout
#### 6) Timeout
### 2. Basic instruction types on resources in requests
#### 1) get
#### 2) put
#### 3) post
Update and cover resources
#### 4) patch
Update specific position's resources
#### 5) delete
#### 6) head
Request head information
### 3. How to get html contents?
See codes below.

In [91]:
for i in range(2):
    ## Define URL for scraping.
    page = i + 1
    url = 'https://www.names4muslims.com/baby-boys.php?page=%s' % page
    #kv = {'user-agent':'Mozilla/5.0'} #for website which inspect web scrapper
    try:
        r=requests.get(url, timeout = 30)
        #r=requests.get(url, header = kv)
        r.raise_for_status() #check if the request is successful
        r.encoding = r.apparent_encoding
        print(r.text[:100])
        #r.content returns the two-digit contents like 
        #pictures or videos
    except:
        print("Fail to request!")

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible


## 2 BeautifulSoup
### 1. Basic Elements of BeautifulSoup Class
#### 1) Tag
**html**: whole document  
**body**: document body  
**title**: page name  
**a**: hyperbolic link  
**p**: paragraph  
**h1**: heading  
**br**: Empty HTML elements
#### 2) Name
tag.name: p
#### 3) Attributes
tag.attrs
#### 4) NavigableString
<>...</> tag.string
#### 5) Comment
Begin as '!'

In [58]:
bs = BeautifulSoup(r.text, 'html.parser')
print(bs.p.name)
print(bs.p.attributes)
print(bs.p.comment)
print(bs.p.string)
type(bs.p.string)

p
None
None
A Good and Beautiful Name is the Greatest Gifts You can Give Your Children.


bs4.element.NavigableString

### 2. Traversal of BeautifulSoup
The structure of html is like a tree, so there are three ways of traversing html, top-down, bottom-up and level-order. Just to mention, you can only use level-order traversal under same parent.
#### 1) top-down
.contents: return a list of all sons  
.children: return an iterator of all sons  
.descendants: return an iterator of all descendants
#### 2) bottom-up
.parent: return nearest parent  
.parents: return an iterator of all parents
#### 3) level-order
.next-sibling: return next sibling  
.previous_sibling: return previous sibling  
.next_siblings: return an itertor of all next siblings  
.previous_siblings: return an iterator of all previous siblings

In [70]:
print(bs.p.contents)
print(bs.p.parent.name)

['A Good and Beautiful Name is the Greatest Gifts You can Give Your Children.']
div


In [68]:
for parent in bs.p.parents:
    if parent is None:
        print(parent)
    else:
        print(parent.name)

div
div
div
body
html
[document]


### 3 Example of Web Scrapper
#### 1) Analyze the structure of html using BeautifulSoup and requests

In [19]:
def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def parsePage(ilt, html):
    bs = BeautifulSoup(html, 'html.parser')
    for td in bs.body.table.find_all('td'):
        try:
            if td.attrs['data-title'] == 'Name':
                ilt.append(td.string)
        except:
            continue
    
def saveNamesList(ilt):
    with open('GirlNamesList.txt', 'w') as f:
        f.write(str(ilt))
    f.closed

def main():
    page = 75
    infoList = []
    for i in range(page):
        try:
            url = 'https://www.names4muslims.com/baby-girls.php?page=' + str(i+1)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    saveNamesList(infoList)

if __name__ == "__main__":
    main()

#### 2) Extract names using regular expressions and requests

In [58]:
def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""
    
def parsePage(ilt, html):
    try:
        ilt.append(re.findall(r'\"Muslim Boy Name\: ([A-Za-z\-\' ]+)\"', html))
    except:
        ilt = ilt
    
def saveNamesList(ilt):
    with open('BoyNamesList.txt', 'w') as f:
        f.write(str(ilt))
    f.closed

def main():
    page = 75
    infoList = []
    for i in range(page):
        try:
            url = 'https://www.names4muslims.com/baby-boys.php?page=' + str(i+1)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    saveNamesList(infoList)

if __name__ == "__main__":
    main()