#  Data extraction with urllib

In [15]:
import os
import sys
import urllib.request as ur

Set image url and path

In [24]:
imgURL = "https://gaussian37.github.io/blog/assets/img/kjsandkjs.jpg"
htmlURL = "https://gaussian37.github.io/blog/"
savePath1 = "./data/sample.jpg"
savePath2 = "./data/index.html"

### 1. urlretrieve : convinent to use

urlretrive is under the process of
1. save
2. open("r")
3. assign a variable
4. parsing
5. resave

In [17]:
ur.urlretrieve(imgURL, savePath1)

('./data/sample.jpg', <http.client.HTTPMessage at 0x13261ca0550>)

Show the image from ``"./data/sample.jpg"``
<img src="./data/sample.jpg" alt="Drawing" style="width: 200px;"/>

In [18]:
urllib.request.urlretrieve(htmlURL, savePath2)

('./data/index.html', <http.client.HTTPMessage at 0x13261ca0860>)

### 2. urlopen : use only if you need to process on extracted data

urlopen is under the process of 
1. assign a variable
2. parsing
3. save

In [21]:
f1 = ur.urlopen(imgURL).read()
f2 = ur.urlopen(htmlURL).read()

1st way

In [22]:
saveFile1 = open(savePath1, "wb") # w : write, r : read, a : add, b : binary
saveFile1.write(f1)
saveFile1.close()

Show the image from "./data/sample.jpg"
<img src="./data/sample.jpg" alt="Drawing" style="width: 200px;"/>

2th way

In [27]:
with open(savePath2, "wb") as saveFile2: # w : write, r : read, a : add, b : binary
    saveFile2.write(f2)

### 3. get url information

In [28]:
url = "https://www.udacity.com/"
mem = ur.urlopen(url)
print(mem)

<http.client.HTTPResponse object at 0x0000013261CB3AC8>


In [39]:
print("geturl : ", mem.geturl())
print()
print("status : ", mem.status) # 200 : normal, 404 : no page, 403 : reject, 500 : server error
print()
print("header : ", mem.getheaders())
print()
print("info : ", mem.info())
print()
print("read : ", mem.read(50))

geturl :  https://www.udacity.com/

status :  200

header :  [('Content-Type', 'text/html; charset=utf-8'), ('Content-Length', '264606'), ('Connection', 'close'), ('Age', '51108'), ('Cache-Control', 'max-age=2592000, public'), ('Date', 'Sat, 11 Aug 2018 19:52:45 GMT'), ('ETag', '"b3ebe34ca1e20c996e916e47f38b9034"'), ('Last-Modified', 'Sat, 11 Aug 2018 19:50:30 GMT'), ('Server', 'AmazonS3'), ('Set-Cookie', '_gaexp=GAX1.2.lYsDUp0dTGae_7xq5AB7VQ.17815.0; Path=/; Domain=.udacity.com; Max-Age=7776000;'), ('Vary', 'Accept-Encoding'), ('x-amz-id-2', 'zMz8KVmXxV2aUj4r4R5isLHRTA3+IiU3IC6Y1ML5PlzNmiZZshTM2elfeHi/u6yB69XRJ6q0srU='), ('x-amz-request-id', '3EFB7CB79DCC5CF4'), ('X-Frame-Options', 'DENY'), ('X-XSS-Protection', '1; mode=block'), ('X-Cache', 'Hit from cloudfront'), ('Via', '1.1 d98420743a69852491bbdea73f7680bd.cloudfront.net (CloudFront)'), ('X-Amz-Cf-Id', 's67AXtC66m422C6bhzHQWEmhdIQ79kOIWf5jJtSwWd4NvmvapNQY_g=='), ('X-Berlioz-Country', 'KR'), ('Access-Control-Allow-Headers', 'X-Berli

In [46]:
from urllib.parse import urlparse

print(urlparse("https://www.udacity.com/"))
print(urlparse("https://www.udacity.com/course/android-basics-nanodegree-by-google--nd803"))

ParseResult(scheme='https', netloc='www.udacity.com', path='/', params='', query='', fragment='')
ParseResult(scheme='https', netloc='www.udacity.com', path='/course/android-basics-nanodegree-by-google--nd803', params='', query='', fragment='')


In [50]:
from urllib.parse import urlencode

API = "https://www.ipify.org/"
values = {
    "format" : "json"
}   

In [51]:
print("before : ", values)
print("after : ", urlencode(values))

before :  {'format': 'json'}
after :  format=json


In [52]:
params = urlencode(values)
url = API + "?" + params
print("requested url", url)

requested url https://www.ipify.org/?format=json


In [58]:
reqData = ur.urlopen(url).read().decode('utf-8')
print("output", reqData)

output <!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>ipify - A Simple Public IP Address API</title>
    <meta name="description" content="ipify is a simple public IP address API, easy enough to integrate into any application in seconds.">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
    <link href="https://fonts.googleapis.com/css?family=Raleway" rel="stylesheet">

    <!--[if lt IE 9]>
      <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
      <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link href="/static/css/prism.css" rel="stylesheet" type="text/css">
    <link href="/static/css/style.css" rel="stylesheet" type="text/css">
    <link rel="apple-touch-icon" sizes="57x57" href="/