### OCI Data Science - Useful Tips
Everything stored in the <span style="background-color: #d5d8dc ">/home/datascience</span> folder is now stored on your block volume drive. The <span style="background-color: #d5d8dc ">ads-examples</span> folder has moved outside of your working space. Notebook examples are now accessible through a Launcher tab "Notebook Examples" button.
<details>
<summary><font size="2">1. Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">2. OCI Configuration and Key Files Set Up</font></summary><p>Follow the instructions in the getting-started notebook. That notebook is accessible via the "Getting Started" Launcher tab button.</p>
</details>
<details>
<summary><font size="2">3. Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">4. Typical Cell Imports and Settings</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import MLData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">5. Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [58]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re

In [59]:
headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
page = "https://www.transfermarkt.com/lionel-messi/profil/spieler/28003"
pageTree = requests.get(page, headers=headers)
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')

In [60]:
playerName = pageSoup.find_all("h1", {"itemprop": "name"})
playerName = playerName[0].text
playerName

'Lionel Messi'

In [61]:
birthDate = pageSoup.find_all("span", {"itemprop": "birthDate"})
birthDate = birthDate[0].text.replace("\t", "").replace("\n", "")
reg = r'\(.*?\)'
birthDate = re.sub(reg,'',birthDate)
birthDate = datetime.strptime(birthDate,'%b %d, %Y').date()

In [62]:
birthDate

datetime.date(1987, 6, 24)

In [32]:
birthPlace = pageSoup.find_all("span", {"itemprop": "birthPlace"})
birthPlace[0].text.replace("\t", "").replace("\n", "")

'Rosario'

In [33]:
nationality = pageSoup.find_all("span", {"itemprop": "nationality"})
nationality[0].text.replace("\t", "").replace("\n", "")

'Argentina'

In [34]:
height = pageSoup.find_all("span", {"itemprop": "height"})
height = int(float(height[0].text.replace("\t", "").replace("\n", "").replace("m","").replace(" ","").replace(",","."))*100)
height

170

In [35]:
position = pageSoup.find_all("span", {"class": "dataValue"})
position = position[4].text.replace("\t", "").replace("\n", "")
position

'Right Winger                            '

In [36]:
team = pageSoup.find_all("span", {"itemprop":"affiliation"})
team[0].text

'FC Barcelona'

In [37]:
marketValue = pageSoup.find_all("div", {"class": "right-td"})
currentMarketValue= marketValue[0].text.replace("\t", "").replace("\n", "").replace(" ","")
marketValueLastUpdate = marketValue[1].text.replace("\t", "").replace("\n", "")
marketValueLastUpdate = datetime.strptime(marketValueLastUpdate,'%b %d, %Y').date()
maxMarketValue= marketValue[2].text.replace("\n", "").replace("\t", "").split("                                            ")[1]
maxMarketValueDate = marketValue[2].text.replace("\n", "").replace("\t", "").split("                                            ")[2]
maxMarketValueDate = datetime.strptime(maxMarketValueDate,'%b %d, %Y').date()

In [38]:
currentMarketValue

'€112.00m'

In [39]:
marketValueLastUpdate

datetime.date(2020, 4, 8)

In [40]:
maxMarketValue

'€180.00m'

In [41]:
maxMarketValueDate

datetime.date(2018, 1, 1)

In [42]:
#####################################
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
from urllib.request import Request, urlopen
import time

In [45]:
headers={'User-Agent':'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)'}
for i in range(1,20):
    #prx = proxy_generator()
    print('#############')    
    #headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    #headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    #url = "https://www.transfermarkt.com.tr/lionel-messi/profil/spieler/"+str(i)
    #pageTree = requests.get(page, headers=headers)
    #pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    headers = {'User-Agent':'Mozilla/5.0'}
    page = "https://www.transfermarkt.com/marian-hristov/profil/spieler/"+str(i)
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    #pageTree = requests.post(url=url,headers=headers,timeout=7)
    #page = urlopen(pageTree,timeout=10).read()
    #pageSoup = BeautifulSoup(page)
    
    try:
        playerName = pageSoup.find_all("h1", {"itemprop": "name"})
        playerName = playerName[0].text
        
        print(playerName)

        birthDate = pageSoup.find_all("span", {"itemprop": "birthDate"})
        birthDate = birthDate[0].text.replace("\t", "").replace("\n", "")
        reg = r'\(.*?\)'
        birthDate = re.sub(reg,'',birthDate)
        birthDate = datetime.strptime(birthDate,'%b %d, %Y').date()
        
        print(birthDate)

        birthPlace = pageSoup.find_all("span", {"itemprop": "birthPlace"})
        birthPlace = birthPlace[0].text.replace("\t", "").replace("\n", "")
        
        print(birthPlace)

        nationality = pageSoup.find_all("span", {"itemprop": "nationality"})
        nationality = nationality[0].text.replace("\t", "").replace("\n", "")
        
        print(nationality)

        height = pageSoup.find_all("span", {"itemprop": "height"})
        height = int(float(height[0].text.replace("\t", "").replace("\n", "").replace("m","").replace(" ","").replace(",","."))*100)

        print(height)
        
        position = pageSoup.find_all("span", {"class": "dataValue"})
        position = position[4].text.replace("\t", "").replace("\n", "")

        print(position)
        
        team = pageSoup.find_all("span", {"itemprop":"affiliation"})
        team = team[0].text
        
        print(team)

        marketValue = pageSoup.find_all("div", {"class": "right-td"})
        currentMarketValue = marketValue[0].text.replace("\t", "").replace("\n", "").replace(" ","")
        marketValueLastUpdate = marketValue[1].text.replace("\t", "").replace("\n", "")
        marketValueLastUpdate = datetime.strptime(marketValueLastUpdate,'%b %d, %Y').date()
        maxMarketValue= marketValue[2].text.replace("\n", "").replace("\t", "").split("                                            ")[1]
        maxMarketValueDate = marketValue[2].text.replace("\n", "").replace("\t", "").split("                                            ")[2]
        maxMarketValueDate = datetime.strptime(maxMarketValueDate,'%b %d, %Y').date()
        
        print(marketValue)
        time.sleep(3)
        print('#############')
    except Exception as e:
        print(str(e))
    

#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range
#############
list index out of range


In [42]:
pageTree

<Response [403]>

In [43]:
pageSoup



In [24]:
import asyncio
from proxybroker import Broker

In [26]:
async def show(proxies):
    while True:
        proxy = await proxies.get()
        if proxy is None: break
        print('Found proxy: %s' % proxy)

proxies = asyncio.Queue()
broker = Broker(proxies)
tasks = asyncio.gather(
    broker.find(types=['HTTP', 'HTTPS'], limit=10),
    show(proxies))

loop = asyncio.get_event_loop()

In [39]:
response = requests.get("https://sslproxies.org/") 

In [40]:
soup = BeautifulSoup(response.content, 'html5lib') 

In [None]:
list(map(lambda x:x[0]+':'+x[1], list(zip(map(lambda x:x.text, soup.findAll('td')[::8]),map(lambda x:x.text, soup.findAll('td')[1::8]))))) 

In [62]:
from random import choice
def proxy_generator():
    response = requests.get("https://sslproxies.org/")
    soup = BeautifulSoup(response.content, 'html5lib')
    proxy = {'https': choice(list(map(lambda x:x[0]+':'+x[1], list(zip(map(lambda x:x.text, 
	   soup.findAll('td')[::8]), map(lambda x:x.text, soup.findAll('td')[1::8]))))))}
    return proxy

In [63]:
x = proxy_generator()
x

{'https': '103.233.158.34:8888'}

In [1]:
from lxml.html import fromstring
import requests
from itertools import cycle
import traceback

def get_proxies():
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    proxies = set()
    for i in parser.xpath('//tbody/tr')[:10]:
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    return proxies


#If you are copy pasting proxy ips, put in the list below
#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
proxies = get_proxies()
proxy_pool = cycle(proxies)

url = 'https://httpbin.org/ip'
for i in range(1,11):
    #Get a proxy from the pool
    proxy = next(proxy_pool)
    print("Request #%d"%i)
    try:
        response = requests.get(url,proxies={"http": proxy, "https": proxy})
        print(response.json())
    except:
        #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. 
        #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url 
        print("Skipping. Connnection error")


Request #1
Skipping. Connnection error
Request #2
Skipping. Connnection error
Request #3
Skipping. Connnection error
Request #4
Skipping. Connnection error
Request #5
Skipping. Connnection error
Request #6
Skipping. Connnection error
Request #7
Skipping. Connnection error
Request #8
Skipping. Connnection error
Request #9
Skipping. Connnection error
Request #10
Skipping. Connnection error


In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import random

ua = UserAgent() # From here we generate a random user agent
proxies = [] # Will contain proxies [ip, port]

def main():
    # Retrieve latest proxies
    proxies_req = Request('https://www.sslproxies.org/')
    proxies_req.add_header('User-Agent', ua.random)
    proxies_doc = urlopen(proxies_req).read().decode('utf8')

    soup = BeautifulSoup(proxies_doc, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')

  # Save proxies in the array
    for row in proxies_table.tbody.find_all('tr'):
        proxies.append({
          'ip':   row.find_all('td')[0].string,
          'port': row.find_all('td')[1].string
        })
    
    proxy_index = random_proxy()
    proxy = proxies[proxy_index]
    
    for n in range(1, 100): 
        req = Request('http://icanhazip.com')
        req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')
        
        if n % 10 == 0:
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]

        # Make the call
        try:
            my_ip = urlopen(req).read().decode('utf8')
            print('#' + str(n) + ': ' + my_ip)
        except: # If error, delete this proxy and find another one
            del proxies[proxy_index]
            print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]
    
def random_proxy():
    return random.randint(0, len(proxies) - 1)

if __name__ == '__main__':
    main()

#1: 206.196.118.216

#2: 206.196.118.216

#3: 206.196.118.216

#4: 206.196.118.216



In [5]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import random
ua = UserAgent() # From here we generate a random user agent
proxies = [] # Will contain proxies [ip, port]
proxies_req = Request('https://www.sslproxies.org/')
proxies_req.add_header('User-Agent', ua.random)
proxies_doc = urlopen(proxies_req).read().decode('utf8')

In [31]:
def main():
    proxies_req = Request('https://www.sslproxies.org/')
    proxies_req.add_header('User-Agent', ua.random)
    proxies_doc = urlopen(proxies_req).read().decode('utf8')

    soup = BeautifulSoup(proxies_doc, 'html.parser')
    proxies_table = soup.find(id='proxylisttable')

    # Save proxies in the array
    for row in proxies_table.tbody.find_all('tr'):
        proxies.append({
        'ip':   row.find_all('td')[0].string,
        'port': row.find_all('td')[1].string
      })
        
    proxy_index = random_proxy()
    proxy = proxies[proxy_index]

    for n in range(1, 5):
        req = Request('http://icanhazip.com')
        req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')

        # Every 10 requests, generate a new proxy
        if n % 10 == 0:
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]
    
    my_ip = urlopen(req).read().decode('utf8')

def random_proxy():
    return random.randint(0, len(proxies) - 1)
        
if __name__ == '__main__':
    main()

RemoteDisconnected: Remote end closed connection without response

In [30]:
proxies

[{'ip': '83.97.23.90', 'port': '18080'},
 {'ip': '188.68.56.248', 'port': '3128'},
 {'ip': '95.174.67.50', 'port': '18080'},
 {'ip': '104.41.54.53', 'port': '3128'},
 {'ip': '103.141.46.154', 'port': '8080'},
 {'ip': '81.201.60.130', 'port': '80'},
 {'ip': '213.230.107.125', 'port': '3128'},
 {'ip': '138.201.223.250', 'port': '31288'},
 {'ip': '64.110.145.126', 'port': '3128'},
 {'ip': '206.196.118.206', 'port': '3838'},
 {'ip': '142.44.221.126', 'port': '8080'},
 {'ip': '18.223.103.221', 'port': '3838'},
 {'ip': '191.33.228.162', 'port': '80'},
 {'ip': '88.99.10.250', 'port': '1080'},
 {'ip': '139.99.105.5', 'port': '80'},
 {'ip': '200.73.128.5', 'port': '8080'},
 {'ip': '46.218.155.194', 'port': '3128'},
 {'ip': '88.99.10.252', 'port': '1080'},
 {'ip': '162.144.36.250', 'port': '3838'},
 {'ip': '103.87.236.46', 'port': '41183'},
 {'ip': '159.224.243.185', 'port': '37793'},
 {'ip': '18.223.213.237', 'port': '3838'},
 {'ip': '182.253.204.66', 'port': '31758'},
 {'ip': '128.0.179.234', 

Proxy 80.94.229.130:8080 deleted.


In [25]:
import urllib.request as urllib2
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open("http://www.transfermarkt.com")


for i in range(1,20):
    #prx = proxy_generator()
    print('#############')    
    #headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}
    #headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    #url = "https://www.transfermarkt.com.tr/lionel-messi/profil/spieler/"+str(i)
    #pageTree = requests.get(page, headers=headers)
    #pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = opener.open("https://www.transfermarkt.co.uk/lionel-messi/profil/spieler/"+str(i))
    #page = "https://www.transfermarkt.co.uk/lionel-messi/profil/spieler/"+str(i)

#############
#############
#############
#############
#############
#############
#############


HTTPError: HTTP Error 403: Forbidden