In [2]:
import requests
import pandas as pd
# These libraries are collections of code written by other people: modules containing functions and classes you can use.
# "requests" and "pd" are modules. Calling requests.get(url) returns a Response object,
# .text is an attribute of that object, and pd.read_html is a function in the pandas module.

# Send an HTTP GET request to the Wikipedia page; the result is stored in `response`
url = 'https://en.wikipedia.org/wiki/List_of_NHL_scoring_leaders_by_season'
response = requests.get(url)
print(response.status_code)
# 200 means the request was successful and the server is returning the requested data

print(type(response), type(response.text)) # a Response object, and its .text attribute which is a plain str

# Print the HTML source of the page
print(response.text)

200
<class 'requests.models.Response'> <class 'str'>
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of NHL scoring leaders by season - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-

In [3]:
from io import StringIO

# pd.read_html parses the HTML and returns a list of pandas DataFrames (one per table it finds).
# Passing the HTML as a literal string is deprecated since pandas 2.1, so the text is wrapped
# in a file-like StringIO object.
tables = pd.read_html(StringIO(response.text)) # or BytesIO(response.content)
# .text would be preferred for textual responses, such as an HTML or XML document,
# and .content would be preferred for "binary" filetypes, such as an image or PDF file
print('tables:',type(tables), '\ntables[0]:', type(tables[0]))

tables: <class 'list'> 
tables[0]: <class 'pandas.core.frame.DataFrame'>


In [48]:
# `tables` is a list of DataFrames, so its length is the number of tables that were parsed
print('How many tables:', len(tables))

How many tables: 1


In [4]:
# Printing the list shows the text representation of every DataFrame inside it
print(tables)

[             Season          Top scorer(s)                 Team  \
0           1917–18       Joe Malone (1) *   Montreal Canadiens   
1           1918–19    Newsy Lalonde (1) *   Montreal Canadiens   
2           1919–20       Joe Malone (2) *     Quebec Athletics   
3           1920–21         Babe Dye (1) *       Multiple teams   
4           1921–22  Punch Broadbent (1) *      Ottawa Senators   
..              ...                    ...                  ...   
116         2019–20   David Pastrňák (1) †        Boston Bruins   
117         2020–21  Auston Matthews (1) †  Toronto Maple Leafs   
118         2021–22  Auston Matthews (2) †  Toronto Maple Leafs   
119  Ongoing season         Ongoing season       Ongoing season   
120         2022–23       Connor McDavid †      Edmonton Oilers   

              Goals              GP            G/GP             Ref  
0                44              20             2.2             [1]  
1                23              17            1.35   

In [5]:
# Write the first table to a comma-separated file, leaving out the DataFrame index column
tables[0].to_csv('hockeytable.csv', index = False, sep = ',', encoding='utf-8')
# Microsoft Excel is unable to properly display UTF-8 compliant CSV files when they contain non-English characters
# NOTE(review): encoding='utf-8-sig' (UTF-8 with a byte-order mark) is the usual workaround for Excel;
# confirm it suits every program that reads this file before switching
# Reads fine in R when you do read_csv('hockeytable.csv')

In [6]:
import requests
import pandas as pd

# Send an HTTP GET request to the Wikipedia page about the NHL season structure.
# This reuses (and overwrites) the `url` and `response` variables from the earlier request.
url = 'https://en.wikipedia.org/wiki/Season_structure_of_the_NHL'
response = requests.get(url)
# Print the HTTP status code of the response
print(response.status_code)

200


In [7]:
# Print the HTML source of the page that was just requested
print(response.text)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Season structure of the NHL - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=document.

In [8]:
from io import StringIO

# Passing the HTML as a literal string is deprecated since pandas 2.1, so the text is wrapped
# in a file-like StringIO object.
tables = pd.read_html(StringIO(response.text)) # or BytesIO(response.content)
# .text would be preferred for textual responses, such as an HTML or XML document,
# and .content would be preferred for "binary" filetypes, such as an image or PDF file
# This is a list of pandas dataframes
print('tables:',type(tables), '\ntables[0]:', type(tables[0]))

tables: <class 'list'> 
tables[0]: <class 'pandas.core.frame.DataFrame'>


In [9]:
# The length of the list is the number of tables parsed from this page
print('How many tables:', len(tables))

How many tables: 9


In [55]:
# Printing the whole list runs all the DataFrames together, which is hard to read
print(tables)

[     Western Conference                        Eastern Conference  \
       Pacific Division     Central Division    Atlantic Division   
0         Anaheim Ducks      Arizona Coyotes        Boston Bruins   
1        Calgary Flames   Chicago Blackhawks       Buffalo Sabres   
2       Edmonton Oilers   Colorado Avalanche    Detroit Red Wings   
3     Los Angeles Kings         Dallas Stars     Florida Panthers   
4       San Jose Sharks       Minnesota Wild   Montreal Canadiens   
5        Seattle Kraken  Nashville Predators      Ottawa Senators   
6     Vancouver Canucks      St. Louis Blues  Tampa Bay Lightning   
7  Vegas Golden Knights        Winnipeg Jets  Toronto Maple Leafs   

                          
   Metropolitan Division  
0    Carolina Hurricanes  
1  Columbus Blue Jackets  
2      New Jersey Devils  
3     New York Islanders  
4       New York Rangers  
5    Philadelphia Flyers  
6    Pittsburgh Penguins  
7    Washington Capitals  ,                             Division 

In [10]:
# Show each DataFrame under its own "Table <index>:" heading
for idx, frame in enumerate(tables):
    print(f"Table {idx}:", frame, sep="\n")

Table 0:
     Western Conference                        Eastern Conference  \
       Pacific Division     Central Division    Atlantic Division   
0         Anaheim Ducks      Arizona Coyotes        Boston Bruins   
1        Calgary Flames   Chicago Blackhawks       Buffalo Sabres   
2       Edmonton Oilers   Colorado Avalanche    Detroit Red Wings   
3     Los Angeles Kings         Dallas Stars     Florida Panthers   
4       San Jose Sharks       Minnesota Wild   Montreal Canadiens   
5        Seattle Kraken  Nashville Predators      Ottawa Senators   
6     Vancouver Canucks      St. Louis Blues  Tampa Bay Lightning   
7  Vegas Golden Knights        Winnipeg Jets  Toronto Maple Leafs   

                          
   Metropolitan Division  
0    Carolina Hurricanes  
1  Columbus Blue Jackets  
2      New Jersey Devils  
3     New York Islanders  
4       New York Rangers  
5    Philadelphia Flyers  
6    Pittsburgh Penguins  
7    Washington Capitals  
Table 1:
                     

In [11]:
# When the output is too large for the console, write it to a text file instead.
# Each table is preceded by a blank line and a "Table <index>:" heading.
with open("output.txt", "w", encoding='utf-8') as file:
    file.writelines(f"\nTable {i}:\n{table!s}" for i, table in enumerate(tables))

In [12]:
# Pick the table index you are interested in. We are interested in index 0 and 1.
# Indexing the list returns a single DataFrame, which prints on its own.
print(tables[0])
print(tables[1])

     Western Conference                        Eastern Conference  \
       Pacific Division     Central Division    Atlantic Division   
0         Anaheim Ducks      Arizona Coyotes        Boston Bruins   
1        Calgary Flames   Chicago Blackhawks       Buffalo Sabres   
2       Edmonton Oilers   Colorado Avalanche    Detroit Red Wings   
3     Los Angeles Kings         Dallas Stars     Florida Panthers   
4       San Jose Sharks       Minnesota Wild   Montreal Canadiens   
5        Seattle Kraken  Nashville Predators      Ottawa Senators   
6     Vancouver Canucks      St. Louis Blues  Tampa Bay Lightning   
7  Vegas Golden Knights        Winnipeg Jets  Toronto Maple Leafs   

                          
   Metropolitan Division  
0    Carolina Hurricanes  
1  Columbus Blue Jackets  
2      New Jersey Devils  
3     New York Islanders  
4       New York Rangers  
5    Philadelphia Flyers  
6    Pittsburgh Penguins  
7    Washington Capitals  
                            Division  \

In [59]:
# Export the two pandas dataframes to csv, leaving out the index column.
tables[0].to_csv('hockeydivisions.csv', index = False, sep = ',', encoding='utf-8')
# NOTE(review): 'hockeytable.csv' is the same filename the scoring-leaders table was written to
# earlier in this notebook, so this line overwrites that export — confirm this is intended.
tables[1].to_csv('hockeytable.csv', index = False, sep = ',', encoding='utf-8')
# Make sure these files aren't open in another program when you rerun the code

In [20]:
import requests
from bs4 import BeautifulSoup

# The page we want to scrape: a product collection on the pointzero.ca shop
url = "https://www.pointzero.ca/collections/fw22-sweaters-chandails"

# requests.get returns a Response object that holds the HTML of the page
response = requests.get(url)
print(type(response))

<class 'requests.models.Response'>


In [21]:
# This creates a Beautiful Soup object from the response text, using Python's
# built-in 'html.parser' to parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [22]:
# Printing the soup object shows the HTML as Beautiful Soup parsed it
print(soup)

<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<script async="" src="//cdn.shopify.com/s/files/1/0586/8575/1504/t/126/assets/geolizr-lib.js?v=121498662020399124351637330967" type="text/javascript"></script>
<script>
    if(typeof Geolizr === "undefined") {
        var Geolizr = {};
        Geolizr.currencyDynamicCountry = false;
        Geolizr.version = 20200327;
    }

    // save geolizr-lib.js url to the sessionStorage
    sessionStorage.setItem('geolizr_lib_url', "//cdn.shopify.com/s/files/1/0586/8575/1504/t/126/assets/geolizr-lib.js?v=121498662020399124351637330967");
    if(typeof Geolizr.events === "undefined") Geolizr.events = {};
    if(typeof Geolizr.geolizrEvents === "undefined") Geolizr.geolizrEvents = [];

    Geolizr.config = function(config) {
        Geolizr.currency_widget_enabled = config.currency_widget_enabled === "true" || false;
        Geolizr.shopCurrency = 'CAD';
        Geolizr.shopifyFormatMoneySet = false;
        Geolizr.observeElements = [];
    }

In [23]:
# For comparison, this is the raw HTML text exactly as the server sent it
print(response.text)

<!doctype html>
<html class="no-js" lang="en">
  <head>
      
    




    <script async src="//cdn.shopify.com/s/files/1/0586/8575/1504/t/126/assets/geolizr-lib.js?v=121498662020399124351637330967" type="text/javascript"></script>

<script>
    if(typeof Geolizr === "undefined") {
        var Geolizr = {};
        Geolizr.currencyDynamicCountry = false;
        Geolizr.version = 20200327;
    }

    // save geolizr-lib.js url to the sessionStorage
    sessionStorage.setItem('geolizr_lib_url', "//cdn.shopify.com/s/files/1/0586/8575/1504/t/126/assets/geolizr-lib.js?v=121498662020399124351637330967");
    if(typeof Geolizr.events === "undefined") Geolizr.events = {};
    if(typeof Geolizr.geolizrEvents === "undefined") Geolizr.geolizrEvents = [];

    Geolizr.config = function(config) {
        Geolizr.currency_widget_enabled = config.currency_widget_enabled === "true" || false;
        Geolizr.shopCurrency = 'CAD';
        Geolizr.shopifyFormatMoneySet = false;
        Geolizr.observeE

In [24]:
# But now we have a beautiful soup object that can parse the HTML

# The method ".find_all" is used on the soup object; it returns a list of all
# the <span class='price-item price-item--regular'> THIS IS TEXT </span> elements

# The method ".find" returns only the first match, and it isn't wrapped in a list
# NOTE(review): the name `list` shadows Python's built-in list type for the rest of the session;
# the following cells index into this variable, so renaming it means updating them too.
list = soup.find_all('span', {'class': 'price-item price-item--regular'})
print(list)

[<span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $55
        </span>, <span class="price-item price-item--regular" data-price="">
          $75
        </span>

In [25]:
# The first matching <span> element, tags included
print(list[0])

<span class="price-item price-item--regular" data-price="">
          $55
        </span>


In [26]:
# .text keeps only the text inside the element; the surrounding whitespace is still there
print(list[0].text)


          $55
        


In [27]:
# .strip() removes the whitespace before and after the text
print(list[0].text.strip())

$55


In [28]:
# .lstrip('$') removes the dollar sign from the left, leaving just the number as a string
print(list[0].text.strip().lstrip('$'))

55


In [69]:
# Collect every regular price as a bare number string:
# trim the surrounding whitespace, then drop the leading '$'
price_spans = soup.find_all('span', {'class': 'price-item price-item--regular'})
prices = [span.text.strip().lstrip('$') for span in price_spans]
print(prices)

['55', '55', '55', '55', '55', '55', '55', '55', '55', '55', '75', '75', '75', '85', '85', '85', '85', '85', '70', '70', '70', '70', '70', '90', '90', '70', '70', '70', '70', '70', '70', '75', '75', '75', '75', '75', '70', '70', '90', '60', '75', '75']


In [29]:
# .find returns the first <product-card> element whose product-handle attribute
# equals 'joseph-soft-mock-micro-polar-fleece'
element = soup.find('product-card', {'product-handle': 'joseph-soft-mock-micro-polar-fleece'})
print(element)

<product-card product-handle="joseph-soft-mock-micro-polar-fleece" variant="42969200197840">
<div class="product-card__image-wrapper">
<a class="js-image-link" href="/collections/fw22-sweaters-chandails/products/joseph-soft-mock-micro-polar-fleece">
<swiper-container config='{
        "nested": true,
        "allowTouchMove": false,
        "onBreakpoint": "tablet",
        "resizeObserver": true
      }' data-images-swiper="">
<div class="swiper-wrapper" data-product-images="">
<div class="swiper-slide" data-featured-media="">
<div class="responsive-image__wrapper ratio-rectangle">
<img alt="Navytobacco||Marinetabac" class="motion-reduce" height="2173" loading="lazy" sizes="(min-width: 1100px) 535px, (min-width: 750px) calc((100vw - 130px) / 2), calc((100vw - 50px) / 2)" src="//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_533x.png?v=1669141133" srcset="//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_165

In [31]:
# Search for all <h2> elements that have 'p product-card__title' as their class,
# then look at the first one
elements = soup.find_all('h2', {'class': 'p product-card__title'})
print(elements[0])

<h2 class="p product-card__title">
<a data-i18n='{"en":"JOSEPH \u003cbr\u003e color block micro polar pullover fleece ","fr":" JOSEPH \u003cbr\u003e molleton micro polaire doux"}' href="/collections/fw22-sweaters-chandails/products/joseph-soft-mock-micro-polar-fleece">
        JOSEPH <br/> color block micro polar pullover fleece
      </a>
<!-- include 'wishlist-button-collection' with '7773401153744' -->
</h2>


In [32]:
# You can write .<tag name> after an element to get the first tag of that name inside it.
# Here .a gives the <a> link inside the <h2>.
print(elements[0].a) 

<a data-i18n='{"en":"JOSEPH \u003cbr\u003e color block micro polar pullover fleece ","fr":" JOSEPH \u003cbr\u003e molleton micro polaire doux"}' href="/collections/fw22-sweaters-chandails/products/joseph-soft-mock-micro-polar-fleece">
        JOSEPH <br/> color block micro polar pullover fleece
      </a>


In [33]:
# You can use the .text attribute to get only the text inside the element, removing the <> tags
print(elements[0].a.text)


        JOSEPH  color block micro polar pullover fleece
      


In [34]:
# .strip() removes the whitespace and line breaks around the title
print(elements[0].a.text.strip()) 

JOSEPH  color block micro polar pullover fleece


In [35]:
# Go through every title element, cleaning up the link text once per element,
# then both store it and print it
titles = []
for card in elements:
    title = card.a.text.strip()
    titles.append(title)
    print(title)
# Storing the titles in a list and printing them are both shown here; use whichever you need

JOSEPH  color block micro polar pullover fleece
JOSEPH  color block micro polar pullover fleece
JOSEPH  color block micro polar pullover fleece
JOSEPH  color block micro polar pullover fleece
JOSEPH  color block micro polar pullover fleece
CAMERON  Unisex french terry crew neck
CAMERON  Unisex french terry crew neck
CAMERON  Unisex french terry crew neck
CAMERON  Unisex french terry crew neck
CAMERON  Unisex french terry crew neck
JIMMY  Unisex zip front french terry hoodie
JIMMY  Unisex zip front french terry hoodie
JIMMY  Unisex zip front french terry hoodie
SIMEON  Recycled mock zip sweater
LIONEL  Recycled mock zip sweater
JETT  CHANDAIL FIN EN COTON À CARREAUX
JETT  CHANDAIL FIN EN COTON À CARREAUX
KODE Crewneck twill effect sweater
STEN   Cotton v neck fine gauge sweater
STEN   Cotton v neck fine gauge sweater
STEN   Cotton v neck fine gauge sweater
STEN   Cotton v neck fine gauge sweater
STEN   Cotton v neck fine gauge sweater
JACK   Cross funnel neck sweater
JACK   Cross funnel

In [36]:
# Find every <img> element on the page and print them with all their attributes
print(soup.find_all('img'))

[<img alt="" class="" height="179" loading="lazy" src="//cdn.shopify.com/s/files/1/0586/8575/1504/files/POINT_ZERO_LOGO_5_BLACK_500x.png?v=1643819924" tabindex="-1" width="2184"/>, <img alt="" class="" height="353" loading="lazy" src="//cdn.shopify.com/s/files/1/0586/8575/1504/files/pz-logo_500x.png?v=1627933836" tabindex="-1" width="1000"/>, <img alt="Navytobacco||Marinetabac" class="motion-reduce" height="2173" loading="lazy" sizes="(min-width: 1100px) 535px, (min-width: 750px) calc((100vw - 130px) / 2), calc((100vw - 50px) / 2)" src="//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_533x.png?v=1669141133" srcset="//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_165x.png?v=1669141133 165w,//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_360x.png?v=1669141133 360w,//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_533x.png?v=16691

In [37]:
# Find all the img elements
img_elements = soup.find_all('img')

# Extract the src attribute of each img element to get the image URL.
# An <img> tag without a src attribute (or with an empty one) is skipped,
# because indexing it with ['src'] would raise a KeyError.
image_urls = [
    img_element['src']
    for img_element in img_elements
    if img_element.get('src')
]

# Print the image URLs
print(image_urls)

['//cdn.shopify.com/s/files/1/0586/8575/1504/files/POINT_ZERO_LOGO_5_BLACK_500x.png?v=1643819924', '//cdn.shopify.com/s/files/1/0586/8575/1504/files/pz-logo_500x.png?v=1627933836', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T130001.925_533x.png?v=1669141133', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/7956082_BLACKOLIVE__2_533x.png?v=1669141133', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/Untitleddesign-2022-11-22T124707.476_533x.png?v=1669141133', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/7956082_TABACCOMILK_2_533x.jpg?v=1669140929', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/7956082_BLACKCHARCOAL_1_533x.jpg?v=1669141133', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/7066009-military_1_533x.jpg?v=1665674413', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/7066009-black_1_533x.jpg?v=1673556927', '//cdn.shopify.com/s/files/1/0586/8575/1504/products/7066009-flora_1_533x.jpg?v=1673556927', '//cdn.sho

In [38]:
# Let's see if we can use our tools on this website. This is the website I was 
# tasked to scrape for my Major Analytics Project

import requests

url = "https://www.levantineceramics.org/petrographics"

# Request the page and print the HTML that comes back
response = requests.get(url)
print(response.text)

# Also save the HTML to a text file so it can be searched in an editor
with open('levantine.txt', mode='w', encoding='utf-8') as file:
    file.write(response.text)

# This doesn't look like we are getting any of the table info from this url

<!DOCTYPE html>
<html>
<head>
<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"efbb51e92a","applicationID":"1443790785","transactionName":"cQsNRUULDwlVExhHVkYWDFZFBRMNWQJEGFpcAAZJ","queueTime":1,"applicationTime":22,"agent":""}</script>
<script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"efbb51e92a",applicationID:"1443790785"};;(()=>{var e,t,r={9071:(e,t,r)=>{"use strict";r.d(t,{I:()=>n});var n=0,i=navigator.userAgent.match(/Firefox[\/\s](\d+\.\d+)/);i&&(n=+i[1])},6900:(e,t,r)=>{"use strict";let n;if(r.d(t,{H:()=>i}),r(2374).il){const e=document.createElement("div");e.innerHTML="\x3c!--[if lte IE 6]><div></div><![endif]--\x3e\x3c!--[if lte IE 7]><div></div><![endif]--\x3e\x3c!--[if lte IE 8]><div></div><![endif]--\x3e\x3c!--[if lte IE 9]><div></div><![endif]--\x3e",n=e.getElementsByTagName("di

In [2]:
import pandas as pd
import requests

# The table data is served separately as JSON from this endpoint
url = 'https://www.levantineceramics.org/petrographics.json'

# Query-string parameters sent with the request
params = dict(
    sEcho="2",
    iColumns="12",
    sColumns=",,,,,,,,,,,",
    iDisplayStart="0",
    iDisplayLength="100",
)
# .json() decodes the JSON body of the response into Python objects
data = requests.get(url, params=params).json()
type(data)

dict

In [40]:
# Show the decoded JSON response (a Python dict)
print(data)

{'sEcho': 2, 'iTotalRecords': 6025, 'iTotalDisplayRecords': 6025, 'aaData': [['<a target="_blank" href="/petrographics/1066-002">002</a>', '<a target="_blank" href="/vessels/11590-judean-desert-cave-32-74">Judean Desert: cave 32/74</a>', 'Judean Desert', 'Israel-Palestinian Authority/Central Highlands', 'Chalcolithic', '<a class="btn btn-primary" data-remote="true" href="/images/load_images?cat=Thin-Section&amp;id=1066&amp;model_type=Petrographic">View</a>', '', 'dolomitic marl? calc., active, ds, 25% voids, silty', '', '', '<p><a target="_blank" href="/contributors/220">David Ben-Shlomo</a></p>', '<a class="tooltipIcon" data-placement="top" data-toggle="tooltip" data-original-title="View" target="_blank" href="/petrographics/1066-002"><i class=\'fa fa-eye\'></i></a> <a class="tooltipIcon" data-placement="top" data-toggle="tooltip" data-original-title="View" target="_blank" href="/map?petrographic_id=1066-002"><i class=\'fa fa-map\'></i></a>'], ['<a target="_blank" href="/petrographics

In [3]:
# Build a DataFrame from the dict. In the printed result each entry of the 'aaData' list
# becomes its own row, and the single-value fields (sEcho, iTotalRecords, ...) repeat on every row.
dataframe = pd.DataFrame.from_records(data)
print(dataframe)

                                               aaData  iTotalDisplayRecords  \
0   [<a target="_blank" href="/petrographics/1066-...                  6025   
1   [<a target="_blank" href="/petrographics/2256-...                  6025   
2   [<a target="_blank" href="/petrographics/2052-...                  6025   
3   [<a target="_blank" href="/petrographics/1090-...                  6025   
4   [<a target="_blank" href="/petrographics/1070-...                  6025   
..                                                ...                   ...   
95  [<a target="_blank" href="/petrographics/5514-...                  6025   
96  [<a target="_blank" href="/petrographics/6147-...                  6025   
97  [<a target="_blank" href="/petrographics/5513-...                  6025   
98  [<a target="_blank" href="/petrographics/5500-...                  6025   
99  [<a target="_blank" href="/petrographics/247-2...                  6025   

    iTotalRecords  sEcho  
0            6025      2

In [11]:
# Look at one record: the 'aaData' value in the row labelled 2
# (a list mixing HTML snippets and plain strings)
print(dataframe['aaData'][2])

['<a target="_blank" href="/petrographics/2052-003">003</a>', '<a target="_blank" href="/vessels/12560-qubur-walagda-reg-no-qubur-walagdareg-no-121-1-2-18">Qubur Walagda Reg. No.: Qubur WalagdaReg. No.: 121-1/2-18</a>', 'Qubur Walagda', 'Israel-Palestinian Authority/Southern Coastal Plain', '', '<a class="btn btn-primary" data-remote="true" href="/images/load_images?cat=Thin-Section&amp;id=2052&amp;model_type=Petrographic">View</a>', '', 'Loess active, ss, 7% voids, silty', '', 'southern Philistia (non coast)', '<p><a target="_blank" href="/contributors/220">David Ben-Shlomo</a></p><p><a target="_blank" href="/contributors/706">Gunnar Lehmann</a></p>', '<a class="tooltipIcon" data-placement="top" data-toggle="tooltip" data-original-title="View" target="_blank" href="/petrographics/2052-003"><i class=\'fa fa-eye\'></i></a> <a class="tooltipIcon" data-placement="top" data-toggle="tooltip" data-original-title="View" target="_blank" href="/map?petrographic_id=2052-003"><i class=\'fa fa-map

In [12]:
def strip_quotes(s):
    """Return the string ``s`` with any double-quote characters removed from its right end.

    Quotes at the start or in the middle of the string are left alone.
    """
    quote = '"'
    return s.rstrip(quote)

In [13]:
frames = []
# Each request returns one page of 100 records; range(1) fetches only the first page
for i in range(1):
    params['iDisplayStart'] = str(i*100)
    data = requests.get(url, params).json()
    dataframe = pd.DataFrame.from_records(data)
    # The first item of every 'aaData' record is a link such as
    # <a target="_blank" href="/petrographics/1066-002">002</a>
    first_cell = dataframe["aaData"].str.get(0)
    # Pull out the digits right after "/petrographics/" with a regular expression.
    # Unlike slicing at fixed character positions, this works for IDs of any length.
    dataframe_with_ids = first_cell.str.extract(r'/petrographics/(\d+)', expand=False)
    frames.append(dataframe_with_ids)
petrographic_ids = pd.concat(frames)
print(petrographic_ids)

0     1066
1     2256
2     2052
3     1090
4     1070
      ... 
95    5514
96    6147
97    5513
98    5500
99     247
Name: aaData, Length: 100, dtype: object


In [14]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup

# Make a GET request to the map page of a single petrographic record
url = "https://www.levantineceramics.org/map?petrographic_id=5040-017"
response = requests.get(url)

# Parse the HTML using Beautiful Soup
soup = BeautifulSoup(response.content, "html.parser")

# Find the <div> element with id "map", which carries the marker data as an attribute
map_div = soup.find("div", {"id": "map"})

# Read the raw attribute value; it is a JSON string
default_markers = map_div.get("data-default-markers")

# Decode the JSON. Any '&quot;' left in the text is turned back into a double quote first.
# NOTE(review): Beautiful Soup normally decodes HTML entities in attribute values already,
# so the replace is probably a harmless safeguard — confirm before removing it.
default_markers = json.loads(default_markers.replace('&quot;', '"'))
# The first item of the decoded list is a dict; take its first key and the value stored under it
marker_id = next(iter(default_markers[0]))
marker = default_markers[0][marker_id]

# 'coordinate' holds two values: latitude first, then longitude
lat, lng = marker['coordinate']
print(lat, lng)

36.43993 38.2881


In [17]:
# Show every field stored for this marker, not just the coordinates
print(marker)

{'coordinate': ['36.43993', '38.2881'], 'id': 5040, 'link': '/petrographics/5040-017', 'image_url': 'https://s3.amazonaws.com/lux-production/images/avatars/000/009/814/preview/VP_phil_surv_15_21_a.jpg?1456460082', 'original_image_url': 'https://s3.amazonaws.com/lux-production/images/avatars/000/009/814/large/VP_phil_surv_15_21_a.jpg?1456460082', 'site': 'Tall al-Banat', 'record_type': 'Petrographic'}


In [50]:
# This script will allow you to download any photo from a URL to your working directory
import pandas as pd
import requests

# You could have a csv with photo urls and use pd.read_csv to load them in.
# Or just put them in a list, as done here
urls = ['https://media-cldnry.s-nbcnews.com/image/upload/t_fit-1240w,f_auto,q_auto:best/rockcms/2022-08/220805-domestic-cat-mjf-1540-382ba2.jpg',
           'https://www.alleycat.org/wp-content/uploads/2019/03/FELV-cat.jpg']

def download_image(url, path, timeout=30):
    """Download the file at ``url`` and save its raw bytes to ``path``.

    Args:
        url: Full URL of the image to download.
        path: File path to write to. The file is opened in binary write mode,
            so an existing file with the same name is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can wait forever on a server that never answers.

    Returns:
        True if the server answered with status 200 and the file was written,
        False otherwise (a short message is printed and nothing is written).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Report the failure instead of skipping the image silently
        print(f"Skipped {url}: HTTP status {response.status_code}")
        return False
    with open(path, 'wb') as f:
        f.write(response.content)
    return True

from pathlib import PurePosixPath
from urllib.parse import urlsplit

for url in urls:
    # Use the last segment of the URL *path* as the filename, so that a query string
    # ('?...') or fragment ('#...') at the end of a URL never ends up in the name
    filename = PurePosixPath(urlsplit(url).path).name
    download_image(url, filename)

In [51]:
# Let's try putting in the list of image URLs from the pointzero website, which we stored earlier
# in a variable named image_urls

# The URLs are protocol-relative (they start with '//'), so 'https:' must be added to the start.
# The filename is taken from the last segment of the URL path, which keeps the real extension
# (.png or .jpg) and leaves out the '?v=...' query string. Names built from the query string
# are not unique, because several images share the same ?v= value.
import pandas as pd
import requests
from pathlib import PurePosixPath
from urllib.parse import urlsplit

def download_image(url, path, timeout=30):
    """Download the file at ``url`` and save its raw bytes to ``path``.

    Args:
        url: Full URL of the image to download.
        path: File path to write to. The file is opened in binary write mode,
            so an existing file with the same name is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can wait forever on a server that never answers.

    Returns:
        True if the server answered with status 200 and the file was written,
        False otherwise (a short message is printed and nothing is written).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Report the failure instead of skipping the image silently
        print(f"Skipped {url}: HTTP status {response.status_code}")
        return False
    with open(path, 'wb') as f:
        f.write(response.content)
    return True

for url in image_urls:
    url = 'https:' + url
    filename = PurePosixPath(urlsplit(url).path).name
    if not filename:
        # The URL path has no final segment, so there is no sensible name to save under
        continue
    download_image(url, filename)