# Web Scraping

____

## Import libraries 

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

____

## Create the soup object from the website

In [4]:
# Page to scrape
start_url = 'https://en.wikipedia.org/wiki/Tesla,_Inc.'

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error response
# (4xx/5xx) into an exception instead of silently parsing an error page.
downloaded_html = requests.get(start_url, timeout=30)
downloaded_html.raise_for_status()

# Parse the HTML. Naming the parser explicitly avoids bs4's
# GuessedAtParserWarning and keeps the parse tree the same on every machine,
# regardless of which optional parsers (lxml, html5lib) are installed.
soup = BeautifulSoup(downloaded_html.text, 'html.parser')

# Save a pretty-printed local copy of the parsed page
with open('downloaded_html', 'w', encoding="utf-8") as html_file:
    html_file.write(soup.prettify())

____

## Extract data

#### Select table.wikitable

In [18]:
# All <tbody> elements that sit inside a table with class "wikitable"
wikitable_bodies = soup.select('table.wikitable tbody')

# Keep the first match; change the index to work with a different table
full_table = wikitable_bodies[0]

#### Extract the table heading

In [23]:
# Header cells (<th>) found in the rows of the selected table
# (this is the facilities table)
table_head = full_table.select('tr th')

# Column names are the whitespace-trimmed text of each header cell
table_columns = [header_cell.text.strip() for header_cell in table_head]
table_columns

['Opened', 'Name', 'City', 'Country', 'Employees', 'Products', 'Notes']

#### Extract table data

In [25]:
table_rows = full_table.select('tr')

# Skip the first row, then collect the whitespace-trimmed text of every
# <td> cell in each remaining row (one inner list per table row)
table_data = [
    [cell.text.strip() for cell in row.select('td')]
    for row in table_rows[1:]
]
table_data

[['2010[31]',
  'Tesla Fremont Factory',
  'Fremont, California',
  'United States',
  '10,000[289]',
  'Model S, Model 3, Model X, Model Y[289]',
  'Previously the GM/Toyota NUMMI joint venture factory.'],
 ['2016[290]',
  'Giga Nevada',
  'Storey County, Nevada',
  'United States',
  '7,000[291]',
  'Lithium-ion batteries, Powerwall, Powerpack, Megapack, Tesla Semi (future)[292]',
  'Also known as Gigafactory 1.'],
 ['2017[293]',
  'Giga New York',
  'Buffalo, New York',
  'United States',
  '1,500[294]',
  'Solar Roof, Superchargers[294]',
  'Also known as Gigafactory 2.[294]'],
 ['2019[295]',
  'Giga Shanghai',
  'Shanghai',
  'China',
  '15,000[296]',
  'Model 3, Model Y',
  'Also known as Gigafactory 3.'],
 ['(under construction)',
  'Giga Berlin',
  'Grünheide, Brandenburg',
  'Germany',
  '10,000[297]',
  'Lithium-ion batteries, Model 3, Model Y[298]',
  'Also known as Gigafactory 4.'],
 ['(under construction)',
  'Giga Texas',
  'Austin, Texas',
  'United States',
  '10,000[29

____

## Create a pandas DataFrame

In [28]:
# Build the DataFrame: one row per scraped table row, labelled with the
# scraped column headings
df = pd.DataFrame(data=table_data, columns=table_columns)

In [30]:
# Show the assembled DataFrame as the cell's rich output
df

Unnamed: 0,Opened,Name,City,Country,Employees,Products,Notes
0,2010[31],Tesla Fremont Factory,"Fremont, California",United States,"10,000[289]","Model S, Model 3, Model X, Model Y[289]",Previously the GM/Toyota NUMMI joint venture f...
1,2016[290],Giga Nevada,"Storey County, Nevada",United States,"7,000[291]","Lithium-ion batteries, Powerwall, Powerpack, M...",Also known as Gigafactory 1.
2,2017[293],Giga New York,"Buffalo, New York",United States,"1,500[294]","Solar Roof, Superchargers[294]",Also known as Gigafactory 2.[294]
3,2019[295],Giga Shanghai,Shanghai,China,"15,000[296]","Model 3, Model Y",Also known as Gigafactory 3.
4,(under construction),Giga Berlin,"Grünheide, Brandenburg",Germany,"10,000[297]","Lithium-ion batteries, Model 3, Model Y[298]",Also known as Gigafactory 4.
5,(under construction),Giga Texas,"Austin, Texas",United States,"10,000[299]","Cybertruck, Model 3, Model Y, Semi[300]",Also known as Gigafactory 5. Serves as the loc...
