This project extracts data about the List of production battery electric vehicles from a Wikipedia page.

In [1]:
#Import necessary libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import time


In [2]:
#The url for the desired data
url = 'https://en.wikipedia.org/wiki/List_of_production_battery_electric_vehicles'

#send an HTTP GET request to the specified URL
response = requests.get(url)

#create a BeautifulSoup object named soup by parsing the HTML content of the response variable.
soup = BeautifulSoup(response.text, 'html')


In [3]:
#Take a look at the whole page
print(soup.prettify())


<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of production battery electric vehicles - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=

In [4]:
#Specify the table we intend to scrap data from (Production highway-capable battery electric automobiles originating outside the Chinese market)
table = soup.find_all('table')[1]



In [5]:
#Take a look at the information to be sure it is what was intended for scrapping
print(table.prettify())



<table class="wikitable sortable" style="font-size:95%">
 <tbody>
  <tr>
   <th>
    Model
   </th>
   <th>
    Calendar year
    <br/>
    produced
   </th>
   <th>
    Body style
   </th>
   <th>
    Platform
   </th>
   <th>
    Dedicated battery
    <br/>
    electric vehicle?
    <sup class="reference" id="cite_ref-3">
     <a href="#cite_note-3">
      [nb 1]
     </a>
    </sup>
   </th>
   <th>
    Manufacturer
   </th>
   <th>
    Marque origin
   </th>
  </tr>
  <tr>
   <td>
    <a href="/wiki/Aspark_Owl" title="Aspark Owl">
     Aspark Owl
    </a>
   </td>
   <td>
    2021
   </td>
   <td>
    <a href="/wiki/Coupe" title="Coupe">
     Coupé
    </a>
   </td>
   <td>
   </td>
   <td>
    Yes
   </td>
   <td>
    <a href="/wiki/Aspark" title="Aspark">
     Aspark
    </a>
   </td>
   <td>
    Japan
   </td>
  </tr>
  <tr>
   <td>
    <a href="/wiki/Audi_Q8_e-tron" title="Audi Q8 e-tron">
     Audi Q8 e-tron
    </a>
   </td>
   <td>
    2018
   </td>
   <td>
    <a class="mw-

In [6]:
#Get table titles
titles = table.find_all('th')




In [7]:
#check the titles
titles

[<th>Model
 </th>,
 <th>Calendar year<br/>produced
 </th>,
 <th>Body style
 </th>,
 <th>Platform
 </th>,
 <th>Dedicated battery<br/>electric vehicle?<sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[nb 1]</a></sup>
 </th>,
 <th>Manufacturer
 </th>,
 <th>Marque origin
 </th>]

In [8]:
#clean it up a little bit
table_titles = [title.text.strip() for title in titles]




In [9]:
table_titles

['Model',
 'Calendar yearproduced',
 'Body style',
 'Platform',
 'Dedicated batteryelectric vehicle?[nb 1]',
 'Manufacturer',
 'Marque origin']

In [10]:
#Crete a dataframe with the titles as columns
df = pd.DataFrame(columns=table_titles)




In [11]:
#Take a look at the dataframe
df

Unnamed: 0,Model,Calendar yearproduced,Body style,Platform,Dedicated batteryelectric vehicle?[nb 1],Manufacturer,Marque origin


In [12]:
#Get column data
column_data = table.find_all('tr')




In [17]:
#Get row data, check to see its the desired information
for row in column_data:
    row_data = row.find_all('td')
    single_row_data = [data.text.strip() for data in row_data]
    print(single_row_data)























[]
['Aspark Owl', '2021', 'Coupé', '', 'Yes', 'Aspark', 'Japan']
['Audi Q8 e-tron', '2018', 'Crossover SUV', 'MLB evo[3]', 'Yes', 'Audi', 'Germany']
['Audi e-tron GT', '2021', 'Sedan', 'J1[4]', 'Yes', 'Audi', 'Germany']
['Audi Q4 e-tron', '2021', 'Crossover SUV', 'MEB[5]', 'Yes', 'Audi', 'Germany']
['BMW i4', '2021', 'Liftback', 'CLAR[6]', 'No', 'BMW', 'Germany']
['BMW i7', '2022', 'Sedan', 'CLAR', 'No', 'BMW', 'Germany']
['BMW iX', '2021', 'Crossover SUV', 'iX[7]', 'Yes', 'BMW', 'Germany']
['BMW iX1', '2022', 'Crossover SUV', 'UKL2', 'No', 'BMW', 'Germany']
['BMW iX3', '2020', 'Crossover SUV', 'CLAR[8]', 'No', 'BMW', 'Germany']
['BrightDrop Zevo', '2022', 'Van', 'BT1', 'Yes', 'General Motors', 'United States']
['Cadillac Lyriq', '2022', 'Crossover SUV', 'BEV3[9]', 'Yes', 'General Motors', 'United States']
['Chevrolet Bolt EUV', '2021', 'Crossover SUV', 'BEV2[10]', 'Yes', 'General Motors', 'United States']
['Chevrolet Bolt EV', '2016', 'Hatchback', 'BEV2[10]', 'Yes', 'General Motors', 

In [20]:
#Get row data into the dataframe
for row in column_data[1:]:
    row_data = row.find_all('td')
    single_row_data = [data.text.strip() for data in row_data]
    df.loc[len(df)] = single_row_data



In [21]:
#A sneak peak at the dataframe
df.head()


Unnamed: 0,Model,Calendar yearproduced,Body style,Platform,Dedicated batteryelectric vehicle?[nb 1],Manufacturer,Marque origin
0,Aspark Owl,2021,Coupé,,Yes,Aspark,Japan
1,Audi Q8 e-tron,2018,Crossover SUV,MLB evo[3],Yes,Audi,Germany
2,Audi e-tron GT,2021,Sedan,J1[4],Yes,Audi,Germany
3,Audi Q4 e-tron,2021,Crossover SUV,MEB[5],Yes,Audi,Germany
4,BMW i4,2021,Liftback,CLAR[6],No,BMW,Germany


In [16]:
#Export into a csv file
df.to_csv('battery_electric_vehicles.csv', index = False)

