In [1]:

from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import numpy as np
import zipfile



import os
import shutil

In [2]:
# Open url for scraping. All data sources are stored within a table. All target data files have the
# same naming scheme, we will take advantage of this

In [3]:

url = 'http://www.fueleconomy.gov/feg/download.shtml'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')


reg = 'epadata/\d{2}data.zip' ##luckily our target has a unique identifier for href
suf = 'http://www.fueleconomy.gov/feg/'

In [4]:
# Find target data sources within html page

In [5]:
links = soup.find_all('a')
zlist = []
for tag in links:
    link = tag.get('href',None)
    if link is not None:
        if re.match(reg, link):
            zlist.append(link)

In [6]:
# Download data to current working directory. These files are zipped. Unzip files.

In [7]:
filelst = []
for x in zlist:
    url = suf+x
    localfn = url.split('/')[-1]
    yearno = int(localfn[:2])
    if yearno < 50 and yearno > 00:
        r = requests.get(url, stream=True)
        with open(localfn, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024): 
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
        zip_ref = zipfile.ZipFile(localfn, 'r')
        lcl = "dldir"+localfn
        filelst.append(lcl)
        zip_ref.extractall(lcl)
        zip_ref.close()
       
        


In [8]:
# create directory to store data

In [9]:
datdir = "shibe_data"
if not os.path.exists(datdir):
    os.makedirs(datdir)

In [10]:
# extract downloaded data to single directory. Rename files so that they have uniform names

In [11]:
reg = "(.xlsx|.xls|.csv)$"
for item in filelst:
    for filename in os.listdir(item):
        curname =  item + "\\" + filename 
        
        
        m = re.search(reg, filename)
        if m:
            newname = datdir + "\\" + item[:-4] + m.group(0)
            os.rename(curname, newname)


In [12]:
# clean up files and directories created while scraping data

In [13]:

lst = os.listdir(os.getcwd())
for x in lst:
    m = re.search(".zip$", x)
    if m:
        n = re.search("^dldir",x)
        if n:
            shutil.rmtree(x)
        else:
            os.remove(x)


In [19]:
# Reads data files into Pandas Dataframes according to file type (.csv .xlx .xlsx). The df are stored in a temporary dict 
# according to their respective years. The original zip file from the EPA site does not give year in YYYY format, so we have 
# chosen not to use the YYYY format until this point where it becomes necessary and after we have gleaned out only the years 
# prefixed by "20-"
# Some lines of the original data files are formed incorrectly. Adding the option "error_bad_lines=False" skips these lines

In [189]:
%%capture
datfiles = os.listdir(datdir)
pdfiles = {}
for x in datfiles:
    y = re.search("[0-9]{2}",x)
    year = "20" + y.group(0)
    cs = re.search(".csv$", x)
    if cs:
        newp = pd.read_csv(datdir + "\\" + x, error_bad_lines=False)
    else:
        newp = pd.read_excel(datdir + "\\" + x)
        
    pdfiles[year] = newp
    

In [121]:
# Went through all the data files from 2001-2018 and found all the persisting variables and listed their aliases in a dict, vars

In [188]:
vars['Class'] = ['CLASS','Carline Class Desc', 'Class']
vars['Manufacturer'] = ['MFR', 'Manufacturer', 'Mfr Name', 'Mfr Name ']
vars['carline name'] = ['CAR LINE', 'Carline', 'carline name']
vars['cmb'] = ['COMB MPG (GUIDE)', 'Comb FE (Guide) - Conventional Fuel', 'cmb']
vars['cty'] = ['CITY MPG (GUIDE)', 'City FE (Guide) - Conventional Fuel', 'cty']
vars['cyl'] = ['# Cyl', 'NUMB CYL', 'cyl']
vars['displ'] = ['DISPLACEMENT', 'Eng Displ', 'displ']
vars['fcost'] = ['ANL FL CST', 'Annual Fuel1 Cost - Conventional Fuel', 'fcost']
vars['fl'] = ['FUEL TYPE', 'Fuel Usage Desc - Conventional Fuel', 'fl']
vars['hwy'] = ['HWY MPG (GUIDE)', 'Hwy FE (Guide) - Conventional Fuel', 'hwy']
vars['trans'] = ['TRANS', 'Trans as listed in FE Guide (derived from col AA thru AF)', 'Trans in FE Guide (MFR entered for data entered after May 13 2011)', 'Transmission', 'trans']
vars['ucmb'] = ['Comb Unadj FE - Conventional Fuel', 'UNRND COMP (EPA)', 'ucmb']
vars['ucty'] = ['City Unadj FE - Conventional Fuel', 'UNRND CITY (EPA)', 'ucty']
vars['uhwy'] = ['Hwy Unadj FE - Conventional Fuel', 'UNRND HWY (EPA)', 'uhwy']
vars['drv'] = ['DRIVE SYS', 'Drive Sys', 'drv']

['Carline Class Desc', 'Class']

In [193]:
varlst = list(vars.keys())
yearlst = list(pdfiles.keys())


for z in yearlst:
    curpd = pdfiles[z]
    cols = list(curpd.columns.values)
    checklst = []
    rename = []
    
    for x in cols:
        for y in varlst:
            if x in vars[y]:
                checklst.append(x)
                rename.append(y)
    curpd = curpd[checklst]
    curpd.columns = rename
    
    print(z, rename,cols)
    pdfiles[z] = curpd[varlst]


2001 ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv'] ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv']
2002 ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv'] ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv']
2003 ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv'] ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv']
2004 ['Class', 'Manufacturer', 'carline name', 'displ', 'cyl', 'trans', 'cty', 'hwy', 'cmb', 'ucty', 'uhwy', 'ucmb', 'fl', 'fcost', 'drv'] ['Class', 'Manufacturer', 'carline name', 

KeyError: "['Manufacturer'] not in index"

In [185]:
pdfiles["2011"]


Unnamed: 0,Manufacturer,carline name,displ,cyl,trans,cty,hwy,cmb,ucty,uhwy,ucmb,drv,fl,fcost,Class
0,aston martin,V12 Vantage,5.9,12.0,Manual(M6),11,17,13,13.7271,22.9258,16.7517,R,Gasoline (Premium Unleaded Recommended),3691,Two Seaters
1,aston martin,V8 Vantage,4.7,8.0,Auto(AM6),14,20,16,17.081,27.3512,20.5541,R,Gasoline (Premium Unleaded Recommended),3000,Two Seaters
2,aston martin,V8 Vantage,4.7,8.0,Manual(M6),13,19,15,16.0147,26.5231,19.4895,R,Gasoline (Premium Unleaded Recommended),3202,Two Seaters
3,aston martin,V8 Vantage S,4.7,8.0,Auto(AM7),14,21,16,16.6866,28.793,20.5806,R,Gasoline (Premium Unleaded Recommended),3000,Two Seaters
4,Audi,R8,4.2,8.0,Auto(AM6),13,21,16,15.9506,26.7678,19.496,A,Gasoline (Premium Unleaded Recommended),3000,Two Seaters
5,Audi,R8,4.2,8.0,Manual(M6),11,20,14,13.6465,24.301,17.0007,A,Gasoline (Premium Unleaded Recommended),3427,Two Seaters
6,Audi,R8,5.2,10.0,Auto(AM6),13,19,15,15.331,24.3325,18.3929,A,Gasoline (Premium Unleaded Recommended),3202,Two Seaters
7,Audi,R8,5.2,10.0,Manual(M6),12,19,14,13.7,23.0667,16.7632,A,Gasoline (Premium Unleaded Recommended),3427,Two Seaters
8,Audi,R8 Spyder,4.2,8.0,Auto(AM6),13,21,16,15.9506,26.7678,19.496,A,Gasoline (Premium Unleaded Recommended),3000,Two Seaters
9,Audi,R8 Spyder,4.2,8.0,Manual(M6),11,20,14,13.6465,24.301,17.0007,A,Gasoline (Premium Unleaded Recommended),3427,Two Seaters
