# 1 Data Extraction Fundamentals 

## Parsing CSV files 

### csv to dict

In [21]:
# Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
# split each line on "," and then for each line, create a dictionary
# where the key is the header title of the field, and the value is the value of that field in the row.
# The function parse_file should return a list of dictionaries,
# each data line in the file being a single list entry.
# Field names and values should not contain extra whitespace, like spaces or newline characters.
# You can use the Python string method strip() to remove the extra whitespace.
# You have to parse only the first 10 data lines in this exercise,
# so the returned list should have 10 entries!
import os

DATADIR = "data/"
DATAFILE = "beatles-diskography.csv"


def parse_file(datafile):
    """Return the first 10 data rows of a comma-separated file as dicts.

    The first line of the file is treated as the header. Every following
    line is split on "," and its values are paired with the header names;
    both names and values have surrounding whitespace stripped. Reading
    stops once 10 data lines have been collected.

    Args:
        datafile: path of the text file to read.

    Returns:
        A list with one ``{header name: value}`` dict per data line
        (at most 10 entries).
    """
    rows = []
    header = None
    with open(datafile, "r") as source:
        for lineno, raw in enumerate(source):
            fields = [part.strip() for part in raw.split(',')]
            if lineno == 0:
                # The very first line carries the column names.
                header = fields
                continue
            if lineno > 10:
                # Lines 1..10 are the ten data rows we want; stop afterwards.
                break
            rows.append(dict(zip(header, fields)))

    return rows


# Build the CSV path from DATADIR and DATAFILE, parse it, and show the first parsed row.
datafile = os.path.join(DATADIR, DATAFILE)
d = parse_file(datafile)
print(d[0])

{'Title': 'Please Please Me', 'Released': '22 March 1963', 'Label': 'Parlophone(UK)', 'UK Chart Position': '1', 'US Chart Position': '—', 'BPI Certification': 'Gold', 'RIAA Certification': 'Platinum'}


In [28]:
# read all lines of csv file

def parse_file2(datafile):
    """Parse every data line of a comma-separated file into a dict.

    The first line supplies the column names; each remaining line is split
    on "," and zipped with those names. Names and values are stripped of
    surrounding whitespace.

    Args:
        datafile: path of the text file to read.

    Returns:
        A list with one ``{column name: value}`` dict per data line.
    """
    def split_row(text):
        # Split on commas and trim whitespace/newlines from every cell.
        return [cell.strip() for cell in text.split(',')]

    with open(datafile, "r") as source:
        columns = split_row(source.readline())
        records = [dict(zip(columns, split_row(row))) for row in source]
    return records

In [38]:
# another way - using csv module
import csv

def parse_file3(datafile):
    """Parse a CSV file into a list of row mappings using ``csv.DictReader``.

    The first row of the file provides the field names; every following row
    becomes one mapping from field name to value.

    Args:
        datafile: path of the CSV file to read.

    Returns:
        A list containing one row mapping per data row, in file order.
    """
    # newline="" hands line endings to the csv module untouched, so a quoted
    # field that contains an embedded "\r\n" is preserved instead of being
    # rewritten by universal-newline translation.
    with open(datafile, "r", newline="") as f:
        # Materialise the rows while the file is still open.
        return list(csv.DictReader(f))

d = parse_file3(datafile)
print(d[0])

OrderedDict([('Title', 'Please Please Me'), ('Released', '22 March 1963'), ('Label', 'Parlophone(UK)'), ('UK Chart Position', '1'), ('US Chart Position', '—'), ('BPI Certification', 'Gold'), ('RIAA Certification', 'Platinum')])


## Intro to XLRD

In [60]:
import xlrd
datafile = "data/2013_ERCOT_Hourly_Load_Data.xls"

# parse excel to list of lists
def parse_file(datafile):
    """Load the first worksheet of an Excel file as a list of row lists.

    Args:
        datafile: path of the workbook to open with ``xlrd``.

    Returns:
        A list with one inner list per sheet row; each inner list holds the
        cell values of that row, column by column.
    """
    book = xlrd.open_workbook(datafile)
    first_sheet = book.sheet_by_index(0)

    table = []
    for row_idx in range(first_sheet.nrows):
        row_cells = []
        for col_idx in range(first_sheet.ncols):
            row_cells.append(first_sheet.cell_value(row_idx, col_idx))
        table.append(row_cells)

    return table

data = parse_file(datafile)

In [57]:
workbook = xlrd.open_workbook(datafile)
sheet = workbook.sheet_by_index(0)

#### ROWS, COLUMNS, and CELLS

In [50]:
# Number of rows in the sheet
sheet.nrows

7296

In [46]:
sheet.ncols

10

In [51]:
# Type of data in cell (row 3, col 2)
sheet.cell_type(3,2)

2

In [52]:
# Value in cell (row 3, col 2)
sheet.cell_value(3,2)

1036.0886969999988

In [53]:
# Get a slice of values in column 3, from rows 1-3
sheet.col_values(3, start_rowx=1, end_rowx=4)

[1411.7505669999982, 1403.4722870000019, 1395.053150000001]

#### DATES

In [55]:
# Type of data in cell (row 1, col 0)
sheet.cell_type(1,0)

3

In [62]:
# Convert time to a Python datetime tuple
exceltime = sheet.cell_value(1,0)
xlrd.xldate_as_tuple(exceltime, 0)

(2013, 1, 1, 1, 0, 0)

#### Exercise

In [89]:
"""
Your task is as follows:
- read the provided Excel file
- find and return the min, max and average values for the COAST region
- find and return the time value for the min and max entries
- the time values should be returned as Python tuples

Please see the test function for the expected return format
"""

import xlrd
import numpy

datafile = "data/2013_ERCOT_Hourly_Load_Data.xls"

def parse_file(datafile):
    """Summarise column 1 (the COAST load, per the exercise text) of the first sheet.

    Args:
        datafile: path of the Excel workbook to open with ``xlrd``.

    Returns:
        A dict with the keys ``'maxvalue'``, ``'minvalue'`` and ``'avgcoast'``
        (maximum, minimum and mean of the column) plus ``'maxtime'`` and
        ``'mintime'``: the column-0 value of the row holding the maximum /
        minimum, converted with ``xlrd.xldate_as_tuple(..., 0)``.
    """
    workbook = xlrd.open_workbook(datafile)
    sheet = workbook.sheet_by_index(0)

    # Both columns skip row 0 (the header), so their indices stay aligned.
    coast_values = sheet.col_values(1, start_rowx=1)
    time_values = sheet.col_values(0, start_rowx=1)

    # The cell imports the package as `numpy` (there is no `np` alias), so the
    # calls must go through that name; `np.` raised NameError on a fresh kernel.
    maxvalue = numpy.max(coast_values)
    minvalue = numpy.min(coast_values)
    avgvalue = numpy.mean(coast_values)

    # Position of the extreme values, used to look up the matching timestamps.
    max_pos = numpy.argmax(coast_values)
    min_pos = numpy.argmin(coast_values)
    maxtime = xlrd.xldate_as_tuple(time_values[max_pos], 0)
    mintime = xlrd.xldate_as_tuple(time_values[min_pos], 0)

    data = {
            'maxtime': maxtime,
            'maxvalue': maxvalue,
            'mintime': mintime,
            'minvalue': minvalue,
            'avgcoast': avgvalue
    }
    return data

parse_file(datafile)

{'avgcoast': 10976.933460679784,
 'maxtime': (2013, 8, 13, 17, 0, 0),
 'maxvalue': 18779.025510000003,
 'mintime': (2013, 2, 3, 4, 0, 0),
 'minvalue': 6602.1138989999818}

In [97]:
# solution without numpy

import xlrd
import pprint

datafile = "data/2013_ERCOT_Hourly_Load_Data.xls"

def parse_file(datafile):
    """Summarise column 1 of the first sheet using only built-ins.

    Args:
        datafile: path of the Excel workbook to open with ``xlrd``.

    Returns:
        A dict with ``'maxvalue'``, ``'minvalue'`` and ``'avgcoast'`` for the
        column, plus ``'maxtime'`` / ``'mintime'``: the column-0 cell of the
        row holding the maximum / minimum, converted with
        ``xlrd.xldate_as_tuple(..., 0)``.
    """
    book = xlrd.open_workbook(datafile)
    sheet = book.sheet_by_index(0)

    # Row 0 is skipped, so list position p corresponds to sheet row p + 1.
    loads = sheet.col_values(1, start_rowx=1)

    peak = max(loads)
    trough = min(loads)
    mean_load = sum(loads) / len(loads)

    # Look up the raw time cell for each extreme (first occurrence wins).
    peak_raw, trough_raw = [
        sheet.cell_value(loads.index(extreme) + 1, 0)
        for extreme in (peak, trough)
    ]
    peak_time, trough_time = [
        xlrd.xldate_as_tuple(raw, 0) for raw in (peak_raw, trough_raw)
    ]

    return {
        'maxtime': peak_time,
        'maxvalue': peak,
        'mintime': trough_time,
        'minvalue': trough,
        'avgcoast': mean_load,
    }

pprint.pprint(parse_file(datafile))

{'avgcoast': 10976.933460679751,
 'maxtime': (2013, 8, 13, 17, 0, 0),
 'maxvalue': 18779.025510000003,
 'mintime': (2013, 2, 3, 4, 0, 0),
 'minvalue': 6602.113898999982}


## Intro to JSON

In [108]:
# To experiment with this code freely you will have to run this code locally.
# Take a look at the main() function for an example of how to use the code.
# We have provided example json output in the other code editor tabs for you to
# look at, but you will not be able to run any queries through our UI.
import json
import requests


BASE_URL = "http://musicbrainz.org/ws/2/"
ARTIST_URL = BASE_URL + "artist/"

# query parameters are given to the requests.get function as a dictionary; this
# variable contains some starter parameters.
query_type = {  "simple": {},
                "atr": {"inc": "aliases+tags+ratings"},
                "aliases": {"inc": "aliases"},
                "releases": {"inc": "releases"}}


def query_site(url, params, uid="", fmt="json"):
    """Send a GET request to the MusicBrainz API and return the parsed JSON.

    Args:
        url: base URL of the endpoint to query.
        params: mapping of query parameters; it is NOT modified.
        uid: optional identifier appended directly to ``url``.
        fmt: value sent as the ``fmt`` query parameter.

    Returns:
        The decoded JSON document when the response status is OK; ``None``
        for any other status that ``raise_for_status()`` does not treat as
        an error.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response.
    """
    # Work on a copy: callers pass the shared `query_type[...]` dicts, and
    # writing "fmt" into them would leak state into every later query.
    request_params = dict(params or {})
    request_params["fmt"] = fmt
    r = requests.get(url + uid, params=request_params)
    print("requesting", r.url)

    if r.status_code == requests.codes.ok:
        return r.json()
    r.raise_for_status()
    return None


def query_by_name(url, params, name):
    """Search for artists by name and return the result of ``query_site``.

    Args:
        url: endpoint URL passed through to ``query_site``.
        params: mapping of starter query parameters; it is NOT modified.
        name: artist name; sent as the ``query`` parameter ``"artist:<name>"``.

    Returns:
        Whatever ``query_site`` returns for the request.
    """
    # Copy before adding "query": the caller usually passes a shared
    # `query_type[...]` dict, and mutating it would make every later request
    # that reuses that dict carry this (stale) artist query.
    search_params = dict(params)
    search_params["query"] = "artist:" + name
    return query_site(url, search_params)


def pretty_print(data, indent=4):
    """Print ``data`` in a readable form.

    Any dict (including subclasses such as ``OrderedDict``) is printed as
    JSON with sorted keys and the given indentation; every other value is
    printed as-is.

    Args:
        data: the value to display.
        indent: number of spaces per JSON nesting level.
    """
    # isinstance (not an exact type comparison) so dict subclasses are
    # pretty-printed too instead of falling through to the plain print.
    if isinstance(data, dict):
        print(json.dumps(data, indent=indent, sort_keys=True))
    else:
        print(data)




# Search the artist endpoint for "Nirvana"; `results` holds the parsed response.
results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
#    pretty_print(results)

# NOTE(review): index 1 is hard-coded, so the artist picked depends on the
# order in which the service returns matches — confirm it is the intended one.
artist_id = results["artists"][1]["id"]
print("\nARTIST:")
pretty_print(results["artists"][1])

# Fetch that artist again by id, this time asking for its releases.
artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
releases = artist_data["releases"]
print("\nONE RELEASE:")
pretty_print(releases[0], indent=2)
release_titles = [r["title"] for r in releases]

print("\nALL TITLES:")
for t in release_titles:
    print(t)


requesting http://musicbrainz.org/ws/2/artist/?query=artist%3ANirvana&fmt=json

ARTIST:
{
    "area": {
        "id": "6a264f94-6ff1-30b1-9a81-41f7bfabd616",
        "name": "Finland",
        "sort-name": "Finland"
    },
    "country": "FI",
    "disambiguation": "Early 1980's Finnish punk band",
    "id": "85af0709-95db-4fbc-801a-120e9f4766d0",
    "life-span": {
        "ended": null
    },
    "name": "Nirvana",
    "score": "100",
    "sort-name": "Nirvana",
    "tags": [
        {
            "count": 1,
            "name": "punk"
        },
        {
            "count": 1,
            "name": "finland"
        }
    ],
    "type": "Group"
}
requesting http://musicbrainz.org/ws/2/artist/85af0709-95db-4fbc-801a-120e9f4766d0?inc=releases&fmt=json

ONE RELEASE:
{
  "barcode": "",
  "country": "FI",
  "date": "1980",
  "disambiguation": "",
  "id": "3e25396c-5c66-4609-8e47-37f250d323c7",
  "packaging": "Cardboard/Paper Sleeve",
  "packaging-id": "f7101ce3-0384-39ce-9fde-fbbd0044d35

#### Quiz: How many bands are named "First Aid Kit"?


In [111]:
results = query_by_name(ARTIST_URL, query_type["simple"], "FIRST AID KIT")
# NOTE(review): `results` is the whole response mapping (other cells index it
# with "artists"), so len() here counts its top-level keys rather than the
# bands that matched — to answer the quiz, count the entries of
# results["artists"] whose name matches. TODO confirm and re-run.
len(results)


requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AFIRST+AID+KIT&fmt=json


4

#### Quiz: Begin-area name for Queen?


In [115]:
results = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
# NOTE(review): index 2 is hard-coded; which artist sits at that position
# depends on the order of the search results — confirm it is the intended one.
results['artists'][2]['begin-area']['name']

requesting http://musicbrainz.org/ws/2/artist/?query=artist%3AQueen&fmt=json


'London'