In [1]:
!pip install tabula-py xlrd==1.2.0 lxml --quiet

[K     |████████████████████████████████| 12.0 MB 3.6 MB/s 
[K     |████████████████████████████████| 103 kB 66.5 MB/s 
[?25h

# Working with Headers 

In [2]:
import pandas as pd
import numpy as np
from urllib.request import urlretrieve

In [3]:
# Fetch the first example CSV from the workshop repository and store it
# next to the notebook as ex1.csv. The call's return value (local path and
# response headers) is left as the cell output.
url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Wrangling-Workshop/master/Chapter05/datasets/CSV_EX_1.csv'
)
urlretrieve(url, filename='ex1.csv')

('ex1.csv', <http.client.HTTPMessage at 0x7f91a5566f90>)

In [4]:
# Baseline read: rely on pandas' header inference, which takes the first
# line of ex1.csv as the column names.
df = pd.read_csv(
    'ex1.csv',
    header='infer',
)
df

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


### Read a .csv file with no header

In [5]:
# Fetch the second example file and store it locally as ex2.csv.
url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Wrangling-Workshop/master/Chapter05/datasets/CSV_EX_2.csv'
)
urlretrieve(url, filename='ex2.csv')

# Read it with default header inference. The file has no header line, so
# pandas promotes the first data row to column names.
df2 = pd.read_csv(
    'ex2.csv',
    header='infer',
)
df2

Unnamed: 0,2,1500,Good,300000
0,3,1300,Fair,240000
1,3,1900,Very good,450000
2,3,1850,Bad,280000
3,2,1640,Good,310000


In [6]:
# Re-read ex2.csv, this time telling pandas there is no header line:
# every line is kept as data and the columns are numbered from 0.
df2 = pd.read_csv(
    'ex2.csv',
    header=None,
)
df2

Unnamed: 0,0,1,2,3
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


### Add the `names` argument to get the correct headers

In [7]:
# The file carries no header line, so keep header=None and supply the
# column labels explicitly through `names`.
df2 = pd.read_csv(
    'ex2.csv',
    names=['Bedroom', 'Sq.ft', 'Locality', 'Price($)'],
    header=None,
)
df2

Unnamed: 0,Bedroom,Sq.ft,Locality,Price($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


# Reading from a CSV File Where Delimiters Are Not Commas

In [8]:
# Fetch the third example file and store it locally as ex3.csv.
url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Wrangling-Workshop/master/Chapter05/datasets/CSV_EX_3.csv'
)
urlretrieve(url, filename='ex3.csv')

# Read it with the default comma separator. The file is semicolon-delimited,
# so each whole line ends up in a single column.
df3 = pd.read_csv(
    'ex3.csv',
    sep=',',
)
df3

Unnamed: 0,Bedroom; Sq. foot; Locality; Price ($)
0,2; 1500; Good; 300000
1,3; 1300; Fair; 240000
2,3; 1900; Very good; 450000
3,3; 1850; Bad; 280000
4,2; 1640; Good; 310000


In [9]:
# Re-read ex3.csv with the correct delimiter. The fields in this file are
# separated by "; " (semicolon followed by a space), so sep=';' alone would
# leave a leading space in every column name after the first and in the
# string values; skipinitialspace=True drops the space after each delimiter.
df3 = pd.read_csv('ex3.csv', sep=';', skipinitialspace=True)
df3

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


# Bypassing and Renaming the Headers of a CSV File

In [11]:
# Passing `names` without `header` makes pandas treat every line as data,
# so the file's own header line appears as the first row of the frame.
df4 = pd.read_csv(
    'ex1.csv',
    names=['A', 'B', 'C', 'D'],
)
df4

Unnamed: 0,A,B,C,D
0,Bedroom,Sq. foot,Locality,Price ($)
1,2,1500,Good,300000
2,3,1300,Fair,240000
3,3,1900,Very good,450000
4,3,1850,Bad,280000
5,2,1640,Good,310000


In [12]:
# header=0 marks the first line as the header to be replaced, and `names`
# supplies the replacement labels, so the original header is dropped.
df4 = pd.read_csv(
    'ex1.csv',
    names=['A', 'B', 'C', 'D'],
    header=0,
)
df4

Unnamed: 0,A,B,C,D
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


# Skipping Initial Rows and Footers When Reading a CSV File

In [13]:
# Fetch the example file that starts with non-data lines and store it
# locally as ex5.csv.
url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Wrangling-Workshop/master/Chapter05/datasets/CSV_EX_skiprows.csv'
)
urlretrieve(url, filename='ex5.csv')

# Read it without skipping anything: the two leading description lines are
# parsed as the header and the first row, pushing the real header down.
df5 = pd.read_csv(
    'ex5.csv',
    skiprows=None,
)
df5

Unnamed: 0,Filetype: CSV,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Info about some houses,,
1,Bedroom,Sq. foot,Locality,Price ($)
2,2,1500,Good,300000
3,3,1300,Fair,240000
4,3,1900,Very good,450000
5,3,1850,Bad,280000
6,2,1640,Good,310000


In [14]:
# Skip the two description lines at the top of ex5.csv so that the real
# header line is the first one pandas parses.
df5 = pd.read_csv(
    'ex5.csv',
    skiprows=2,
)
df5

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [15]:
# Fetch the example file that has extra lines at both the top and the
# bottom, and store it locally as ex6.csv.
url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Wrangling-Workshop/master/Chapter05/datasets/CSV_EX_skipfooter.csv'
)
urlretrieve(url, filename='ex6.csv')

# Read it without skipping anything: the leading description lines and the
# trailing "end of file" line all show up as rows.
df6 = pd.read_csv(
    'ex6.csv',
    skiprows=None,
    skipfooter=0,
)
df6

Unnamed: 0,Filetype: CSV,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Info about some houses,,
1,Bedroom,Sq. foot,Locality,Price ($)
2,2,1500,Good,300000
3,3,1300,Fair,240000
4,3,1900,Very good,450000
5,3,1850,Bad,280000
6,2,1640,Good,310000
7,,This is the end of file,,


In [16]:
# Drop the two leading description lines and the single trailing footer
# line. skipfooter is not supported by the default C parser, hence the
# explicit engine='python'.
df6 = pd.read_csv(
    'ex6.csv',
    engine='python',
    skiprows=2,
    skipfooter=1,
)
df6

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000
