# Excel Parsing


In [1]:
import pandas
import numpy

## Import and inspect data

In [2]:
# Read the "Table 3" worksheet of the workbook into a DataFrame.
# NOTE(review): the path is absolute and machine-specific — confirm whether it
# should be made relative/configurable before sharing this notebook.
data = pandas.read_excel(
    "/Users/danielcorcoran/PycharmProjects/daniels_mac_proj/datasets/csa_data.xlsx",
    sheet_name="Table 3",
)

In [3]:
# Summarise column dtypes and non-null counts to see which columns are empty.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 10 columns):
Unnamed: 0    0 non-null float64
Unnamed: 1    6 non-null object
Unnamed: 2    84 non-null object
Unnamed: 3    85 non-null object
Unnamed: 4    84 non-null object
Unnamed: 5    84 non-null object
Unnamed: 6    84 non-null object
Unnamed: 7    84 non-null object
Unnamed: 8    0 non-null float64
Unnamed: 9    84 non-null object
dtypes: float64(2), object(8)
memory usage: 8.2+ KB


In [4]:
# Show the raw frame as loaded; its leading rows are blank padding above the table title.
data

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,,,,,,,,
7,,Table 3. Family incidents recorded by police r...,,,,,,,,
8,,,,,,,,,,
9,,,,Family incidents,,,,,,


## Drop columns and rows where all cells are null

In [6]:
# Remove every column that holds nothing but nulls.
data2 = data.dropna(axis="columns", how="all")

In [7]:
# Remove every row that holds nothing but nulls.
data3 = data2.dropna(axis="index", how="all")

In [8]:
# Inspect what remains once the fully-empty rows and columns are gone.
data3

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 9
7,Table 3. Family incidents recorded by police r...,,,,,,,
9,,,Family incidents,,,,,
10,,,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
13,North West Metro Region,Banyule,1170,1222,1342,1317,1303,-0.0106302
14,,Brimbank,1927,2143,2480,2725,2246,-0.17578
15,,Darebin,1497,1682,1605,1646,1645,-0.000607533
16,,Hobsons Bay,804,971,1057,1146,1016,-0.113438
17,,Hume,2546,2906,3013,3096,3040,-0.0180879
18,,Maribyrnong,758,853,835,929,783,-0.157158
19,,Melbourne,1052,1155,1280,1464,1451,-0.00887978


## Drop the first two rows; the headers are located beneath them

In [9]:
# Rows labelled 7 and 9 hold the table title and the "Family incidents" banner,
# not data; the actual header row sits below them.
data4 = data3.drop(index=[7, 9])

In [10]:
# Preview the first rows: the period headers are now the first row.
data4.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 9
10,,,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
13,North West Metro Region,Banyule,1170,1222,1342,1317,1303,-0.0106302
14,,Brimbank,1927,2143,2480,2725,2246,-0.17578
15,,Darebin,1497,1682,1605,1646,1645,-0.000607533
16,,Hobsons Bay,804,971,1057,1146,1016,-0.113438


## Drop the first column; it's useless

In [11]:
# Discard the "Unnamed: 1" column (region labels such as
# "North West Metro Region"), which this notebook does not use.
data5 = data4.drop(columns=["Unnamed: 1"])

In [12]:
# Preview the frame without the "Unnamed: 1" column.
data5.head()

Unnamed: 0,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 9
10,,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
13,Banyule,1170,1222,1342,1317,1303,-0.0106302
14,Brimbank,1927,2143,2480,2725,2246,-0.17578
15,Darebin,1497,1682,1605,1646,1645,-0.000607533
16,Hobsons Bay,804,971,1057,1146,1016,-0.113438


## Reset index

In [13]:
# Renumber the rows from 0; drop=True discards the old index labels instead of
# keeping them as an extra column.
data6 = data5.reset_index(drop = True)

In [14]:
# Preview the frame with its fresh 0-based index.
data6.head()

Unnamed: 0,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 9
0,,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
1,Banyule,1170,1222,1342,1317,1303,-0.0106302
2,Brimbank,1927,2143,2480,2725,2246,-0.17578
3,Darebin,1497,1682,1605,1646,1645,-0.000607533
4,Hobsons Bay,804,971,1057,1146,1016,-0.113438


## Store headers in a list from the first row

In [15]:
# Take the first row (the period labels) as a plain Python list of header values.
headers = data6.iloc[0].tolist()

In [16]:
# Show the extracted header values; the first entry is nan.
headers

[nan,
 'Jul 2012 - Jun 2013',
 'Jul 2013 - Jun 2014',
 'Jul 2014 - Jun 2015',
 'Jul 2015 - Jun 2016',
 'Jul 2016 - Jun 2017',
 '% change 2016 - 2017']

## Change the first item to a suitable name, since it is NaN

In [17]:
# The first header is nan, so give that column an explicit label.
headers[0] = "Area"
# Relabel data6's columns with the header list (this modifies data6 itself).
data6.columns = headers

In [18]:
# Preview the frame with the new column labels; row 0 still repeats them.
data6.head()

Unnamed: 0,Area,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
0,,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
1,Banyule,1170,1222,1342,1317,1303,-0.0106302
2,Brimbank,1927,2143,2480,2725,2246,-0.17578
3,Darebin,1497,1682,1605,1646,1645,-0.000607533
4,Hobsons Bay,804,971,1057,1146,1016,-0.113438


## After assigning the new headers, drop the row that contained them and reset the index once again

In [19]:
# Row 0 only repeats the header labels now that they are the column names,
# so remove it.
data7 = data6.drop(index=[0])

In [20]:
# Renumber the rows from 0 again after removing the header row; the old index
# labels are discarded (drop=True).
data8 = data7.reset_index(drop = True)

In [21]:
# Preview the data rows under the new headers.
data8.head()

Unnamed: 0,Area,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
0,Banyule,1170,1222,1342,1317,1303,-0.0106302
1,Brimbank,1927,2143,2480,2725,2246,-0.17578
2,Darebin,1497,1682,1605,1646,1645,-0.000607533
3,Hobsons Bay,804,971,1057,1146,1016,-0.113438
4,Hume,2546,2906,3013,3096,3040,-0.0180879


## Some rows contain aggregates (sub-totals); they need to be removed

In [22]:
# Will hold the row positions of the aggregate rows to remove.
drop_list = []


In [23]:
# Collect the positional index of every row whose first-column value mentions
# "total" (case-insensitive), e.g. the "Sub total" aggregate rows.
# The list is rebuilt from scratch on every run, so re-executing this cell can
# never append duplicate positions to a drop_list left over from an earlier run.
drop_list = [
    row_index
    for row_index, name in enumerate(data8.iloc[:, 0])
    # str() guards against non-string cells (e.g. nan) before lower-casing.
    if "total" in str(name).lower()
]

In [24]:
# Show the row positions flagged for removal.
drop_list

[14, 40, 51, 82]

## The rows with index 14, 40, 51, 82 need to be dropped  

In [25]:
# Look at the flagged rows to confirm they are the aggregate ("Sub total") rows
# before dropping them.
data8.iloc[drop_list]

Unnamed: 0,Area,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
14,Sub total,18705,20624,22687,24633,23657,-0.0396216
40,Sub total,14997,16479,17959,20156,19155,-0.0496626
51,Sub total,14835,15197,16087,17579,18213,0.0360658
82,Sub total1,12000,12865,14155,15626,15465,-0.0103033


In [26]:
# Remove the flagged aggregate rows, then renumber the remaining rows from 0.
data9 = data8.drop(index=drop_list).reset_index(drop=True)

In [27]:
# Display the frame with the aggregate rows removed.
data9

Unnamed: 0,Area,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
0,Banyule,1170,1222,1342,1317,1303,-0.0106302
1,Brimbank,1927,2143,2480,2725,2246,-0.17578
2,Darebin,1497,1682,1605,1646,1645,-0.000607533
3,Hobsons Bay,804,971,1057,1146,1016,-0.113438
4,Hume,2546,2906,3013,3096,3040,-0.0180879
5,Maribyrnong,758,853,835,929,783,-0.157158
6,Melbourne,1052,1155,1280,1464,1451,-0.00887978
7,Melton,1422,1620,1942,2110,2104,-0.0028436
8,Moonee Valley,934,915,1035,1097,1054,-0.0391978
9,Moreland,1456,1462,1663,1860,1799,-0.0327957


## Finally, drop the last two rows, which contain nulls

In [28]:
# Keep only the rows in which every cell is populated.
data10 = data9.dropna(axis="index", how="any")

In [29]:
# Display the final cleaned frame.
data10

Unnamed: 0,Area,Jul 2012 - Jun 2013,Jul 2013 - Jun 2014,Jul 2014 - Jun 2015,Jul 2015 - Jun 2016,Jul 2016 - Jun 2017,% change 2016 - 2017
0,Banyule,1170,1222,1342,1317,1303,-0.0106302
1,Brimbank,1927,2143,2480,2725,2246,-0.17578
2,Darebin,1497,1682,1605,1646,1645,-0.000607533
3,Hobsons Bay,804,971,1057,1146,1016,-0.113438
4,Hume,2546,2906,3013,3096,3040,-0.0180879
5,Maribyrnong,758,853,835,929,783,-0.157158
6,Melbourne,1052,1155,1280,1464,1451,-0.00887978
7,Melton,1422,1620,1942,2110,2104,-0.0028436
8,Moonee Valley,934,915,1035,1097,1054,-0.0391978
9,Moreland,1456,1462,1663,1860,1799,-0.0327957


## Optional: export the cleaned data to CSV/Excel

In [31]:
# Write the cleaned table next to the notebook in both CSV and Excel form.
# NOTE(review): with the default arguments the row index is written as an extra
# leading column in both files — confirm that is wanted.
csv_target = "cleaned_excel_table3.csv"
xlsx_target = "cleaned_excel_table3.xlsx"
data10.to_csv(csv_target)
data10.to_excel(xlsx_target)