###Working with Large Excel Files in Pandas

##### Websites Used for the Tutorial

In [3]:
# Web Site for Tutorial
website = 'https://realpython.com/blog/python/working-with-large-excel-files-in-pandas/'
import webbrowser
webbrowser.open(website)

True

In [4]:
# Web Site for Data Source Files
website = 'https://data.gov.uk/dataset/road-accidents-safety-data/resource/80b76aec-a0a1-4e14-8235-09cc6b92574a'
import webbrowser
webbrowser.open(website)

True

#####Analyze the File

In [None]:
# Set up the environment
import pandas as pd
import numpy as np

In [13]:
# Import the excel data into a DataFrame
# Check the first 10 rows
# Note - File reads slowly
df = pd.read_csv("Accidents7904.csv", low_memory=False)
df.head(10)

Unnamed: 0,﻿Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
0,197901A11AD14,,,,,1,3,2,1,18/01/1979,...,-1,-1,1,8,1,-1,0,-1,-1,
1,197901A1BAW34,198460.0,894000.0,,,1,3,1,1,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,
2,197901A1BFD77,406380.0,307000.0,,,1,3,2,3,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,
3,197901A1BGC20,281680.0,440000.0,,,1,3,2,2,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,
4,197901A1BGF95,153960.0,795000.0,,,1,2,2,1,01/01/1979,...,-1,-1,4,3,3,-1,0,-1,-1,
5,197901A1CBC96,300370.0,146000.0,,,1,3,1,1,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,
6,197901A1DAK71,143370.0,951000.0,,,1,3,2,2,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,
7,197901A1DAP95,471960.0,845000.0,,,1,3,2,1,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,
8,197901A1EAC32,323880.0,632000.0,,,1,2,1,1,01/01/1979,...,-1,-1,4,3,3,-1,0,-1,-1,
9,197901A1FBK75,136380.0,245000.0,,,1,3,2,1,01/01/1979,...,-1,-1,4,8,3,-1,0,-1,-1,


In [14]:
# Get number of rows in DataFrame
df.shape[0]

6224198

In [33]:
# Get list of headers
list(df.columns.values)

['\xef\xbb\xbfAccident_Index',
 'Location_Easting_OSGR',
 'Location_Northing_OSGR',
 'Longitude',
 'Latitude',
 'Police_Force',
 'Accident_Severity',
 'Number_of_Vehicles',
 'Number_of_Casualties',
 'Date',
 'Day_of_Week',
 'Time',
 'Local_Authority_(District)',
 'Local_Authority_(Highway)',
 '1st_Road_Class',
 '1st_Road_Number',
 'Road_Type',
 'Speed_limit',
 'Junction_Detail',
 'Junction_Control',
 '2nd_Road_Class',
 '2nd_Road_Number',
 'Pedestrian_Crossing-Human_Control',
 'Pedestrian_Crossing-Physical_Facilities',
 'Light_Conditions',
 'Weather_Conditions',
 'Road_Surface_Conditions',
 'Special_Conditions_at_Site',
 'Carriageway_Hazards',
 'Urban_or_Rural_Area',
 'Did_Police_Officer_Attend_Scene_of_Accident',
 'LSOA_of_Accident_Location']

In [34]:
# Grab all rows where accidents occur on a Sunday
# The code for Sunday is 1
# This is sourced from the Road Accident Safety Data Guide file
df[df['Day_of_Week']==1]

Unnamed: 0,﻿Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
385,197901A7ADD27,247480,27000,,,1,3,1,1,07/01/1979,...,-1,-1,4,8,2,-1,0,-1,-1,
386,197901A7ADI29,281180,439000,,,1,3,1,1,07/01/1979,...,-1,-1,4,2,2,-1,0,-1,-1,
387,197901A7AHE55,129370,535000,,,1,3,2,2,07/01/1979,...,-1,-1,4,8,2,-1,0,-1,-1,
388,197901A7AJC53,372980,353000,,,1,3,3,2,07/01/1979,...,-1,-1,4,8,2,-1,0,-1,-1,
389,197901A7BHD25,258470,885000,,,1,3,2,1,07/01/1979,...,0,5,4,8,2,-1,0,-1,-1,
390,197901A7CCC20,263880,460000,,,1,2,1,1,07/01/1979,...,0,5,4,8,2,-1,0,-1,-1,
391,197901A7CDC91,312070,601000,,,1,3,2,1,07/01/1979,...,-1,-1,4,8,2,-1,0,-1,-1,
392,197901A7CKD97,453260,887000,,,1,2,1,1,07/01/1979,...,-1,-1,4,2,2,-1,0,-1,-1,
393,197901A7EAC13,279880,73000,,,1,1,2,1,07/01/1979,...,-1,-1,4,8,2,-1,0,-1,-1,
394,197901A7EBC28,289680,428000,,,1,3,2,1,07/01/1979,...,-1,-1,4,8,2,-1,0,-1,-1,


In [35]:
# Get number of accidents on a Sunday
# Count number of rows in DataFrame where Day_of_Week = 1
# For all rows
df[df['Day_of_Week']==1].count()

﻿Accident_Index                                693847
Location_Easting_OSGR                          692648
Location_Northing_OSGR                         692648
Longitude                                      148358
Latitude                                       148358
Police_Force                                   693847
Accident_Severity                              693847
Number_of_Vehicles                             693847
Number_of_Casualties                           693847
Date                                           693846
Day_of_Week                                    693847
Time                                           693744
Local_Authority_(District)                     693847
Local_Authority_(Highway)                      693847
1st_Road_Class                                 693847
1st_Road_Number                                693847
Road_Type                                      693847
Speed_limit                                    693847
Junction_Detail             

In [36]:
# Get number of accidents on a Sunday
# Count number of rows in DataFrame where Day_of_Week = 1
# Just show number of rows for Day_of_Week column
len(df[df['Day_of_Week']==1])

693847

In [43]:
# Or Use
accidents_sunday = df[df['Day_of_Week']==1]
len(accidents_sunday)

693847

In [55]:
# Or Use
accidents_sunday = df[df['Day_of_Week']==1]
print("Accidents on Sunday: {0}".format(
        len(accidents_sunday)))

Accidents on Sunday: 693847


In [56]:
# Grab number of accidents on a Sunday
# And involving more than 20 cars
accidents_sunday_twenty_cars = df[
    (df.Day_of_Week == 1) & (df.Number_of_Vehicles > 20)]
print("Accidents which happened on a Sunday involving > 20 cars: {0}".format(
    len(accidents_sunday_twenty_cars)))

Accidents which happened on a Sunday involving > 20 cars: 10
