# Demo large dataset processing

This demo illustrates how quickly Python can open large datasets and then merge and group the data.

In [2]:
# load the earthquake data and population data from the pickle files in the data directory
# and plot the data on a map of the world

import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



## Loading a csv file

The file `earthquake_data.csv` contains 500'000 lines of dummy earthquake data between 1974 and 2023.

The CSV file is a plain text file that contains one line per data point:

```text
Timestamp,City,Magnitude,Depth (km)
1974-01-01 00:00:00.000000000,Los Angeles,8.849241726842655,49.37216665723409
1974-01-01 00:52:19.609479218,Istanbul,5.8757724072988795,24.514696519602126
1974-01-01 01:44:39.218958437,Istanbul,3.8669320572392474,37.61128303473302
1974-01-01 02:36:58.828437656,Kathmandu,4.844319224595527,1.3586515466092512
1974-01-01 03:29:18.437916875,Los Angeles,6.5682598775992505,93.28988214245696
...
```


In [21]:
import os

# Report the on-disk size of the CSV file, converted from bytes to megabytes.
csv_size_bytes = os.stat('../data/earthquake_data.csv').st_size
csv_size_mb = csv_size_bytes / 1024 / 1024
print('The file earthquake.csv contains  %.2f MB or raw data' % csv_size_mb)


The file earthquake.csv contains  36.56 MB or raw data


In [3]:
# Test 1: time how long pandas needs to read the earthquake data from the CSV file.
# %timeit repeats the statement and reports the mean ± std. dev. of the run times.
%timeit  df = pd.read_csv('../data/earthquake_data.csv')

# Reference timing on the author's machine: about 362 ms to load the data
# 362 ms ± 24.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# For comparison, try opening the same file in Excel.

367 ms ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
# Test 2: time how long it takes to load the earthquake data from its pickle file.
# This cell is only a timing measurement: the next cell loads the pickle again
# into earthquake_data for use in the rest of the notebook.
%timeit with open('../data/earthquake_data.pkl', 'rb') as f: earthquake_data = pickle.load(f)

# Reference timing on the author's machine: 9.61 ms on average to load the pickle file
# 9.61 ms ± 471 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

9.8 ms ± 801 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Load the earthquake DataFrame from its pickle file for use in the following cells.
# NOTE: pickle.load executes code embedded in the file, so only unpickle trusted files.
with open('../data/earthquake_data.pkl', 'rb') as earthquake_pickle:
    earthquake_data = pickle.load(earthquake_pickle)

In [8]:
# Load the city population DataFrame (the lookup table used by the merge further down).
# NOTE: pickle.load executes code embedded in the file, so only unpickle trusted files.
with open('../data/city_population_data.pkl', 'rb') as population_pickle:
    population_data = pickle.load(population_pickle)


In [9]:
# Print a summary of the earthquake DataFrame: index range, columns, non-null counts, dtypes and memory usage.
earthquake_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   Timestamp   500000 non-null  datetime64[ns]
 1   City        500000 non-null  object        
 2   Magnitude   500000 non-null  float64       
 3   Depth (km)  500000 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 15.3+ MB


In [10]:
# Print the same kind of summary for the (much smaller) city population DataFrame.
population_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   City        7 non-null      object
 1   Country     7 non-null      object
 2   Population  7 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 300.0+ bytes


In [11]:
# Display only the city name and population columns of the lookup table.
city_population_columns = ['City', 'Population']
population_data[city_population_columns]

Unnamed: 0,City,Population
0,Tokyo,37000
1,San Francisco,883
2,Los Angeles,3800
3,Mexico City,21000
4,Anchorage,290
5,Istanbul,15000
6,Kathmandu,1000


In [12]:

# Time the merge of the two DataFrames on their shared 'City' column.
# This is the equivalent of creating a table in Excel and adding a VLOOKUP column
# formula that looks up the population and country for each earthquake's city.
%timeit combined_data = pd.merge(earthquake_data, population_data[['City','Population','Country']], on='City')

# Reference timing on the author's machine: 43.8 ms on average to merge the two datasets
# 43.8 ms ± 2.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

48 ms ± 8.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# Attach each city's population and country to every earthquake row by joining on 'City'.
city_lookup_columns = ['City', 'Population', 'Country']
combined_data = earthquake_data.merge(population_data[city_lookup_columns], on='City')

In [None]:

# Time a group-by aggregation: the median and mean magnitude per country, rounded to 3 decimals.
%timeit country_stats = combined_data.groupby('Country')['Magnitude'].agg(['median', 'mean']).round(3)

# Reference timing on the author's machine: 78 ms on average to calculate the median and mean magnitude for each country
# 78 ms ± 2.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

78 ms ± 4.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Group the magnitudes by country, then compute the median and mean of each group (3 decimals).
magnitude_by_country = combined_data.groupby('Country')['Magnitude']
country_stats = magnitude_by_country.agg(['median', 'mean']).round(3)

# Leave the result as the last expression so the notebook renders it as a table.
country_stats

Unnamed: 0_level_0,median,mean
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Japan,5.998,6.0
Mexico,5.989,5.992
Nepal,6.0,5.998
Turkey,5.981,5.992
USA,6.009,6.006
