# Count the number of lines in each file using Python

## 1) Command Line

Use shell commands with the `!` notation to count the number of lines in `bookings.csv.bz2` and `searches.csv.bz2`.

In [1]:
# Decompress to stdout and pipe into `wc -l`; nothing is written to disk.
# `wc -l` counts newline characters, so every line is counted (including any
# header row) except a final line that lacks a trailing newline.
!bzcat searches.csv.bz2 | wc -l

20390198


In [2]:
# Decompress to stdout and pipe into `wc -l`; nothing is written to disk.
# `wc -l` counts newline characters, so every line is counted (including any
# header row) except a final line that lacks a trailing newline.
!bzcat bookings.csv.bz2 | wc -l

10000011


## 2) Python:

We have 2 options:

* uncompressing the whole file to disk first, then reading the uncompressed copy.

* without uncompressing: better, because we use no extra disk space and don't clutter the drive with temporary files.


#### Python without uncompressing

In [14]:
import pandas as pd
import bz2

In [15]:
# Open the bz2-compressed, '^'-separated file as a lazy chunked reader
# (100000 rows per chunk) so the whole table never has to fit in memory.
# header=0 makes the first line the column names, so it is not a data row.
# chunksize is documented as an int, so it is given as an integer literal
# (100000 is the same value as the float 10e4).
# The reader is a one-shot iterator: re-run this cell before iterating again.
csv = pd.read_csv('searches.csv.bz2', header=0, compression='bz2', sep='^', chunksize=100000)

In [16]:
%%time
# Total number of data rows: add up the length of every chunk the reader
# yields. Iterating consumes the `csv` reader.
shape = sum(len(chunk) for chunk in csv)
print(shape)



20390198
CPU times: user 8min 58s, sys: 29.2 s, total: 9min 27s
Wall time: 11min 50s


In [17]:
%%time
# Count lines by streaming the compressed file; no uncompressed copy is written.
# Iterating the file object yields one item per line, so the header line is
# counted too, as is a final line without a trailing newline.
count = 0
# The `with` block closes the file handle when counting finishes or fails,
# instead of leaving it open for the lifetime of the kernel.
with bz2.BZ2File('searches.csv.bz2') as bzfile:
    for line in bzfile:
        count += 1

print(count)

20390199
CPU times: user 7min 19s, sys: 16.5 s, total: 7min 36s
Wall time: 7min 57s


In [18]:
# Open the bz2-compressed, '^'-separated file as a lazy chunked reader
# (100000 rows per chunk) so the whole table never has to fit in memory.
# header=0 makes the first line the column names, so it is not a data row.
# chunksize is documented as an int, so it is given as an integer literal
# (100000 is the same value as the float 10e4).
# The reader is a one-shot iterator: re-run this cell before iterating again.
csv = pd.read_csv('bookings.csv.bz2', header=0, compression='bz2', sep='^', chunksize=100000)

In [19]:
%%time
# Total number of data rows: add up the length of every chunk the reader
# yields. Iterating consumes the `csv` reader.
shape = sum(len(chunk) for chunk in csv)
print(shape)



10000010
CPU times: user 7min 51s, sys: 18.2 s, total: 8min 9s
Wall time: 8min 30s


In [20]:
%%time
# Count lines by streaming the compressed file; no uncompressed copy is written.
# Iterating the file object yields one item per line, so the header line is
# counted too, as is a final line without a trailing newline.
count = 0
# The `with` block closes the file handle when counting finishes or fails,
# instead of leaving it open for the lifetime of the kernel.
with bz2.BZ2File('bookings.csv.bz2') as bzfile:
    for line in bzfile:
        count += 1

print(count)

10000011
CPU times: user 5min 22s, sys: 8.51 s, total: 5min 30s
Wall time: 5min 48s
