# Exploring Data with Pandas

This notebook walks through examples of exploratory data analysis with pandas. See the [episode README](README.md) for the full discussion.

In [5]:
import pandas as pd

In [3]:
# Relative path of the directory that holds the input CSV files.
data_dir = '../../../data/input/ch1/'

# Each input path is the shared directory prefix followed by the file name.
airports_file = f'{data_dir}airports.csv'
routes_file = f'{data_dir}routes.csv'

In [3]:
# Load the routes table; header=0 takes the column names from the first line of the CSV.
routes = pd.read_csv(routes_file, header=0)

# Preview the frame: default head, first two rows, last three rows, then (rows, columns).
for preview in (routes.head(), routes.head(2), routes.tail(3), routes.shape):
    print(preview)

# info() writes its summary directly to stdout, so it is not wrapped in print().
routes.info()

  airline  src dest codeshare  stops equipment
0      2B  ASF  KZN       NaN      0       CR2
1      2B  ASF  MRV       NaN      0       CR2
2      2B  CEK  KZN       NaN      0       CR2
3      2B  CEK  OVB       NaN      0       CR2
4      2B  DME  KZN       NaN      0       CR2
  airline  src dest codeshare  stops equipment
0      2B  ASF  KZN       NaN      0       CR2
1      2B  ASF  MRV       NaN      0       CR2
      airline  src dest codeshare  stops equipment
67659      ZM  FRU  DME       NaN      0       734
67660      ZM  FRU  OSS       NaN      0       734
67661      ZM  OSS  FRU       NaN      0       734
(67662, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67662 entries, 0 to 67661
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   airline    67662 non-null  object
 1   src        67662 non-null  object
 2   dest       67662 non-null  object
 3   codeshare  14597 non-null  object
 4   stops      6766

Another useful preview function is `count()`, which reports the number of non-null values in each column and so helps us quickly identify columns with missing values:

In [5]:
# Non-null tally per column; a tally below the row total means the column has missing values.
non_null_counts = routes.count()
print(non_null_counts)

airline      67662
src          67662
dest         67662
codeshare    14597
stops        67662
equipment    67644
dtype: int64


In [7]:
# info() prints its summary to stdout itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67662 entries, 0 to 67661
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   airline    67662 non-null  object
 1   src        67662 non-null  object
 2   dest       67662 non-null  object
 3   codeshare  14597 non-null  object
 4   stops      67662 non-null  int64 
 5   equipment  67644 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.1+ MB
None


In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as returned by describe().
summary_stats = routes.describe()
print(summary_stats)

              stops
count  67662.000000
mean       0.000163
std        0.012749
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000


In [14]:
# Report the smallest and largest values of the `stops` column on two lines.
# There is deliberately no space after the newline, so both lines start in the
# same column.
print(f"Fewest stops: {routes.stops.min()}\nMost stops: {routes.stops.max()}")

Fewest stops: 0
 Most stops: 1


In [8]:
# Compare the frame's shape after de-duplicating under three settings:
#   1. defaults (all columns must match),
#   2. the same, but with the result's index renumbered,
#   3. rows treated as duplicates when only `src` and `dest` match.
for dedup_options in (
    {},
    {'ignore_index': True},
    {'ignore_index': True, 'subset': ['src', 'dest']},
):
    print(routes.drop_duplicates(**dedup_options).shape)

(67662, 6)
(67662, 6)
(37594, 6)


# NOTE(review): this import sits mid-notebook rather than in the imports cell at the top.
# NOTE(review): the pandas-profiling package appears to have been renamed
# ydata-profiling upstream -- confirm which package the target environment provides.
from pandas_profiling import ProfileReport

# Build a profiling report over the `routes` frame and display it in the
# notebook via to_notebook_iframe().
profile = ProfileReport(routes, title="Pandas Profiling Report")
profile.to_notebook_iframe()

In [6]:
# Path of the fruits CSV, built from the shared data directory.
fruit_file = data_dir + 'fruits.csv'

# Column labels are supplied explicitly through `names`, so every line of the
# file is read as a data row.
fruits = pd.read_csv(fruit_file, names=['fruit', 'day', 'count'])
print(fruits.head())

# info() prints its summary itself and returns None; calling it bare avoids a
# stray "None" line in the output.
fruits.info()

    fruit        day  count
0   apple      today      4
1  cherry  yesterday      1
2   apple  yesterday      1
3   apple  yesterday      1
4  cherry      today      4
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   fruit   25 non-null     object
 1   day     25 non-null     object
 2   count   25 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 728.0+ bytes
None


In [30]:
# Total of the `count` column across every row.
print(fruits[['count']].sum())

# Select only the apple rows, keeping `count` as a one-column frame.
is_apple = fruits['fruit'] == 'apple'
apple_count = fruits.loc[is_apple, ['count']]
print(apple_count)

# Aggregate the apple rows: total, mean, and number of non-null entries.
for aggregate in (apple_count.sum, apple_count.mean, apple_count.count):
    print(aggregate())

count    68
dtype: int64
    count
0       4
2       1
3       1
5       1
7       4
10      5
11      5
14      1
20      4
count    26
dtype: int64
count    2.888889
dtype: float64
count    9
dtype: int64


In [25]:
# Show the first five rows; as the cell's last expression it is rendered by the notebook.
fruits.head(n=5)

Unnamed: 0,fruit,day,count
0,apple,today,4
1,cherry,yesterday,1
2,apple,yesterday,1
3,apple,yesterday,1
4,cherry,today,4


In [32]:
# Print the frame ordered by `count`, then ordered by `fruit` (ascending by default).
for sort_column in ('count', 'fruit'):
    print(fruits.sort_values(by=[sort_column]))

     fruit        day  count
1   cherry  yesterday      1
2    apple  yesterday      1
3    apple  yesterday      1
5    apple  yesterday      1
18  cherry  yesterday      1
23  cherry  yesterday      1
14   apple      today      1
17  banana      today      2
22  cherry  yesterday      2
21  cherry  yesterday      2
9   cherry  yesterday      2
19  cherry  yesterday      2
13  banana  yesterday      2
6   cherry      today      3
15  cherry      today      3
0    apple      today      4
20   apple      today      4
12  cherry  yesterday      4
8   cherry      today      4
7    apple      today      4
4   cherry      today      4
24  cherry  yesterday      4
11   apple  yesterday      5
10   apple      today      5
16  cherry  yesterday      5
     fruit        day  count
0    apple      today      4
2    apple  yesterday      1
3    apple  yesterday      1
5    apple  yesterday      1
20   apple      today      4
7    apple      today      4
10   apple      today      5
11   apple  ye

In [33]:
# Order the routes by destination code and preview the first rows of the result.
routes_by_dest = routes.sort_values(by=['dest'])
print(routes_by_dest.head())

      airline  src dest codeshare  stops equipment
10180      AH  IST  AAE       NaN      0       738
10134      AH  CDG  AAE       NaN      0       738
10238      AH  ORY  AAE       NaN      0       736
10220      AH  ORN  AAE       NaN      0       ATR
67471      ZI  MRS  AAE       NaN      0       319


In [9]:
# Order the rows by their index labels.
rows_in_index_order = fruits.sort_index(axis='index')
print(rows_in_index_order)

# Order the columns by their labels.
columns_in_label_order = fruits.sort_index(axis='columns')
print(columns_in_label_order)

     fruit        day  count
0    apple      today      4
1   cherry  yesterday      1
2    apple  yesterday      1
3    apple  yesterday      1
4   cherry      today      4
5    apple  yesterday      1
6   cherry      today      3
7    apple      today      4
8   cherry      today      4
9   cherry  yesterday      2
10   apple      today      5
11   apple  yesterday      5
12  cherry  yesterday      4
13  banana  yesterday      2
14   apple      today      1
15  cherry      today      3
16  cherry  yesterday      5
17  banana      today      2
18  cherry  yesterday      1
19  cherry  yesterday      2
20   apple      today      4
21  cherry  yesterday      2
22  cherry  yesterday      2
23  cherry  yesterday      1
24  cherry  yesterday      4
    count        day   fruit
0       4      today   apple
1       1  yesterday  cherry
2       1  yesterday   apple
3       1  yesterday   apple
4       4      today  cherry
5       1  yesterday   apple
6       3      today  cherry
7       4     

In [12]:
# Sort by fruit name in descending order.
fruits_descending = fruits.sort_values("fruit", ascending=False)
print(fruits_descending)

     fruit        day  count
12  cherry  yesterday      4
1   cherry  yesterday      1
23  cherry  yesterday      1
22  cherry  yesterday      2
21  cherry  yesterday      2
19  cherry  yesterday      2
18  cherry  yesterday      1
16  cherry  yesterday      5
15  cherry      today      3
24  cherry  yesterday      4
9   cherry  yesterday      2
8   cherry      today      4
6   cherry      today      3
4   cherry      today      4
13  banana  yesterday      2
17  banana      today      2
10   apple      today      5
11   apple  yesterday      5
14   apple      today      1
7    apple      today      4
5    apple  yesterday      1
20   apple      today      4
3    apple  yesterday      1
2    apple  yesterday      1
0    apple      today      4


We can also sort by multiple columns in a single operation using `sort_values()`. To do this, we pass `sort_values()` a list of column names as the first argument, together with a matching sequence of booleans (here a tuple), one per column, as the `ascending` keyword argument to specify each column's sort direction:

In [8]:
# Sort keys in priority order, each paired with its own direction
# (True = ascending, False = descending).
sort_keys = ["fruit", "count", "day"]
sort_directions = (True, False, True)
print(fruits.sort_values(by=sort_keys, ascending=sort_directions))

     fruit        day  count
10   apple      today      5
11   apple  yesterday      5
0    apple      today      4
7    apple      today      4
20   apple      today      4
14   apple      today      1
2    apple  yesterday      1
3    apple  yesterday      1
5    apple  yesterday      1
17  banana      today      2
13  banana  yesterday      2
16  cherry  yesterday      5
4   cherry      today      4
8   cherry      today      4
12  cherry  yesterday      4
24  cherry  yesterday      4
6   cherry      today      3
15  cherry      today      3
9   cherry  yesterday      2
19  cherry  yesterday      2
21  cherry  yesterday      2
22  cherry  yesterday      2
1   cherry  yesterday      1
18  cherry  yesterday      1
23  cherry  yesterday      1


We can see that this example produces a result where the records are sorted as follows:
1. by the fruit name, in ascending order
2. by fruit count, in descending order
3. by day, in ascending order