Pandas incorporates two additional data structures into Python, namely Pandas Series and Pandas DataFrame. The topics covered are:

- How to import Pandas
- How to create Pandas Series and DataFrames using various methods
- How to access and change elements in Series and DataFrames
- How to perform arithmetic operations on Series
- How to load data into a DataFrame
- How to deal with Not a Number (NaN) values

[Pandas Documentation](https://pandas.pydata.org/pandas-docs/stable/)

Pandas Series and DataFrames are designed for fast data analysis and manipulation, as well as being flexible and easy to use. Below are just a few features that make Pandas an excellent package for data analysis:

- Allows the use of labels for rows and columns
- Can calculate rolling statistics on time series data
- Easy handling of NaN values
- Is able to load data of different formats into DataFrames
- Can join and merge different datasets together
- It integrates with NumPy and Matplotlib

In [1]:
import pandas as pd

groceries = pd.Series(data=[20,6,'Yes','No'], index=['eggs', 'apples', 'milk', 'bread'])
print(groceries)

eggs       20
apples      6
milk      Yes
bread      No
dtype: object


In [2]:
print("\nShape:", groceries.shape)
print("Number of dimentions:", groceries.ndim)
print("Size:", groceries.size)


Shape: (4,)
Number of dimentions: 1
Size: 4


In [4]:
print(groceries.index)
print(groceries.values)

Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')
[20 6 'Yes' 'No']


In [6]:
'banana' in groceries

False

In [7]:
'bread' in groceries

True

# Accessing and Deleting Elements in Pandas Series

In [8]:
groceries['eggs']

20

In [10]:
groceries[['eggs','milk']]

eggs     20
milk    Yes
dtype: object

In [11]:
groceries.iloc[0]  # first element; .iloc makes the positional lookup explicit on a label-indexed Series

20

In [12]:
groceries.iloc[-1]  # last element; .iloc makes the positional lookup explicit on a label-indexed Series

'No'

In [14]:
groceries.iloc[[0, -1]]  # first and last elements, selected by position

eggs     20
bread    No
dtype: object

In [15]:
groceries.loc['eggs']  # explicitly using a labeled index

20

In [16]:
groceries.iloc[2]  # explicitly using a numeric index

'Yes'

In [19]:
groceries['eggs'] = 2
groceries

eggs        2
apples      6
milk      Yes
bread      No
dtype: object

In [21]:
groceries.drop('apples') # returns groceries without apples

eggs       2
milk     Yes
bread     No
dtype: object

In [22]:
groceries # apples still in groceries

eggs        2
apples      6
milk      Yes
bread      No
dtype: object

In [23]:
groceries.drop('apples', inplace=True)
groceries

eggs       2
milk     Yes
bread     No
dtype: object

In [25]:
fruits = pd.Series([10, 6, 3], ['apples','oranges','bananas'])
fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [26]:
fruits + 2

apples     12
oranges     8
bananas     5
dtype: int64

In [27]:
fruits * 2

apples     20
oranges    12
bananas     6
dtype: int64

In [28]:
import numpy as np

In [29]:
np.sqrt(fruits)

apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

In [30]:
np.exp(fruits)

apples     22026.465795
oranges      403.428793
bananas       20.085537
dtype: float64

In [31]:
np.power(fruits, 2)

apples     100
oranges     36
bananas      9
dtype: int64

In [32]:
fruits

apples     10
oranges     6
bananas     3
dtype: int64

In [34]:
fruits['bananas'] + 2

5

In [35]:
fruits.iloc[0] + 1

11

In [36]:
# double apples and oranges
fruits[['apples', 'oranges']] * 2

apples     20
oranges    12
dtype: int64

In [48]:
import pandas as pd

# Create a Pandas Series that contains the distance of some planets from the Sun.
# Use the name of the planets as the index to your Pandas Series, and the distance
# from the Sun as your data. The distance from the Sun is in units of 10^6 km

distance_from_sun = [149.6, 1433.5, 227.9, 108.2, 778.6]

planets = ['Earth','Saturn', 'Mars','Venus', 'Jupiter']

# Create a Pandas Series using the above data, with the name of the planets as
# the index and the distance from the Sun as your data.
dist_planets = pd.Series(index=planets, data=distance_from_sun)

print(dist_planets)

Earth       149.6
Saturn     1433.5
Mars        227.9
Venus       108.2
Jupiter     778.6
dtype: float64


In [49]:
# Calculate the number of minutes it takes sunlight to reach each planet. You can
# do this by dividing the distance from the Sun for each planet by the speed of light.
# Since in the data above the distance from the Sun is in units of 10^6 km, you can
# use a value for the speed of light of c = 18, since light travels 18 x 10^6 km/minute.
time_light = dist_planets / 18

print(time_light)

Earth       8.311111
Saturn     79.638889
Mars       12.661111
Venus       6.011111
Jupiter    43.255556
dtype: float64


In [50]:
close_planets = time_light[(time_light < 40)]

In [51]:
print(close_planets)

Earth     8.311111
Mars     12.661111
Venus     6.011111
dtype: float64


## Creating Pandas DataFrames

In [52]:
import pandas as pd

In [53]:
# a DataFrame is like a powerful spreadsheet

items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}

# We print the type of items to see that it is a dictionary
print(type(items))

<class 'dict'>


In [54]:
shopping_carts = pd.DataFrame(items)
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [55]:
shopping_carts.shape

(5, 2)

In [56]:
shopping_carts.ndim

2

In [61]:
shopping_carts.values

array([[ 245.,  500.],
       [  nan,   40.],
       [  nan,  110.],
       [  25.,   45.],
       [  55.,   nan]])

In [62]:
# We create a list of Python dictionaries
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, 
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]

# We create a DataFrame 
store_items = pd.DataFrame(items2)

# We display the DataFrame
store_items

Unnamed: 0,bikes,glasses,pants,watches
0,20,,30,35
1,15,50.0,5,10


## Accessing Elements in Pandas DataFrames

In [88]:
# We create a list of Python dictionaries
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, 
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]

# We create a DataFrame 
store_items = pd.DataFrame(items2, index=['store 1', 'store 2'])

# We display the DataFrame
store_items

Unnamed: 0,bikes,glasses,pants,watches
store 1,20,,30,35
store 2,15,50.0,5,10


In [89]:
# access column labels first
store_items['bikes']['store 1']

20

In [90]:
store_items['shirts'] = [15, 2]
store_items

Unnamed: 0,bikes,glasses,pants,watches,shirts
store 1,20,,30,35,15
store 2,15,50.0,5,10,2


In [91]:
store_items['suits'] = store_items['shirts'] + store_items['pants']
store_items

Unnamed: 0,bikes,glasses,pants,watches,shirts,suits
store 1,20,,30,35,15,45
store 2,15,50.0,5,10,2,7


In [92]:
new_items = [{'pants': 30}]
new_store = pd.DataFrame(new_items, index=['store 3'])
new_store

Unnamed: 0,pants
store 3,30


In [93]:
# Add the new store as a row. DataFrame.append was removed in pandas 2.0;
# pd.concat stacks the frames row-wise, and sort=True orders the union of
# column labels alphabetically (columns missing from a frame become NaN).
store_items = pd.concat([store_items, new_store], sort=True)
store_items

Unnamed: 0,bikes,glasses,pants,shirts,suits,watches
store 1,20.0,,30,15.0,45.0,35.0
store 2,15.0,50.0,5,2.0,7.0,10.0
store 3,,,30,,,


In [94]:
# delete a column with pop
store_items.pop('glasses')
store_items

Unnamed: 0,bikes,pants,shirts,suits,watches
store 1,20.0,30,15.0,45.0,35.0
store 2,15.0,5,2.0,7.0,10.0
store 3,,30,,,


In [95]:
# delete by axis
store_items = store_items.drop(['bikes'], axis=1)  # bikes column
store_items

Unnamed: 0,pants,shirts,suits,watches
store 1,30,15.0,45.0,35.0
store 2,5,2.0,7.0,10.0
store 3,30,,,


In [96]:
# delete by row label
store_items = store_items.drop(['store 1'], axis=0)
store_items

Unnamed: 0,pants,shirts,suits,watches
store 2,5,2.0,7.0,10.0
store 3,30,,,


In [98]:
store_items = store_items.rename(columns={'pants': 'hats'})

In [99]:
store_items

Unnamed: 0,hats,shirts,suits,watches
store 2,5,2.0,7.0,10.0
store 3,30,,,


In [100]:
store_items = store_items.rename(index={'store 3': 'fun'})
store_items

Unnamed: 0,hats,shirts,suits,watches
store 2,5,2.0,7.0,10.0
fun,30,,,


In [102]:
# Dealing with NaN
x = store_items.isnull()
print(x)


          hats  shirts  suits  watches
store 2  False   False  False    False
fun      False    True   True     True


In [104]:
# number of NaNs in each column: isnull() gives booleans, and sum() counts the True values
x = store_items.isnull().sum()
print(x)

hats       0
shirts     1
suits      1
watches    1
dtype: int64


In [105]:
# sum the results
x = store_items.isnull().sum().sum()
print(x)

3


In [106]:
store_items.count()

hats       2
shirts     1
suits      1
watches    1
dtype: int64

In [107]:
# return a frame eliminating rows with NaN values
store_items.dropna(axis=0) # or store_items.dropna(axis=0, inplace=True) 

Unnamed: 0,hats,shirts,suits,watches
store 2,5,2.0,7.0,10.0


In [108]:
store_items

Unnamed: 0,hats,shirts,suits,watches
store 2,5,2.0,7.0,10.0
fun,30,,,


In [109]:
# return a frame eliminating columns with NaN values
store_items.dropna(axis=1) # or store_items.dropna(axis=1, inplace=True)

Unnamed: 0,hats
store 2,5
fun,30
