# Pandas

In [1]:
import pandas as pd

Two key differences between a Pandas Series and a NumPy ndarray are that:
* you can assign an index label to each element in a Pandas Series
* a Pandas Series can hold data of different data types
## Creating Pandas Series

In [2]:
# Build the grocery list as a labelled Pandas Series holding both ints and strings
item_labels = ['eggs', 'apples', 'milk', 'bread']
item_values = [30, 6, 'Yes', 'No']
groceries = pd.Series(data=item_values, index=item_labels)

# Show the Series itself
print(groceries)

# Report its basic attributes: shape, dimensionality, size, values and index
print('Groceries has shape:', groceries.shape)
print('Groceries has dimension:', groceries.ndim)
print('Groceries has a total of', groceries.size, 'elements')
print('The data in Groceries is:', groceries.values)
print('The index of Groceries is:', groceries.index)

# Check whether 'bananas' is one of the index labels; the result is the cell's output
'bananas' in groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object
Groceries has shape: (4,)
Groceries has dimension: 1
Groceries has a total of 4 elements
The data in Groceries is: [30 6 'Yes' 'No']
The index of Groceries is: Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')


False

## Accessing and Deleting Elements in Pandas Series

In [3]:
# Select several entries at once, first by label and then by position
milk_and_bread = groceries[['milk', 'bread']]        # list of labels with plain []
eggs_and_apples = groceries.loc[['eggs', 'apples']]  # list of labels with .loc
last_two_by_position = groceries.iloc[[2, 3]]        # list of integer positions with .iloc

print('Do we need milk and bread:\n', milk_and_bread)
print('How many eggs and apples do we need to buy:\n', eggs_and_apples)
print('Do we need milk and bread:\n', last_two_by_position)

Do we need milk and bread:
 milk     Yes
bread     No
dtype: object
How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object
Do we need milk and bread:
 milk     Yes
bread     No
dtype: object


In [4]:
# Overwrite the value stored under the 'eggs' label
groceries['eggs'] = 2
print('Modified Grocery List:\n', groceries)

# Remove 'apples' from the Series itself (inplace=True mutates `groceries`,
# and the following arithmetic cell works on the list without apples).
# errors='ignore' keeps this cell re-runnable: on a second run 'apples' is
# already gone, and without it drop() would raise a KeyError.
groceries.drop('apples', inplace = True, errors = 'ignore')
print('Grocery List after removing apples in place:\n', groceries)

Modified Grocery List:
 eggs        2
apples      6
milk      Yes
bread      No
dtype: object
Grocery List after removing apples in place:
 eggs       2
milk     Yes
bread     No
dtype: object


## Arithmetic Operations on Pandas Series

In [5]:
# Multiply every element by 2: numeric entries are doubled, while string
# entries are repeated (e.g. 'Yes' -> 'YesYes'), because * on a str repeats it.
doubled_groceries = groceries * 2
doubled_groceries

eggs          4
milk     YesYes
bread      NoNo
dtype: object

## Creating Pandas DataFrames

In [6]:
# Each shopper's cart is a Series of values keyed by item name
bob_cart = pd.Series(data=[245, 25, 55], index=['bike', 'pants', 'watch'])
alice_cart = pd.Series(data=[40, 110, 500, 45], index=['book', 'glasses', 'bike', 'pants'])

# Collect the carts in a dictionary keyed by shopper
items = {'Bob': bob_cart, 'Alice': alice_cart}

# A dictionary of Series becomes a DataFrame: dictionary keys turn into columns
# and the Series labels into rows; an item missing from a cart shows up as NaN.
shopping_carts = pd.DataFrame(items)

# Display the DataFrame as the cell's output
shopping_carts

Unnamed: 0,Bob,Alice
bike,245.0,500.0
book,,40.0
glasses,,110.0
pants,25.0,45.0
watch,55.0,


In [7]:
# Same construction, but the Series carry no explicit labels, so the rows
# are matched on the default integer positions 0, 1, 2, ...
bob_values = pd.Series([245, 25, 55])
alice_values = pd.Series([40, 110, 500, 45])
data = {'Bob': bob_values, 'Alice': alice_values}

# Display the resulting DataFrame as the cell's output
pd.DataFrame(data)

Unnamed: 0,Bob,Alice
0,245.0,40
1,25.0,110
2,55.0,500
3,,45


In [8]:
# Gather the basic attributes of shopping_carts, then report them
carts_shape = shopping_carts.shape
carts_ndim = shopping_carts.ndim
carts_size = shopping_carts.size
print('shopping_carts has shape:', carts_shape)
print('shopping_carts has dimension:', carts_ndim)
print('shopping_carts has a total of:', carts_size, 'elements')
print('The data in shopping_carts is:\n', shopping_carts.values)
print('The row index in shopping_carts is:', shopping_carts.index)
print('The column index in shopping_carts is:', shopping_carts.columns)

# Build a second DataFrame from the same dictionary, keeping only Alice's
# column and only the 'glasses' and 'bike' rows (in that order)
selected_rows = ['glasses', 'bike']
selected_columns = ['Alice']
alice_sel_shopping_cart = pd.DataFrame(items, index=selected_rows, columns=selected_columns)

# Display the selection as the cell's output
alice_sel_shopping_cart

shopping_carts has shape: (5, 2)
shopping_carts has dimension: 2
shopping_carts has a total of: 10 elements
The data in shopping_carts is:
 [[245. 500.]
 [ nan  40.]
 [ nan 110.]
 [ 25.  45.]
 [ 55.  nan]]
The row index in shopping_carts is: Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')
The column index in shopping_carts is: Index(['Bob', 'Alice'], dtype='object')


Unnamed: 0,Alice
glasses,110
bike,500


In [9]:
# A dictionary of equal-length lists: each key becomes a column
data = {
    'Integers': [1, 2, 3],
    'Floats': [4.5, 8.2, 9.6],
}
# Row labels are supplied explicitly through `index`
labelled_frame = pd.DataFrame(data, index=['label 1', 'label 2', 'label 3'])
print(labelled_frame)

# A list of dictionaries: each dictionary becomes one row, its keys become columns
store_1 = {'bikes': 20, 'pants': 30, 'watches': 35}
store_2 = {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants': 5}
items2 = [store_1, store_2]

# 'glasses' is absent from the first dictionary, so that cell comes out as NaN
store_items = pd.DataFrame(items2, index=['store 1', 'store 2'])
print(store_items)

         Integers  Floats
label 1         1     4.5
label 2         2     8.2
label 3         3     9.6
         bikes  pants  watches  glasses
store 1     20     30       35      NaN
store 2     15      5       10     50.0


## Accessing Elements in Pandas DataFrames

In [10]:
# Select whole columns by passing a list of column labels
print('How many bikes and pants are in each store:\n', store_items[['bikes', 'pants']])
# Select whole rows by label with .loc (a list of labels keeps the result a DataFrame)
print('What items are in Store 1:\n', store_items.loc[['store 1']])
# Select a single cell with .loc[row, column] instead of the chained lookup
# store_items['bikes']['store 2'], which indexes twice (column first, then row)
print('How many bikes are in Store 2:', store_items.loc['store 2', 'bikes'])

# Add a new column from a list: one value per row, in row order
store_items['shirts'] = [15,2]
# Add a column computed element-wise from two existing columns
store_items['suits'] = store_items['pants'] + store_items['shirts']

# Display the extended DataFrame as the cell's output
store_items

How many bikes and pants are in each store:
          bikes  pants
store 1     20     30
store 2     15      5
What items are in Store 1:
          bikes  pants  watches  glasses
store 1     20     30       35      NaN
How many bikes are in Store 2: 15


Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20,30,35,,15,45
store 2,15,5,10,50.0,2,7


In [11]:
# Count the missing values: isnull() flags every cell, the first sum() totals
# the flags per column, and the second sum() adds those column totals together
nan_count = store_items.isnull().sum().sum()
print('Number of NaN values in our DataFrame:', nan_count)

# Forward-fill along each row (axis = 1): a NaN takes the value of the column
# to its left. DataFrame.ffill is used instead of fillna(method = 'ffill')
# because the `method` argument of fillna is deprecated in recent pandas.
# The filled frame is only displayed; store_items itself is left unchanged.
store_items.ffill(axis = 1)

Number of NaN values in our DataFrame: 1


Unnamed: 0,bikes,pants,watches,glasses,shirts,suits
store 1,20.0,30.0,35.0,35.0,15.0,45.0
store 2,15.0,5.0,10.0,50.0,2.0,7.0


In [12]:
# NOTE(review): the lines below are commented-out scratch snippets; nothing in this cell executes.
# Read a CSV file into a DataFrame (the result is not assigned to any name here):
# pd.read_csv('./GOOG.csv')
# Summary statistics; `Google_stock` is not defined in the visible cells — TODO confirm where it comes from:
# Google_stock.describe()
# Grouped sum of one column; presumably `data` is meant to be a DataFrame with 'Year',
# 'Department' and 'Salary' columns rather than the dict built in an earlier cell — TODO confirm:
# data.groupby(['Year', 'Department'])['Salary'].sum()