# Pandas and NumPy

## Import

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

## NumPy Arrays
np.array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0)
- object : array_like - An array, any object exposing the array interface, an object whose __array__ method returns an array, or any (nested) sequence.
- Returns out : ndarray - An array object satisfying the specified requirements.

ndarray -> N-dimensional array

### 1x3 array

In [2]:
# Build a small one-dimensional ndarray, inspect it, then modify it in place.
print('1x3 array')
a = np.array([1, 2, 3])   # create an ndarray from a plain Python list
print(a)
print(type(a))            # the object is a numpy.ndarray
print(a.shape)            # (3,): a single axis holding three elements
print(a[0], a[1], a[2])   # read individual elements by zero-based position
a[0] = 5                  # ndarrays are mutable: overwrite the first element
print(a)         

1x3 array
[1 2 3]
<type 'numpy.ndarray'>
(3,)
(1, 2, 3)
[5 2 3]


### 3x3 array

In [3]:
# Build a two-dimensional ndarray from a nested list and index into it.
print('3x3 array')
b = np.array([[1,2,3],[4,5,6],[7,8,9]])  # each inner list becomes one row
print(b)
print(b.shape)                     # (3, 3): three rows, three columns
print(b[0, 0], b[1, 1], b[2, 2]) # Print the main diagonal via [row, column] indexing

print(b[0:1])                      # a row slice keeps both dimensions: [[1 2 3]]

3x3 array
[[1 2 3]
 [4 5 6]
 [7 8 9]]
(3, 3)
(1, 5, 9)
[[1 2 3]]


### Special methods

In [4]:
# Convenience constructors that create pre-filled arrays from a shape or a size.
print('Create an 2x2 array of all zeros')
a = np.zeros((2,2))    # the shape is passed as a tuple; values are floats
print(a)              

print('Create an 1x2 array of all ones')
b = np.ones((1,2))     # one row, two columns, filled with 1.0
print(b)

print('Create a 2x2 constant array')
c = np.full((2,2), 7)  # every element is set to the given fill value
print(c)

print('Create a 2x2 identity matrix')
d = np.eye(2)          # ones on the main diagonal, zeros elsewhere
print(d)

print('Create a 1x8 matrix')
s = np.arange(8)       # the integers 0..7; note this is a 1-D array, not a 2-D 1x8 matrix
print(s)

Create an 2x2 array of all zeros
[[0. 0.]
 [0. 0.]]
Create an 1x2 array of all ones
[[1. 1.]]
Create a 2x2 constant array
[[7 7]
 [7 7]]
Create a 2x2 identity matrix
[[1. 0.]
 [0. 1.]]
Create a 1x8 matrix
[0 1 2 3 4 5 6 7]


### Random Values


In [5]:
# Fix the seed of NumPy's global generator so that every run produces the same sequence.
np.random.seed(seed=25)
# Draw eight uniform samples from the half-open interval [0, 1).
np.random.rand(8)

array([0.87012414, 0.58227693, 0.27883894, 0.18591123, 0.41110013,
       0.11737555, 0.68496874, 0.43761106])

In [6]:
# Draw 36 uniform samples as a flat array, then reshape them row by row into a 6x6 grid.
np.random.rand(36).reshape(6, 6)

array([[0.55622933, 0.36708032, 0.40236573, 0.1130407 , 0.44703085,
        0.58544512],
       [0.1619851 , 0.52071879, 0.32605113, 0.69918624, 0.36639455,
        0.83637451],
       [0.48134294, 0.5165023 , 0.38304813, 0.9975409 , 0.51424449,
        0.55905327],
       [0.03444977, 0.71993003, 0.42100355, 0.43693513, 0.28170075,
        0.90027434],
       [0.66961228, 0.45606875, 0.28980434, 0.52581896, 0.55924206,
        0.74528383],
       [0.82834625, 0.82369445, 0.07714032, 0.64486207, 0.30925759,
        0.52425372]])

## Pandas Series

Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
- data : array-like, Iterable, dict, or scalar value
- index : array-like or Index (1d) - Values must be hashable and have the same length as data.

One-dimensional ndarray with axis labels (including time series).

In [7]:
# Index labels 'row 1' .. 'row 8', shared by the Series examples below.
idx = ['row %d' % n for n in range(1, 9)]
# A Series pairs each label with a value; here the values are the integers 0..7.
series1 = Series(data=np.arange(8), index=idx)
series1

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [8]:
# Fixed seed: the eight random values below are the same on every run.
np.random.seed(seed=25)
# Eight uniform samples in [0, 1), labelled with the shared row labels.
series2 = Series(data=np.random.rand(8), index=idx)
series2

row 1    0.870124
row 2    0.582277
row 3    0.278839
row 4    0.185911
row 5    0.411100
row 6    0.117376
row 7    0.684969
row 8    0.437611
dtype: float64

## Access to rows

In [9]:
# Square brackets with an index label select the element stored under that label;
# here the value labelled 'row 7' is returned.
series1['row 7']

6

In [10]:
# Positional selection: .iloc[] takes zero-based integer positions, whatever the index labels are.
# Plain square brackets with integers (series2[[0, 1, 7]]) look the keys up as labels first and
# only fall back to positions on a non-integer index -- a fallback that pandas has deprecated --
# so .iloc is the explicit, future-proof way to pick the 1st, 2nd and 8th elements.
series2.iloc[[0, 1, 7]]

row 1    0.870124
row 2    0.582277
row 8    0.437611
dtype: float64

## DataFrame

DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame;
- index : Index or array-like - Index to use for resulting frame;
- columns : Index or array-like - Column labels to use for resulting frame;

In [11]:
# Fixed seed so the frame holds the same values on every run.
np.random.seed(25)
row_labels = ['row %d' % n for n in range(1, 7)]
column_labels = ['column %d' % n for n in range(1, 7)]
# 36 uniform samples laid out row by row as a 6x6 table with labelled rows and columns.
dataframe1 = DataFrame(
    data=np.random.rand(36).reshape((6, 6)),
    index=row_labels,
    columns=column_labels,
)
dataframe1

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 5,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


## Access to rows and columns

In [12]:
# Positional row slice: positions are zero-based, so 1:4 returns 'row 2', 'row 3' and 'row 4'
# (the row at position 0 is skipped and the stop position 4 is excluded).
dataframe1.iloc[1:4]

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541


In [13]:
# Label-based slice: with .loc both endpoints ('row 2' and 'row 4') are included; ':' keeps every column.
dataframe1.loc['row 2':'row 4', :]

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 3,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541


In [14]:
# Passing a list of row labels and a list of column labels to the .loc[] indexer retrieves
# only those rows and columns, in the order in which they are listed.
wanted_rows = ['row 2', 'row 5']
wanted_columns = ['column 5', 'column 2']
dataframe1.loc[wanted_rows, wanted_columns]

Unnamed: 0,column 5,column 2
row 2,0.402366,0.437611
row 5,0.421004,0.559053


## Filtering

In [28]:
# Compare every value with 4; the result is a boolean Series with the same index labels.
series1>4 

row 1    False
row 2    False
row 3    False
row 4    False
row 5    False
row 6     True
row 7     True
row 8     True
dtype: bool

In [30]:
# Boolean-mask indexing: keep only the elements for which the mask is True (values above 4).
above_four = series1 > 4
series1[above_four]

row 6    5
row 7    6
row 8    7
dtype: int64

In [31]:
# Aggregate after filtering: add up only the values greater than 4 (5 + 6 + 7).
selected = series1[series1 > 4]
selected.sum()

18

In [18]:
# Indexing a DataFrame with a boolean frame keeps its shape: cells that fail the test
# (values not above 0.2) become missing values instead of being dropped.
above_threshold = dataframe1 > 0.2
dataframe2 = dataframe1[above_threshold]
dataframe2

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 1,0.870124,0.582277,0.278839,,0.4111,
row 2,0.684969,0.437611,0.556229,0.36708,0.402366,
row 3,0.447031,0.585445,,0.520719,0.326051,0.699186
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 5,0.514244,0.559053,,0.71993,0.421004,0.436935
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


## Counting missing values

In [19]:
# isnull() marks missing cells; the first sum counts them per column, the second totals the columns.
missing_per_column = dataframe2.isnull().sum()
missing_per_column.sum()

5

## Filtering out missing values

In [20]:
# Drop every row that contains at least one missing value. axis=0 (rows) and how='any' are the
# defaults of .dropna(); they are spelled out here to make the behaviour explicit.
dataframe2.dropna(axis=0, how='any')

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6
row 4,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 6,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [21]:
# To drop columns (rather than rows) that contain any missing value, point .dropna() at the
# column axis; axis='columns' is the readable spelling of axis=1.
dataframe2.dropna(axis='columns')

Unnamed: 0,column 1,column 2,column 5
row 1,0.870124,0.582277,0.4111
row 2,0.684969,0.437611,0.402366
row 3,0.447031,0.585445,0.326051
row 4,0.366395,0.836375,0.383048
row 5,0.514244,0.559053,0.421004
row 6,0.281701,0.900274,0.289804


## Loading data from file

    mpg  Miles/(US) gallon
    cyl  Number of cylinders
    disp Displacement (cu.in.)
    hp   Gross horsepower
    drat Rear axle ratio
    wt   Weight (1000 lbs)
    qsec 1/4 mile time
    vs   Engine (0 = V-shaped, 1 = straight)
    am   Transmission (0 = automatic, 1 = manual)
    gear Number of forward gears
    carb Number of carburetors

In [22]:
# Load the mtcars sample data; the path is relative to the notebook's working directory.
cars = pd.read_csv('../samples/mtcars.csv')
# Preview the first five rows instead of displaying the whole table.
cars.head(n=5)

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## Grouping data by column values

In [23]:
# To group a DataFrame by the values in a particular column, call the .groupby() method on the
# DataFrame and pass in the column Series you want the DataFrame to be grouped by.
cars_groups = cars.groupby(cars['cyl'])
# numeric_only=True restricts the mean to the numeric columns. The first column holds the car
# names (strings) and cannot be averaged: recent pandas raises a TypeError for it instead of
# silently dropping it, so the restriction has to be stated explicitly.
cars_groups.mean(numeric_only=True)


Unnamed: 0_level_0,mpg,disp,hp,drat,wt,qsec,vs,am,gear,carb
cyl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4,26.663636,105.136364,82.636364,4.070909,2.285727,19.137273,0.909091,0.727273,4.090909,1.545455
6,19.742857,183.314286,122.285714,3.585714,3.117143,17.977143,0.571429,0.428571,3.857143,3.428571
8,15.1,353.1,209.214286,3.229286,3.999214,16.772143,0.0,0.142857,3.285714,3.5


In [24]:
# Back to a NumPy array. .to_numpy() is the accessor the pandas documentation recommends over
# the older .values attribute. Because the frame mixes strings and numbers, the result is an
# object array with one row per car.
cars.to_numpy()

array([['Mazda RX4', 21.0, 6, 160.0, 110, 3.9, 2.62, 16.46, 0, 1, 4, 4],
       ['Mazda RX4 Wag', 21.0, 6, 160.0, 110, 3.9, 2.875, 17.02, 0, 1, 4,
        4],
       ['Datsun 710', 22.8, 4, 108.0, 93, 3.85, 2.32, 18.61, 1, 1, 4, 1],
       ['Hornet 4 Drive', 21.4, 6, 258.0, 110, 3.08, 3.215, 19.44, 1, 0,
        3, 1],
       ['Hornet Sportabout', 18.7, 8, 360.0, 175, 3.15, 3.44, 17.02, 0,
        0, 3, 2],
       ['Valiant', 18.1, 6, 225.0, 105, 2.76, 3.46, 20.22, 1, 0, 3, 1],
       ['Duster 360', 14.3, 8, 360.0, 245, 3.21, 3.57, 15.84, 0, 0, 3, 4],
       ['Merc 240D', 24.4, 4, 146.7, 62, 3.69, 3.19, 20.0, 1, 0, 4, 2],
       ['Merc 230', 22.8, 4, 140.8, 95, 3.92, 3.15, 22.9, 1, 0, 4, 2],
       ['Merc 280', 19.2, 6, 167.6, 123, 3.92, 3.44, 18.3, 1, 0, 4, 4],
       ['Merc 280C', 17.8, 6, 167.6, 123, 3.92, 3.44, 18.9, 1, 0, 4, 4],
       ['Merc 450SE', 16.4, 8, 275.8, 180, 3.07, 4.07, 17.4, 0, 0, 3, 3],
       ['Merc 450SL', 17.3, 8, 275.8, 180, 3.07, 3.73, 17.6, 0, 0, 3, 3],
      

In [25]:
# All rows, first column only. The 0:1 slice (rather than a bare 0) keeps the result
# two-dimensional, so the array has one single-element row per car. .to_numpy() is the
# accessor the pandas documentation recommends over the older .values attribute.
cars.iloc[:, 0:1].to_numpy()

array([['Mazda RX4'],
       ['Mazda RX4 Wag'],
       ['Datsun 710'],
       ['Hornet 4 Drive'],
       ['Hornet Sportabout'],
       ['Valiant'],
       ['Duster 360'],
       ['Merc 240D'],
       ['Merc 230'],
       ['Merc 280'],
       ['Merc 280C'],
       ['Merc 450SE'],
       ['Merc 450SL'],
       ['Merc 450SLC'],
       ['Cadillac Fleetwood'],
       ['Lincoln Continental'],
       ['Chrysler Imperial'],
       ['Fiat 128'],
       ['Honda Civic'],
       ['Toyota Corolla'],
       ['Toyota Corona'],
       ['Dodge Challenger'],
       ['AMC Javelin'],
       ['Camaro Z28'],
       ['Pontiac Firebird'],
       ['Fiat X1-9'],
       ['Porsche 914-2'],
       ['Lotus Europa'],
       ['Ford Pantera L'],
       ['Ferrari Dino'],
       ['Maserati Bora'],
       ['Volvo 142E']], dtype=object)