# Pandas

## Library Highlights

- A fast and efficient DataFrame object for data manipulation with integrated indexing
- Tools for reading and writing data between in-memory data structures and different formats, such as CSV and text files, Microsoft Excel, and SQL databases
- Intelligent data alignment and integrated handling ofmissing data: gain automatic label-based alignment in computations and easilymanipulate messy data into an orderly form
- Intelligent label-based slicing, fancy indexing, and subsetting of large data sets
- Columns can be inserted and deleted from data structures for size mutability
- High performancemerging and joining of data sets
- Hierarchical axis indexing provides an intuitive way of working with high-dimensional data in a lower-dimensional data structure
- Time series-functionality

### Series

In [6]:
import pandas as pd
import numpy as np
s = pd.Series([1,2,3,'hi',np.NINF,6])
s

0      1
1      2
2      3
3     hi
4   -inf
5      6
dtype: object

In [8]:
d = {'Chicago': 1000, 'New York': 1300, 'Portland': 900, 'San\
Francisco': 1100, 'Austin': 450, 'Boston': None}

cities = pd.Series(d)
cities

Chicago         1000.0
New York        1300.0
Portland         900.0
SanFrancisco    1100.0
Austin           450.0
Boston             NaN
dtype: float64

In [9]:
cities[cities < 1000]

Portland    900.0
Austin      450.0
dtype: float64

In [11]:
Less_than_1000 = cities < 1000
Less_than_1000

Chicago         False
New York        False
Portland         True
SanFrancisco    False
Austin           True
Boston          False
dtype: bool

In [14]:
dates = pd.date_range('20130101',periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

### Data Frame

In [20]:
df = pd.DataFrame(np.random.randn(6,4),index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,2.379157,-0.504103,1.520361,0.592046
2013-01-02,-1.130727,-0.596568,-1.129103,0.044817
2013-01-03,-0.521896,0.301053,-0.530699,-1.329904
2013-01-04,-0.666959,1.522983,1.613341,-0.424382
2013-01-05,-0.395797,-0.612069,0.663099,1.5688
2013-01-06,0.979426,-0.28561,0.929002,0.470229


In [21]:
df2 = pd.DataFrame({'A' : 1., 
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : 'foo' })
df2

Unnamed: 0,A,B,C,D,E
0,1.0,2013-01-02,1.0,3,foo
1,1.0,2013-01-02,1.0,3,foo
2,1.0,2013-01-02,1.0,3,foo
3,1.0,2013-01-02,1.0,3,foo


In [22]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,2.379157,-0.504103,1.520361,0.592046
2013-01-02,-1.130727,-0.596568,-1.129103,0.044817
2013-01-03,-0.521896,0.301053,-0.530699,-1.329904
2013-01-04,-0.666959,1.522983,1.613341,-0.424382
2013-01-05,-0.395797,-0.612069,0.663099,1.5688


In [19]:
df.tail(2)

Unnamed: 0,A,B,C,D
2013-01-05,0.083249,-0.140062,-0.617511,1.245781
2013-01-06,-0.85272,0.339996,-1.378779,0.559623


In [25]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.275831,0.432735,-0.351942,-0.51784
std,1.02295,0.982865,0.808995,1.130236
min,-1.758445,-1.117134,-1.21422,-2.032422
25%,0.447758,-0.130982,-0.983911,-1.24447
50%,0.528298,0.731933,-0.513436,-0.593573
75%,0.852918,1.103062,0.304416,0.481096
max,0.974797,1.441524,0.693142,0.734644


In [27]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [42]:
#Sorting by an axis:
df = pd.DataFrame(np.random.randn(6,4),index=dates, columns=list('ABCD'))
df.sort_index(axis=0, ascending=True) # column or rows, axis or s
df

Unnamed: 0,A,B,C,D
2013-01-01,1.102489,-0.630044,0.564147,1.805562
2013-01-02,2.003683,1.256328,1.85165,0.501532
2013-01-03,0.734071,-1.201777,0.61672,1.148775
2013-01-04,-0.128358,-1.094854,0.308925,-0.375639
2013-01-05,0.630649,1.434472,-1.317273,-1.376068
2013-01-06,1.890022,0.744405,-0.207258,-1.143081


In [49]:
df2 = pd.DataFrame({'A' : 1., 
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : 'foo' })
df2.sort_values(by = ['B'])
df2

Unnamed: 0,A,B,C,D,E
0,1.0,2013-01-02,1.0,3,foo
1,1.0,2013-01-02,1.0,3,foo
2,1.0,2013-01-02,1.0,3,foo
3,1.0,2013-01-02,1.0,3,foo
