<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Object-Creation" data-toc-modified-id="Object-Creation-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Object Creation</a></span></li><li><span><a href="#Veiwing-Data" data-toc-modified-id="Veiwing-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Viewing Data</a></span></li><li><span><a href="#Selection" data-toc-modified-id="Selection-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Selection</a></span></li><li><span><a href="#Missing-Data" data-toc-modified-id="Missing-Data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Missing Data</a></span></li></ul></div>

# pandas
pandas is an open source, BSD-licensed library providing high-performance, easy-to-use **data structures and data analysis tools** for the Python programming language.

For details, refer to the [pandas cheat sheet](http://pandas.pydata.org/Pandas_Cheat_Sheet.pdf) and the [pandas tutorial](https://pandas.pydata.org/pandas-docs/stable/getting_started/tutorials.html).

In [1]:
import numpy as np
import pandas as pd

## Object Creation

In [9]:
# Build a Series from a plain list of values. No index is passed, so pandas
# labels the entries with default integers; the NaN entry makes the result float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [12]:
# Build a DataFrame from a 6x4 NumPy array of random normal draws, using six
# consecutive daily dates as the row index and the letters A-D as column labels.
dates = pd.date_range(start='20130101', periods=6)
dates  # mid-cell expression: only the cell's final expression (df) is echoed

df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.444036,0.004337,-2.584044,0.051296
2013-01-02,-0.54477,-0.149141,-0.991055,-1.342031
2013-01-03,1.364917,-0.035276,1.513094,1.694041
2013-01-04,-0.999942,0.421217,-0.704924,1.157883
2013-01-05,-1.370461,2.342387,-1.316925,-0.437568
2013-01-06,0.204145,0.121449,0.716255,-0.951278


In [15]:
# A DataFrame can also be built from a dict that maps each column name to its
# values. Scalar entries (A, B, F) are repeated for every row; the four-element
# entries (C, D, E) fix the number of rows.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)

# Print the frame as text, then echo the per-column dtypes as the cell result.
print(df2)
df2.dtypes

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

<a id="Veiwing-Data"></a>

## Viewing Data

In [18]:
# Preview the first rows of the frame (five when no count is given).
df.head()


Unnamed: 0,A,B,C,D
2013-01-01,-1.444036,0.004337,-2.584044,0.051296
2013-01-02,-0.54477,-0.149141,-0.991055,-1.342031
2013-01-03,1.364917,-0.035276,1.513094,1.694041
2013-01-04,-0.999942,0.421217,-0.704924,1.157883
2013-01-05,-1.370461,2.342387,-1.316925,-0.437568


In [19]:
# Show the last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.999942,0.421217,-0.704924,1.157883
2013-01-05,-1.370461,2.342387,-1.316925,-0.437568
2013-01-06,0.204145,0.121449,0.716255,-0.951278


In [20]:
# The row labels: the DatetimeIndex that was passed in as `dates`.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Export the values as a plain NumPy array; the index and column labels are not part of the result.
df.to_numpy()

array([[-1.44403566,  0.00433673, -2.58404448,  0.05129626],
       [-0.54477013, -0.1491408 , -0.99105485, -1.34203082],
       [ 1.36491709, -0.03527627,  1.51309416,  1.69404076],
       [-0.99994231,  0.42121657, -0.70492418,  1.15788298],
       [-1.37046126,  2.342387  , -1.31692542, -0.43756772],
       [ 0.20414505,  0.12144924,  0.71625519, -0.95127831]])

In [22]:
# df2's columns have different dtypes, so the combined array comes back with dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [24]:
# describe() shows a quick statistical summary of each column
# (count, mean, std, min, quartiles and max):
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.465025,0.450829,-0.561267,0.028724
std,1.083259,0.946887,1.469901,1.192107
min,-1.444036,-0.149141,-2.584044,-1.342031
25%,-1.277832,-0.025373,-1.235458,-0.822851
50%,-0.772356,0.062893,-0.84799,-0.193136
75%,0.016916,0.346275,0.36096,0.881236
max,1.364917,2.342387,1.513094,1.694041


In [25]:
# Transpose: the columns A-D become rows and the dates become the column labels.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-1.444036,-0.54477,1.364917,-0.999942,-1.370461,0.204145
B,0.004337,-0.149141,-0.035276,0.421217,2.342387,0.121449
C,-2.584044,-0.991055,1.513094,-0.704924,-1.316925,0.716255
D,0.051296,-1.342031,1.694041,1.157883,-0.437568,-0.951278


In [28]:
# Sort along axis=1 (the column labels) in descending order, giving D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.051296,-2.584044,0.004337,-1.444036
2013-01-02,-1.342031,-0.991055,-0.149141,-0.54477
2013-01-03,1.694041,1.513094,-0.035276,1.364917
2013-01-04,1.157883,-0.704924,0.421217,-0.999942
2013-01-05,-0.437568,-1.316925,2.342387,-1.370461
2013-01-06,-0.951278,0.716255,0.121449,0.204145


In [29]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-02,-0.54477,-0.149141,-0.991055,-1.342031
2013-01-03,1.364917,-0.035276,1.513094,1.694041
2013-01-01,-1.444036,0.004337,-2.584044,0.051296
2013-01-06,0.204145,0.121449,0.716255,-0.951278
2013-01-04,-0.999942,0.421217,-0.704924,1.157883
2013-01-05,-1.370461,2.342387,-1.316925,-0.437568


## Selection

In [31]:
# Selecting a single column by name with [] yields a Series.
df['A']

2013-01-01   -1.444036
2013-01-02   -0.544770
2013-01-03    1.364917
2013-01-04   -0.999942
2013-01-05   -1.370461
2013-01-06    0.204145
Freq: D, Name: A, dtype: float64

In [33]:
# An integer slice inside [] selects rows by position; the end position is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.444036,0.004337,-2.584044,0.051296
2013-01-02,-0.54477,-0.149141,-0.991055,-1.342031
2013-01-03,1.364917,-0.035276,1.513094,1.694041


In [34]:
# A slice of index labels inside [] also selects rows; here both endpoints are included.
df['20130102': '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.54477,-0.149141,-0.991055,-1.342031
2013-01-03,1.364917,-0.035276,1.513094,1.694041
2013-01-04,-0.999942,0.421217,-0.704924,1.157883


In [35]:
# .loc selects by label: the row for the first date, returned as a Series over the columns.
df.loc[dates[0]]

A   -1.444036
B    0.004337
C   -2.584044
D    0.051296
Name: 2013-01-01 00:00:00, dtype: float64

In [36]:
# All rows (:), restricted to columns A and B by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.444036,0.004337
2013-01-02,-0.54477,-0.149141
2013-01-03,1.364917,-0.035276
2013-01-04,-0.999942,0.421217
2013-01-05,-1.370461,2.342387
2013-01-06,0.204145,0.121449


In [37]:
# A label slice on the rows (both endpoints included) combined with a list of column labels.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.54477,-0.149141
2013-01-03,1.364917,-0.035276
2013-01-04,-0.999942,0.421217


In [38]:
# A single row label with a list of columns reduces the result to a Series.
df.loc['20130102', ['A', 'B']]

A   -0.544770
B   -0.149141
Name: 2013-01-02 00:00:00, dtype: float64

In [39]:
# One row label plus one column label returns a single scalar value.
df.loc[dates[0], 'A']

-1.4440356648732384

In [40]:
# .at looks up one (row label, column label) cell and gives the same scalar as the .loc lookup.
df.at[dates[0], 'A']

-1.4440356648732384

In [42]:
# .iloc selects by integer position: position 3 is the fourth row (2013-01-04).
df.iloc[3]

A   -0.999942
B    0.421217
C   -0.704924
D    1.157883
Name: 2013-01-04 00:00:00, dtype: float64

In [41]:
# Positional slices on both axes: rows 3-4 and columns 0-1 (end positions are excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.999942,0.421217
2013-01-05,-1.370461,2.342387


In [43]:
# Lists of integer positions pick out specific rows (1, 2, 4) and columns (0, 2).
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,-0.54477,-0.991055
2013-01-03,1.364917,1.513094
2013-01-05,-1.370461,-1.316925


In [44]:
# All rows, with the columns at positions 1 and 2 (B and C).
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.004337,-2.584044
2013-01-02,-0.149141,-0.991055
2013-01-03,-0.035276,1.513094
2013-01-04,0.421217,-0.704924
2013-01-05,2.342387,-1.316925
2013-01-06,0.121449,0.716255


In [45]:
# A pair of integer positions (row 1, column 1) returns a single scalar value.
df.iloc[1,1]

-0.14914079567314928

In [46]:
# Boolean mask over the whole frame: the shape is kept, and entries that are not > 0 come back as missing.
df[df >0 ]

Unnamed: 0,A,B,C,D
2013-01-01,,0.004337,,0.051296
2013-01-02,,,,
2013-01-03,1.364917,,1.513094,1.694041
2013-01-04,,0.421217,,1.157883
2013-01-05,,2.342387,,
2013-01-06,0.204145,0.121449,0.716255,


In [47]:
# Derive df2 from df with an extra label column E; df itself is not modified.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-1.444036,0.004337,-2.584044,0.051296,one
2013-01-02,-0.54477,-0.149141,-0.991055,-1.342031,one
2013-01-03,1.364917,-0.035276,1.513094,1.694041,two
2013-01-04,-0.999942,0.421217,-0.704924,1.157883,three
2013-01-05,-1.370461,2.342387,-1.316925,-0.437568,four
2013-01-06,0.204145,0.121449,0.716255,-0.951278,three


In [50]:
# Keep only the rows whose E value is one of the listed labels.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.364917,-0.035276,1.513094,1.694041,two
2013-01-05,-1.370461,2.342387,-1.316925,-0.437568,four


In [54]:
# A Series holding the integers 1-6, indexed by six consecutive daily dates
# starting on 2013-01-02.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)
s1


In [53]:
# Adding a column from a Series aligns on the index: s1 starts at 2013-01-02, so
# 2013-01-01 gets no value, and s1's last entry (2013-01-07) has no matching row in df.
df['F'] = s1

In [57]:
# Set single cells in place: .at addresses the cell by labels ...
df.at[dates[0], 'A'] = 0
# ... and .iat by integer positions (row 0, column 1, i.e. column B of the first row).
df.iat[0,1] = 0
# Overwrite every row of column D with 5, supplied as a NumPy array of matching length.
# NOTE(review): the output below shows D as integers; whether .loc[:, col] keeps the
# column's previous float dtype or adopts the array's dtype may depend on the pandas
# version — confirm.
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-2.584044,5,
2013-01-02,-0.54477,-0.149141,-0.991055,5,1.0
2013-01-03,1.364917,-0.035276,1.513094,5,2.0
2013-01-04,-0.999942,0.421217,-0.704924,5,3.0
2013-01-05,-1.370461,2.342387,-1.316925,5,4.0
2013-01-06,0.204145,0.121449,0.716255,5,5.0


In [59]:
# Work on a copy so df is left untouched.
df2 = df.copy()
# Wherever a value is positive, replace it with its negation; afterwards no entry is positive.
df2[df2>0] = -df2

df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-2.584044,-5,
2013-01-02,-0.54477,-0.149141,-0.991055,-5,-1.0
2013-01-03,-1.364917,-0.035276,-1.513094,-5,-2.0
2013-01-04,-0.999942,-0.421217,-0.704924,-5,-3.0
2013-01-05,-1.370461,-2.342387,-1.316925,-5,-4.0
2013-01-06,-0.204145,-0.121449,-0.716255,-5,-5.0


## Missing Data

In [34]:
# Build a two-row frame whose single column 'A' stores a whole NumPy array in
# each cell, print the first row's backing array, and write the frame to disk.
a = np.random.randn(3,4)  # 3x4 array of random normal draws
b = np.arange(3)          # the integers 0, 1, 2

data = [a,b]

# Pass the arrays as the values of one named column. Handing the ragged list
# straight to DataFrame(data, columns=['A']) leaves pandas/NumPy to guess how
# deeply to unpack arrays of different shapes, which is version-dependent and can
# fail outright; a dict entry states plainly that each list element is one cell.
df = pd.DataFrame({'A': data}, index=[0,1])
print(df.iloc[0].array)

# Side effect: writes example.csv into the current working directory.
df.to_csv('example.csv')

<PandasArray>
[array([[ 0.05096022, -1.12149973, -0.00764756, -1.21029032],
       [-0.40859699, -0.62506821, -0.63566376, -0.22387273],
       [-1.03127088,  1.01415313, -1.33815226, -0.34013302]])]
Length: 1, dtype: object
