In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
s = pd.Series([1, 3, 4, np.nan, 6, 8])

In [3]:
s

0     1
1     3
2     4
3   NaN
4     6
5     8
dtype: float64

In [4]:
dates = pd.date_range('20130101', periods = 6)

In [5]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D', tz=None)

In [12]:
df = pd.DataFrame(np.random.randn(6, 4), index = dates, columns = list('ABCD'))

In [13]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.831377,-0.011215,1.248415,1.543559
2013-01-02,-0.93393,-0.195441,0.597386,0.248281
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355
2013-01-04,0.279082,-1.758154,0.994242,-0.391214
2013-01-05,0.400923,-0.254727,-0.523866,0.177194
2013-01-06,-0.308233,0.353648,2.344497,0.674551


In [15]:
df.columns.values

array(['A', 'B', 'C', 'D'], dtype=object)

In [16]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.636808,-0.370165,0.730446,0.168836
std,0.912565,0.72434,1.053661,0.943255
min,-1.831377,-1.758154,-0.523866,-1.239355
25%,-1.303969,-0.330006,-0.059152,-0.249112
50%,-0.621081,-0.225084,0.795814,0.212738
75%,0.132253,-0.057272,1.184872,0.567984
max,0.400923,0.353648,2.344497,1.543559


In [18]:
# sorting by an axis
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2013-01-01,1.543559,1.248415,-0.011215,-1.831377
2013-01-02,0.248281,0.597386,-0.195441,-0.93393
2013-01-03,-1.239355,-0.277997,-0.3551,-1.427315
2013-01-04,-0.391214,0.994242,-1.758154,0.279082
2013-01-05,0.177194,-0.523866,-0.254727,0.400923
2013-01-06,0.674551,2.344497,0.353648,-0.308233


In [19]:
df.sort(columns = 'B', ascending = False)

Unnamed: 0,A,B,C,D
2013-01-06,-0.308233,0.353648,2.344497,0.674551
2013-01-01,-1.831377,-0.011215,1.248415,1.543559
2013-01-02,-0.93393,-0.195441,0.597386,0.248281
2013-01-05,0.400923,-0.254727,-0.523866,0.177194
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355
2013-01-04,0.279082,-1.758154,0.994242,-0.391214


#### Selection

In [37]:
# selection by label
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.831377,-0.011215
2013-01-02,-0.93393,-0.195441
2013-01-03,-1.427315,-0.3551
2013-01-04,0.279082,-1.758154
2013-01-05,0.400923,-0.254727
2013-01-06,-0.308233,0.353648


In [38]:
df.loc?

In [39]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.93393,-0.195441
2013-01-03,-1.427315,-0.3551
2013-01-04,0.279082,-1.758154


In [42]:
# selection by position
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.93393,-0.195441,0.597386,0.248281
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355


In [43]:
df.iloc?

In [44]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.279082,-1.758154
2013-01-05,0.400923,-0.254727


In [45]:
df.iloc[[1, 2, 4], [0,2]]

Unnamed: 0,A,C
2013-01-02,-0.93393,0.597386
2013-01-03,-1.427315,-0.277997
2013-01-05,0.400923,-0.523866


In [46]:
# boolean indexing
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-04,0.279082,-1.758154,0.994242,-0.391214
2013-01-05,0.400923,-0.254727,-0.523866,0.177194


In [47]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,,1.248415,1.543559
2013-01-02,,,0.597386,0.248281
2013-01-03,,,,
2013-01-04,0.279082,,0.994242,
2013-01-05,0.400923,,,0.177194
2013-01-06,,0.353648,2.344497,0.674551


In [48]:
df.isin?

In [49]:
# setting new columns
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130102',periods=6))

In [50]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.831377,-0.011215,1.248415,1.543559,
2013-01-02,-0.93393,-0.195441,0.597386,0.248281,1.0
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355,2.0
2013-01-04,0.279082,-1.758154,0.994242,-0.391214,3.0
2013-01-05,0.400923,-0.254727,-0.523866,0.177194,4.0
2013-01-06,-0.308233,0.353648,2.344497,0.674551,5.0


In [54]:
df.iloc[0, 4] = 0

In [55]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.831377,-0.011215,1.248415,1.543559,0
2013-01-02,-0.93393,-0.195441,0.597386,0.248281,1
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355,2
2013-01-04,0.279082,-1.758154,0.994242,-0.391214,3
2013-01-05,0.400923,-0.254727,-0.523866,0.177194,4
2013-01-06,-0.308233,0.353648,2.344497,0.674551,5


In [56]:
# reindex to change index on specific axis
df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])

In [60]:
df1.iloc[0:2, 5] = 1

In [61]:
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-1.831377,-0.011215,1.248415,1.543559,0,1.0
2013-01-02,-0.93393,-0.195441,0.597386,0.248281,1,1.0
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355,2,
2013-01-04,0.279082,-1.758154,0.994242,-0.391214,3,


In [64]:
# drop missing data
df1.dropna(how = 'any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-1.831377,-0.011215,1.248415,1.543559,0,1
2013-01-02,-0.93393,-0.195441,0.597386,0.248281,1,1


In [66]:
# filling missing data
df1.fillna(value = 5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-1.831377,-0.011215,1.248415,1.543559,0,1
2013-01-02,-0.93393,-0.195441,0.597386,0.248281,1,1
2013-01-03,-1.427315,-0.3551,-0.277997,-1.239355,2,5
2013-01-04,0.279082,-1.758154,0.994242,-0.391214,3,5


In [67]:
# to get a mask
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


In [69]:
df.mean(0)

A   -0.636808
B   -0.370165
C    0.730446
D    0.168836
F    2.500000
dtype: float64

In [70]:
df.mean(1)

2013-01-01    0.189876
2013-01-02    0.143259
2013-01-03   -0.259953
2013-01-04    0.424791
2013-01-05    0.759905
2013-01-06    1.612893
Freq: D, dtype: float64

In [73]:
# apply function to each column
df.apply?

In [76]:
df.apply(np.cumsum, axis = 0)

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.831377,-0.011215,1.248415,1.543559,0
2013-01-02,-2.765307,-0.206657,1.845801,1.79184,1
2013-01-03,-4.192622,-0.561756,1.567804,0.552484,3
2013-01-04,-3.913539,-2.31991,2.562046,0.16127,6
2013-01-05,-3.512617,-2.574637,2.038179,0.338465,10
2013-01-06,-3.820849,-2.220988,4.382677,1.013016,15


In [77]:
# database style joining
pd.merge?

In [78]:
# split data by some criteria and applying a function to each group
df.groupby?

In [79]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

In [80]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-2.115485,0.524686
1,bar,one,-0.145535,0.546976
2,foo,two,0.218792,-0.0876
3,bar,three,-1.130691,-1.530134
4,foo,two,0.905228,-0.425979
5,bar,two,-0.506526,0.077986
6,foo,one,1.390257,2.133716
7,foo,three,1.538244,0.49545


In [81]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.782751,-0.905173
foo,1.937036,2.640273


In [82]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.145535,0.546976
bar,three,-1.130691,-1.530134
bar,two,-0.506526,0.077986
foo,one,-0.725228,2.658401
foo,three,1.538244,0.49545
foo,two,1.12402,-0.513578


#### Categoricals

In [83]:
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

In [86]:
# convert raw grades to categorical data
df['grade'] = df['raw_grade'].astype('category')

In [87]:
df["grade"].cat.categories = ["very good", "good", "very bad"]

In [111]:
df['grade'] = 'NA'

In [112]:
df

Unnamed: 0,id,raw_grade,grade
0,1,a,
1,2,b,
2,3,b,
3,4,a,
4,5,a,
5,6,e,


In [113]:
df['grade'][df.raw_grade == 'a'] = 'very good'
df['grade'][df.raw_grade == 'b'] = 'good'
df['grade'][df.raw_grade == 'e'] = 'very bad'

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [117]:
df.grade = df.grade.astype('category')

In [118]:
# add order to categorical variable
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

In [119]:
df.grade

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [121]:
# sort by order in category
df.sort('grade')

Unnamed: 0,id,raw_grade,grade
5,6,e,very bad
1,2,b,good
2,3,b,good
0,1,a,very good
3,4,a,very good
4,5,a,very good


In [123]:
df.groupby('grade').size()

grade
very bad      1
bad         NaN
medium      NaN
good          2
very good     3
dtype: float64