In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 创建对象

## Series

In [5]:
# Build a Series from a plain list; pandas assigns a default integer index.
# The NaN entry is a missing value, and the result is stored as float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

## DataFrames

In [6]:
# Six consecutive daily timestamps starting at 2018-01-01; used as a row index below.
dates = pd.date_range(start='20180101', periods=6)
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Build a 6x4 DataFrame of random normal samples, with `dates` as the row
# index and the letters A-D as column names.
# NOTE(review): no random seed is set in the visible cells, so the values
# shown in the outputs will differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2018-01-01,1.159255,-2.352487,-1.269852,-0.108547
2018-01-02,-0.224549,1.04711,0.987356,-0.43107
2018-01-03,1.131581,0.066462,1.005311,0.360926
2018-01-04,-1.178453,0.923428,-1.617306,1.320643
2018-01-05,-0.719781,-1.032415,-1.362041,0.686076
2018-01-06,0.63928,1.437779,0.323961,0.554092


In [26]:
# A DataFrame can also be built from a dict mapping column names to values.
# - Scalar values ('A', 'B', 'F') are broadcast to every row.
# - Array-like values ('D', 'E') must have the same length as the index.
# - A Series value ('C') carries its own index; here it is the only indexed
#   value, so its labels (2..5) become the index of the whole DataFrame.
# - NOTE(review): when several Series with different indexes are passed,
#   pandas aligns them on the union of those indexes (filling NaN) rather
#   than raising; confirm against the pandas version in use.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[2, 3, 4, 5], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2

Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo
4,1.0,2013-01-02,1.0,3,test,foo
5,1.0,2013-01-02,1.0,3,train,foo


# 查看数据

In [28]:
# Data type of each column of df2.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [33]:
# Show the first rows of df (with no argument, head() shows 5 rows).
df.head()

Unnamed: 0,A,B,C,D
2018-01-01,1.159255,-2.352487,-1.269852,-0.108547
2018-01-02,-0.224549,1.04711,0.987356,-0.43107
2018-01-03,1.131581,0.066462,1.005311,0.360926
2018-01-04,-1.178453,0.923428,-1.617306,1.320643
2018-01-05,-0.719781,-1.032415,-1.362041,0.686076


In [34]:
# Show the last 2 rows of df.
df.tail(2)

Unnamed: 0,A,B,C,D
2018-01-05,-0.719781,-1.032415,-1.362041,0.686076
2018-01-06,0.63928,1.437779,0.323961,0.554092


`head` 和 `tail` 接受一个整数参数，用于指定要显示的行数，缺省值为 5。

In [35]:
# Show the row labels of df (the DatetimeIndex it was built with).
df.index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [36]:
# Show the column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [37]:
# Show the underlying data as a NumPy array (row and column labels are dropped).
# NOTE(review): pandas >= 0.24 recommends df.to_numpy() over .values; confirm
# the pandas version before switching.
df.values

array([[ 1.15925508, -2.35248676, -1.26985199, -0.10854658],
       [-0.22454925,  1.04710995,  0.98735612, -0.43106969],
       [ 1.13158129,  0.06646213,  1.0053106 ,  0.36092633],
       [-1.17845281,  0.92342763, -1.61730616,  1.32064281],
       [-0.71978126, -1.03241512, -1.36204097,  0.68607573],
       [ 0.63927993,  1.43777896,  0.32396119,  0.55409232]])

In [40]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.134555,0.014979,-0.322095,0.39702
std,0.988095,1.457361,1.228931,0.61708
min,-1.178453,-2.352487,-1.617306,-0.43107
25%,-0.595973,-0.757696,-1.338994,0.008822
50%,0.207365,0.494945,-0.472945,0.457509
75%,1.008506,1.016189,0.821507,0.65308
max,1.159255,1.437779,1.005311,1.320643


In [41]:
# Transpose df: the columns A-D become rows and the dates become column labels.
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,1.159255,-0.224549,1.131581,-1.178453,-0.719781,0.63928
B,-2.352487,1.04711,0.066462,0.923428,-1.032415,1.437779
C,-1.269852,0.987356,1.005311,-1.617306,-1.362041,0.323961
D,-0.108547,-0.43107,0.360926,1.320643,0.686076,0.554092
