# Pandas

## pandas基本介绍

## Series

In [2]:
import pandas as pd
import numpy as np

# Build a Series from a plain Python list; no index is given, so pandas
# labels the entries 0..5. The np.nan entry makes the values float64.
raw_values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(data=raw_values)
print(s)

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64


## DataFrame

In [6]:
# Six consecutive daily timestamps starting at 2018-01-01, used as the row index.
dates = pd.date_range('20180101', periods=6)

# Draw the 6x4 standard-normal table from a locally seeded generator so that
# Restart & Run All reproduces the same numbers (the unseeded np.random.randn
# call produced a different table on every run). A private RandomState is used
# instead of np.random.seed so the global NumPy random state is left untouched.
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])

print(df)

                   a         b         c         d
2018-01-01 -1.162940 -0.954633 -0.771211  0.851020
2018-01-02 -0.578357  1.478543  0.734987  0.592461
2018-01-03 -1.247189 -1.009870 -0.748720  0.202590
2018-01-04  1.722796  1.887897  0.550508  0.715857
2018-01-05 -1.211695  0.955130  0.302027 -0.650235
2018-01-06 -0.240877  0.157945  0.512859  0.553275


## DataFrame的一些简单运用

In [7]:
# Select the single column 'b' by label (all rows); the result is a Series
# that keeps the DataFrame's date index.
print(df.loc[:, 'b'])

2018-01-01   -0.954633
2018-01-02    1.478543
2018-01-03   -1.009870
2018-01-04    1.887897
2018-01-05    0.955130
2018-01-06    0.157945
Freq: D, Name: b, dtype: float64


### pandas默认索引

In [8]:
# Lay the integers 0..11 out as a 3x4 grid. Neither index nor columns is
# passed, so pandas falls back to integer labels 0..n-1 on both axes.
grid = np.arange(12).reshape(3, 4)
df1 = pd.DataFrame(data=grid)
print(df1)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


### 构建每一列特定的数据

In [12]:
# Describe the frame column by column. The scalar entries ('A', 'B', 'F') are
# broadcast to the length of the array-like entries ('C', 'E'), which gives
# four rows, and every column keeps its own dtype.
column_data = {
    'A': 1.,
    'B': pd.Timestamp('20180102'),
    'C': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(['test', 'train'] * 2),
    'F': 'foo',
}
df2 = pd.DataFrame(column_data)
print(df2)

     A          B  C      E    F
0  1.0 2018-01-02  3   test  foo
1  1.0 2018-01-02  3  train  foo
2  1.0 2018-01-02  3   test  foo
3  1.0 2018-01-02  3  train  foo


### 查看DataFrame各列的数据类型

In [13]:
# .dtypes is a Series mapping each column label to that column's dtype.
column_types = df2.dtypes
print(column_types)

A           float64
B    datetime64[ns]
C             int32
E          category
F            object
dtype: object


### 查看行索引

In [14]:
# The row labels; df2 was built without an explicit index, so this is the
# default integer range.
row_labels = df2.index
print(row_labels)

RangeIndex(start=0, stop=4, step=1)


### 查看列名

In [15]:
# The column labels, in the order the columns were defined.
column_labels = df2.columns
print(column_labels)

Index(['A', 'B', 'C', 'E', 'F'], dtype='object')


### 查看所有值

In [16]:
# .values returns the cell contents as one 2-D NumPy array, without the
# row or column labels.
cell_values = df2.values
print(cell_values)

[[1.0 Timestamp('2018-01-02 00:00:00') 3 'test' 'foo']
 [1.0 Timestamp('2018-01-02 00:00:00') 3 'train' 'foo']
 [1.0 Timestamp('2018-01-02 00:00:00') 3 'test' 'foo']
 [1.0 Timestamp('2018-01-02 00:00:00') 3 'train' 'foo']]


### 查看数据的描述

In [17]:
# Per-column summary statistics of the numeric frame: count, mean, std,
# min, the 25%/50%/75% quantiles and max.
summary = df.describe()
print(summary)

              a         b         c         d
count  6.000000  6.000000  6.000000  6.000000
mean  -0.453043  0.419168  0.096742  0.377495
std    1.140628  1.229993  0.677747  0.548205
min   -1.247189 -1.009870 -0.771211 -0.650235
25%   -1.199506 -0.676489 -0.486033  0.290261
50%   -0.870648  0.556537  0.407443  0.572868
75%   -0.325247  1.347689  0.541096  0.685008
max    1.722796  1.887897  0.734987  0.851020


### 对数据的列索引（axis=1）按降序排序并输出

In [18]:
# Sort along the column axis, i.e. reorder the columns by their labels in
# descending order (F, E, C, B, A). A new frame is returned; df2 itself is
# not modified.
columns_descending = df2.sort_index(axis='columns', ascending=False)
print(columns_descending)

     F      E  C          B    A
0  foo   test  3 2018-01-02  1.0
1  foo  train  3 2018-01-02  1.0
2  foo   test  3 2018-01-02  1.0
3  foo  train  3 2018-01-02  1.0


### 对数据的值排序

In [19]:
# Order the rows by the values in column 'B' (ascending by default). A new
# frame is returned; df2 itself is not modified.
rows_by_b = df2.sort_values('B')
print(rows_by_b)

     A          B  C      E    F
0  1.0 2018-01-02  3   test  foo
1  1.0 2018-01-02  3  train  foo
2  1.0 2018-01-02  3   test  foo
3  1.0 2018-01-02  3  train  foo
