### Pandas-简介

pandas是一个Python语言的软件包，在我们使用Python语言进行机器学习编程的时候，这是一个非常常用的基础编程库。本文是对它的一个入门教程。

pandas提供了快速、灵活且富有表现力的数据结构。

#### Pandas的核心数据结构

##### pandas最核心的就是Series和DataFrame两个数据结构。
DataFrame可以看做是Series的容器，即：一个DataFrame中可以包含若干个Series。

In [2]:
import numpy as np
import pandas as pd

### Series 结构简介

In [9]:
# A Series is a one-dimensional labelled array; with no explicit index
# pandas assigns the default integer labels 0, 1, 2.
a = pd.Series(data=[1, 2, 3])

In [10]:
# Printing a Series shows the index labels on the left, the values on the
# right, and the dtype on the last line.
print(a)

0    1
1    2
2    3
dtype: int64


In [11]:
# The same values as a plain NumPy array, for comparison with the Series above:
# an ndarray carries no index labels.
b = np.asarray([1, 2, 3])

In [12]:
# Printing the ndarray shows only the values; there is no index column.
print(b)

[1 2 3]


### DataFrame 结构简介

创建数据表

In [14]:
# Build a 3x4 DataFrame holding 0..11 row by row. No labels are given, so both
# the rows and the columns get default integer labels.
c = pd.DataFrame(data=np.arange(3 * 4).reshape((3, 4)))

In [15]:
# Display the frame: default integer labels on both axes.
c

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


行列的提取

In [17]:
# Plain [] indexing on a DataFrame selects a column by label; here the column labelled 1.
c[1]

0    1
1    5
2    9
Name: 1, dtype: int32

In [19]:
# iloc selects by integer position; a single integer returns that row as a Series.
c.iloc[1]

0    4
1    5
2    6
3    7
Name: 1, dtype: int32

In [8]:
# Build a 4x4 frame holding 0..15 row by row, with explicit row labels (a-d)
# and column labels (col1-col4).
df = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['a', 'b', 'c', 'd'],
    columns=['col1', 'col2', 'col3', 'col4'],
)

In [9]:
# Display the labelled frame.
df

Unnamed: 0,col1,col2,col3,col4
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [24]:
# Select one column by label; the result is a Series indexed by the row labels.
df['col1']

a     0
b     4
c     8
d    12
Name: col1, dtype: int32

In [25]:
# First two rows of the frame.
df.head(2)

Unnamed: 0,col1,col2,col3,col4
a,0,1,2,3
b,4,5,6,7


In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,col1,col2,col3,col4
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


In [30]:
# Last two rows of the frame.
df.tail(2)

Unnamed: 0,col1,col2,col3,col4
c,8,9,10,11
d,12,13,14,15


In [3]:
# A Series of string labels, later combined with ID into one DataFrame.
note = pd.Series(data=['A', 'B', 'C', 'D'])

In [4]:
# IDs are kept as strings so the leading zeros are preserved.
ID = pd.Series(data=['01', '02', '03', '04'])

In [5]:
# A list of Series is stacked row-wise: each Series becomes one row of df2.
df2 = pd.DataFrame([ID,note])

In [7]:
# Transpose so that each original Series becomes a column instead of a row.
df2.T

Unnamed: 0,0,1
0,1,A
1,2,B
2,3,C
3,4,D


In [10]:
# Show df again as the starting point for adding columns.
df

Unnamed: 0,col1,col2,col3,col4
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [17]:
# Add two columns to contrast the two assignment modes.
# col5: a plain list is assigned by position, so rows a, b, c, d receive
# '01', '02', '03', '04' in that order. This line must run: a later cell
# deletes col5, which fails with KeyError if the column was never created.
df['col5'] = ['01', '02', '03', '04']
# col6: a Series is aligned on its index labels rather than on position.
# With the labels given as b, a, c, d, row 'a' receives '02' and row 'b'
# receives '01'.
df['col6'] = pd.Series(['01', '02', '03', '04'], index=['b', 'a', 'c', 'd'])

In [18]:
# Display df to inspect the newly added columns.
df

Unnamed: 0,col1,col2,col3,col4,col5,col6
a,0,1,2,3,1,2
b,4,5,6,7,2,1
c,8,9,10,11,3,3
d,12,13,14,15,4,4


In [19]:
# Remove col5 again; drop() returns a new frame, which is bound back to df.
df = df.drop(columns=['col5'])

In [20]:
# Display df to confirm that col5 is gone while col6 remains.
df

Unnamed: 0,col1,col2,col3,col4,col6
a,0,1,2,3,2
b,4,5,6,7,1
c,8,9,10,11,3
d,12,13,14,15,4


### 数据访问

In [21]:
# Column labels of the frame, returned as an Index object.
df.columns

Index(['col1', 'col2', 'col3', 'col4', 'col6'], dtype='object')

In [22]:
# Row labels of the frame, returned as an Index object.
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [23]:
# Underlying data as a NumPy array. Because the integer columns are mixed with
# the string column col6, the array comes back with dtype=object.
df.values

array([[0, 1, 2, 3, '02'],
       [4, 5, 6, 7, '01'],
       [8, 9, 10, 11, '03'],
       [12, 13, 14, 15, '04']], dtype=object)

In [31]:
# loc vs. iloc:
# loc selects by index label; iloc selects by integer position.
df

Unnamed: 0,col1,col2,col3,col4,col6
a,0,1,2,3,2
b,4,5,6,7,1
c,8,9,10,11,3
d,12,13,14,15,4


In [32]:
# Label-based lookup: the row whose index label is 'a', returned as a Series.
df.loc['a']

col1     0
col2     1
col3     2
col4     3
col6    02
Name: a, dtype: object

In [34]:
# Label-based selection on both axes: rows 'a' and 'b', columns 'col1' and 'col3'.
df.loc[['a','b'],['col1','col3']]

Unnamed: 0,col1,col3
a,0,2
b,4,6


In [36]:
# Position-based lookup: the first row (position 0), which is the row labelled 'a'.
df.iloc[0]

col1     0
col2     1
col3     2
col4     3
col6    02
Name: a, dtype: object

In [40]:
# Position-based slicing (end-exclusive): the first two rows, and the columns
# at positions 1 and 2.
df.iloc[:2, 1:3]

Unnamed: 0,col2,col3
a,1,2
b,5,6


### Pandas对于数据文件的读取 

In [47]:
# pandas reads tabular files with read_csv / read_excel.
# Load the CSV; the first line of the file is used as the column names.
df3 = pd.read_csv('./data.csv')

In [48]:
# Display the loaded data; the missing price appears as NaN.
df3

Unnamed: 0,date,ID,class,price
0,2018.1.1,1001,A,22.0
1,2018.1.2,1002,B,10.0
2,2018.1.3,1003,A,31.0
3,2018.1.4,1004,C,
4,2018.1.5,1005,D,21.0
5,2018.1.6,1002,B,12.0


In [49]:
# Summarise the column dtypes and non-null counts; a column with fewer
# non-null entries than rows contains missing values.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
date     6 non-null object
ID       6 non-null int64
class    6 non-null object
price    5 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 272.0+ bytes


In [50]:
# Append a 'num' column; the list is matched to the rows by position and must
# have one entry per row.
df3 = df3.assign(num=[1, 2, 1, 3, 5, 3])

In [51]:
# Display df3 with the new 'num' column appended on the right.
df3

Unnamed: 0,date,ID,class,price,num
0,2018.1.1,1001,A,22.0,1
1,2018.1.2,1002,B,10.0,2
2,2018.1.3,1003,A,31.0,1
3,2018.1.4,1004,C,,3
4,2018.1.5,1005,D,21.0,5
5,2018.1.6,1002,B,12.0,3


In [64]:
# Write df3 to disk without the row index (index=False) and without a header
# row (header=False), so the file contains data rows only.
df3.to_csv('./data_addnum.csv',index=False,header=False)

In [67]:
# The file has no header row, so header=None tells pandas to number the
# columns instead of treating the first data row as column names.
df4 = pd.read_csv('./data_addnum.csv',header=None)

In [68]:
# Display the re-loaded data; the columns carry integer labels.
df4

Unnamed: 0,0,1,2,3,4
0,2018.1.1,1001,A,22.0,1
1,2018.1.2,1002,B,10.0,2
2,2018.1.3,1003,A,31.0,1
3,2018.1.4,1004,C,,3
4,2018.1.5,1005,D,21.0,5
5,2018.1.6,1002,B,12.0,3


In [69]:
# Replace the integer column labels with names; the list must have one entry per column.
df4.columns = ['c1','c2','c3','c4','c5']

In [70]:
# Display df4 with the new column names.
df4

Unnamed: 0,c1,c2,c3,c4,c5
0,2018.1.1,1001,A,22.0,1
1,2018.1.2,1002,B,10.0,2
2,2018.1.3,1003,A,31.0,1
3,2018.1.4,1004,C,,3
4,2018.1.5,1005,D,21.0,5
5,2018.1.6,1002,B,12.0,3


In [71]:
# Install note (presumably for pd.read_excel, which needs an Excel reader package such as xlrd):
## sudo pip3 install xlrd

In [73]:
# dropna() with default arguments drops every row containing a missing value
# and returns a new frame; df4 itself is left unchanged.
df5 = df4.dropna()

In [76]:
# df4 still contains the row with the missing c4 value.
df4

Unnamed: 0,c1,c2,c3,c4,c5
0,2018.1.1,1001,A,22.0,1
1,2018.1.2,1002,B,10.0,2
2,2018.1.3,1003,A,31.0,1
3,2018.1.4,1004,C,,3
4,2018.1.5,1005,D,21.0,5
5,2018.1.6,1002,B,12.0,3


In [78]:
# Drop every column that contains at least one missing value; the result is a
# new frame and df4 is not modified.
df4.dropna(axis='columns', how='any')

Unnamed: 0,c1,c2,c3,c5
0,2018.1.1,1001,A,1
1,2018.1.2,1002,B,2
2,2018.1.3,1003,A,1
3,2018.1.4,1004,C,3
4,2018.1.5,1005,D,5
5,2018.1.6,1002,B,3


In [86]:
# Fill the missing values of c4 with the median of c4. The dict form restricts
# the fill to that column; a bare scalar would also be written into NaNs of
# any other column. Returns a new frame; df4 is not modified.
df4.fillna({'c4': df4['c4'].median()})

Unnamed: 0,c1,c2,c3,c4,c5
0,2018.1.1,1001,A,22.0,1
1,2018.1.2,1002,B,10.0,2
2,2018.1.3,1003,A,31.0,1
3,2018.1.4,1004,C,21.0,3
4,2018.1.5,1005,D,21.0,5
5,2018.1.6,1002,B,12.0,3


In [87]:
# End