# 利用python进行数据分析

## 附录部分：python入门

#### 异常处理

In [3]:
def attempt_float(x):
    """Convert ``x`` to a float, reporting a failure instead of raising.

    Args:
        x: Any object to pass to ``float()``.

    Returns:
        The converted float, or ``None`` when ``float(x)`` raises an
        ``Exception`` (a message naming ``x`` is printed in that case).
    """
    try:
        return float(x)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        # x is wrapped in a 1-tuple because "%s" % x would itself raise
        # TypeError when x is a tuple (% unpacks it as the argument list).
        print("%s 不是一个类似浮点数的东西" % (x,))
        return None
        
attempt_float("12.34")

12.34

In [4]:
attempt_float("something")

something 不是一个类似浮点数的东西


+ 在一个元组中定义可能的异常类型，就可以捕获多种异常：

In [5]:
def attempt_float(x):
    """Return ``float(x)``, or ``x`` itself when it cannot be converted.

    Only ``TypeError`` and ``ValueError`` count as "not convertible"; any
    other exception raised by ``float`` propagates to the caller.
    """
    try:
        number = float(x)
    except (TypeError, ValueError):
        # A tuple of exception types handles all of them with one clause.
        return x
    else:
        return number
    
attempt_float([1,2])

[1, 2]

+ 并不想考虑异常，而是希望代码可以正常运行：

##### range：返回的是一个惰性求值的range对象（可迭代，但不是列表，也不是迭代器）

#### 三元表达式

In [6]:
def panduan(x):
    """Print "非负" when ``x >= 0`` and "负数" otherwise; returns ``None``."""
    # The conditional expression picks the text; print is called exactly once.
    label = "非负" if x >= 0 else "负数"
    print(label)

panduan(10)

非负


##### 元组拆包

In [7]:
tup=(2,3,4)
a,b,c=tup
b

3

In [8]:
tup = (2,3,(4,5))
a,b,(c,d) = tup
c

4

#### 列表
insert的计算量比append 大，判断列表是否含有某一个值比字典，set耗时；
+ extend添加元素，比合并节省资源，不用再创建一个新列表

In [9]:
x = ["s","as","sas"]
x.extend([7,8,9])
x

['s', 'as', 'sas', 7, 8, 9]

#### 二分搜索
+ bisect.bisect查找插入到哪个位置可以保持列表的有序性，而bisect.insort将新元素插入到那个位置
+ **注意**：这两个函数都不会检查列表是否有序；对未排序的列表调用不会报错，但结果可能不正确。

In [10]:
import bisect
c=[1,2,2,3,4,5]
bisect.bisect(c,2)

3

In [11]:
bisect.insort(c,4)
c

[1, 2, 2, 3, 4, 4, 5]

#### 切片

In [12]:
# 倒置
seq=[7,2,3,4,5]
seq[::-1]

[5, 4, 3, 2, 7]

In [13]:
"12345"[::-1]

'54321'

#### 内置的序列函数
+ enumerate()函数,它可以返回序列的(i,value)元组

In [14]:
alist = ["foo","ds","ds"]
for i,value in enumerate(alist):
    print(i,value)

0 foo
1 ds
2 ds


In [15]:
# Build a value -> position lookup; for duplicate values the last position wins.
# Named `mapping` so the builtin map() is not shadowed.
mapping = {v: i for i, v in enumerate(alist)}
mapping

{'ds': 2, 'foo': 0}

#### zip：将多个序列进行配对

In [16]:
seq1 = ["foo","zoo","ses"]
seq2 = [1,2,3]
list(zip(seq1,seq2))

[('foo', 1), ('zoo', 2), ('ses', 3)]

In [17]:
# 可以接受多个序列，长度由最短的序列所决定
seq3 = ["sa","sas"]
list(zip(seq1,seq2,seq3))

[('foo', 1, 'sa'), ('zoo', 2, 'sas')]

In [18]:
for i,(a,b) in enumerate(zip(seq1,seq2)):
    print("%d: %s, %s" % (i,a,b))

0: foo, 1
1: zoo, 2
2: ses, 3


+ unzip(解压）：


In [19]:
a = zip(seq1,seq2)
pitchers = [("Nolan","Ryan"), ("Roger","Clemens"),("Scds","dssd")]
first_name, last_name = zip(*pitchers)

In [20]:
first_name

('Nolan', 'Roger', 'Scds')

In [21]:
# reversed 返回逆序的序列
list(reversed(range(10)))

[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

#### 字典

In [22]:
d1 = {'a':"some value","b":"big","c":"cost"}
d1

{'a': 'some value', 'b': 'big', 'c': 'cost'}

In [23]:
d1["d"] = "dozen"  #访问与添加

In [24]:
d1

{'a': 'some value', 'b': 'big', 'c': 'cost', 'd': 'dozen'}

In [25]:
"b" in d1

True

In [26]:
d1.keys()  # returns a dict_keys view of the keys (not a list)

dict_keys(['c', 'b', 'a', 'd'])

+ del删除值，pop删除之后还返回那个值

In [27]:
d1["dummy"] = "another value"
del d1["dummy"]
d1

{'a': 'some value', 'b': 'big', 'c': 'cost', 'd': 'dozen'}

In [28]:
d1.pop("d")

'dozen'

+ update:一个字典合并到另一个字典中去

In [29]:
d1.update({"c":"aas"})
d1

{'a': 'some value', 'b': 'big', 'c': 'aas'}

+ 二元组生成一个字典

In [30]:
# Pair two equal-length sequences with zip and build a dict from the 2-tuples.
# NOTE(review): `map` shadows the builtin map(); the name is kept here because
# the cells that follow call map.get(...) and map.pop(...) on this dict.
map = dict(zip(range(5),range(5)))
map

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4}

In [31]:
map.get(5,"na")  #如果5不在key中，返回na

'na'

In [32]:
map.pop(5,"na")

'na'

+ 字典的key只能选取不可变对象，若想将list作为key，可以先将list改为tuple
#### set集合

In [33]:
# A set literal drops the duplicate elements just like set([...]) does.
a = {1, 2, 3, 4, 2, 3, 1}
a

{1, 2, 3, 4}

In [34]:
a.add(3)
a

{1, 2, 3, 4}

#### 列表推导式：以一条表达式创建一个列表

+ 1. 列表：

In [35]:
string = ["aa","as","eric","james","harden"]
[x.upper() for x in string if len(x)>2]

['ERIC', 'JAMES', 'HARDEN']

+ 2. set集合,需要使用的是花括号

In [36]:
{len(x) for x in string}

{2, 4, 5, 6}

+ 字典：也是使用花括号

In [37]:
{val: value for value,val in enumerate(string)}

{'aa': 0, 'as': 1, 'eric': 2, 'harden': 4, 'james': 3}

In [38]:
dict((val,value) for value, val in enumerate(string))

{'aa': 0, 'as': 1, 'eric': 2, 'harden': 4, 'james': 3}

+ 嵌套列表推导式

In [39]:
all_data=[["tom","jason","hana"],["saas","sasa","asasas"]]

[name for names in all_data for name in names if len(name)>2]

['tom', 'jason', 'hana', 'saas', 'sasa', 'asasas']

In [40]:
some_tuple = [(1,2,3),(2,3,4),(4,5,6)]
[x for tup in some_tuple for x in tup ]

[1, 2, 3, 2, 3, 4, 4, 5, 6]

In [41]:
[[x for x in tup] for tup in some_tuple]

[[1, 2, 3], [2, 3, 4], [4, 5, 6]]

#### 函数
+ 参数里，先位置参数，后使用关键字
+ 命名空间的概念：局部空间在函数运行完之后会直接销毁
+ 局部函数：外层函数被调用的时候才会被动态创建出来；


+ 返回多个值：这多个值事实上是一个元组
+ 函数也是对象

In [42]:
def remove_punctuation(value):
    """Return ``value`` with every '!', '#' and '?' character removed.

    Args:
        value: The string to clean.

    Returns:
        A new string without those three punctuation characters.
    """
    # Imported locally: `re` is not imported by any earlier cell visible in
    # these notes, so the function would otherwise fail with NameError.
    import re

    return re.sub(r'[!#?]', '', value)

clean_ops = [str.strip,remove_punctuation,str.title]

def clean_strings(strings,ops):
    """Apply every callable in ``ops``, in order, to each item of ``strings``.

    Args:
        strings: Iterable of values to clean.
        ops: Iterable of one-argument callables; each receives the output
            of the previous one.

    Returns:
        A new list with the fully transformed values, in input order.
    """
    def run_pipeline(text):
        # Thread one value through the whole chain of operations.
        for step in ops:
            text = step(text)
        return text

    return [run_pipeline(item) for item in strings]



#### 匿名函数


In [43]:
string.sort(key=lambda x: len(set(list(x))))
string

['aa', 'as', 'eric', 'james', 'harden']

#### 闭包：返回函数的函数
+ 被返回的函数可以访问其创建者的局部空间中的变量；即使它的创建者已经执行完毕，闭包仍然能访问其创建者的局部命名空间

In [44]:
def make_watcher():
    """Return a closure that reports whether it has been called with a value before.

    Each call to ``make_watcher`` creates its own private record, so
    watchers are independent of one another.
    """
    registry = {}

    def has_been_seen(x):
        # Check membership before recording, so the first sighting is False.
        seen_before = x in registry
        registry[x] = True
        return seen_before

    return has_been_seen

watcher=make_watcher()
vals=[5,6,1,5,1,6,3,5]
[watcher(x) for x in vals]

[False, False, False, True, True, True, False, True]

+ 虽然可以修改任何内部状态对象，但是不能直接绑定外层函数作用域的变量（Python 3中需要先用nonlocal声明）；一个方法就是修改字典或列表，而不是去绑定变量

#### 调用语法*args **kwargs
#### 柯里化:部分参数应用
#### 生成器

In [45]:
gen  = (x ** 2 for x in range(100))
gen

<generator object <genexpr> at 0x000001ED84A63A98>

In [46]:
sum(gen)

328350

#### itertools 模块：储存了很多用于常见数据算法的生成器

In [47]:
import itertools


def first_letter(x):
    """Return the first character of ``x`` (used as the grouping key)."""
    return x[0]


names = ["sa", "asas", "Jas", "assa"]

# groupby only merges *consecutive* items that share a key, so this unsorted
# input yields two separate 'a' groups.  The loop variable is called `group`
# so the `names` list is not rebound to a group iterator.
for letter, group in itertools.groupby(names, first_letter):
    print(letter, list(group))

s ['sa']
a ['asas']
J ['Jas']
a ['assa']


## 第二章，引言

In [48]:
import json
path="E:/python学习/利用python进行数据分析 源码与数据/ch02/usagov_bitly_data2012-03-16-1331923249.txt"
# Parse the file line by line, each line being one JSON document.  The `with`
# block closes the handle as soon as every line has been read.
# NOTE(review): assumes the file is UTF-8 encoded (the JSON default) - confirm.
with open(path, encoding="utf-8") as lines:
    records = [json.loads(line) for line in lines]

FileNotFoundError: [Errno 2] No such file or directory: 'E:/python学习/利用python进行数据分析 源码与数据/ch02/usagov_bitly_data2012-03-16-1331923249.txt'

In [None]:
records[0]

In [None]:
test = records[0]  #时区
test["tz"]

+ 使用pandas进行处理

In [None]:
from pandas import DataFrame, Series
import pandas as pd;import numpy as np
frame = DataFrame(records)


In [None]:
frame

In [None]:
frame['tz'][:10]

In [None]:
frame["tz"].value_counts()[:10]  #series对象有一个value_count方法

In [None]:
clean = frame["tz"].fillna("Missing")
clean[clean == ""] = "Unkown"
tz_counts = clean.value_counts()
tz_counts[:10]

In [None]:
%matplotlib inline
tz_counts[:10].plot(kind="barh",rot=0)

In [None]:
result = Series([x.split()[0] for x in frame.a.dropna()])
result[:10]

In [None]:
cframe = frame[frame.a.notnull()]
cframe[:10]

判断agent信息中是否包含windows,紧接着根据系统进行分类

In [None]:
operating_system = np.where(cframe['a'].str.contains("Windows"), "Windows", "Not Windows")
operating_system[:10]

In [None]:
by_tz_os = cframe.groupby(['tz',operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)

In [None]:
agg_counts[:10]

+ MovieLens数据

In [None]:
uname = ["user_id","gender", "age", "occupation", "zip"]
users = pd.read_table("E:/python学习/利用python进行数据分析 源码与数据/ch02/movielens/users.dat", sep = "::", header=None, names=uname)

In [None]:
users[:10]

In [None]:
rname=["user_id","movie_id","rating","timestamp"]

rating = pd.read_table("E:/python学习/利用python进行数据分析 源码与数据/ch02/movielens/ratings.dat",
                      sep="::",header=None, names = rname)
rating[:10]

In [None]:
mnames = ["movie_id","title","genres"]
movies = pd.read_table("E:/python学习/利用python进行数据分析 源码与数据/ch02/movielens/movies.dat",
                      sep="::",header=None,names=mnames)
movies[:10]

In [None]:
data = pd.merge(pd.merge(rating,users), movies)
data[:10]

In [None]:
movies[:13]

## 第三章 Ipython

+ 查找命名空间

In [None]:
import pandas as pd
pd.*table*?

+ %run 一个python文件
+ 执行剪切板中的代码：
+ magic命令

In [None]:
%magic

## 第四章 numpy基础

### ndarray:多维数组对象


In [None]:
import numpy as np
data = np.array([1,2.1,3.1,4,5])
data

In [None]:
data2=[[1,2,3,4],[1,2,3]]
data2=np.array(data2)
data2

In [None]:
data2.dtype

In [None]:
data.dtype

+ zero全零矩阵，ones全1矩阵

In [None]:
zero=np.zeros(10)
zero

In [None]:
zero.dtype

In [None]:
np.zeros((3,4))

In [None]:
np.ones((2,3,3))

In [None]:
np.arange(10)

In [None]:
np.array([1+2j,2,3]).dtype  #复数

#### 数据类型

In [None]:
arr = np.array([1,2,3,4])
arr

In [None]:
arr.dtype

+  使用astype转换其dtype

In [None]:
arr1 = arr.astype(np.float64)
arr1

In [None]:
arr1.dtype

In [None]:
string = np.array(["1.2","3.4"])
# np.float was only an alias of the builtin float and was removed in
# NumPy 1.24; np.float64 names the same 64-bit dtype explicitly.
string.astype(np.float64).dtype

In [None]:
string.dtype

In [None]:
test = np.array([1,2,3], dtype="f8")  # 可以使用简写
test.dtype

### 数组和标量之间的运算
矢量化运算

+ 同样大小的数组之间的元素会将运算运用到一一对应的元素中去

In [None]:
arr = np.array([[1,2,3],[4,5,6]])
arr

+ 切片，索引

In [None]:
arr = np.arange(10)
arr

In [None]:
arr[0:3]

In [None]:
arr[5]

+ 将一个标量值赋予切片时，该值会自动传播到整个选区；

In [None]:
arr[5:8]=12
arr

*注意*：数组切片不会进行复制，上面发生的改变就直接在原先的对象上进行更改了；

In [None]:
arr_slice = arr[5:8]
arr_slice

In [None]:
arr_slice[1] = 100  #直接发生改变了
arr

In [None]:
arr_slice[:] = 120
arr

+ 为了解决这个问题，如果我们想要得到一个拷贝的话，可以写下copy语句

In [None]:
arr2 = arr.copy()
arr2

In [None]:
arr2[:]=100
arr

In [None]:
arr3d = np.array([[[1,2,3],[2,3,4]],[[7,8,9],[3,4,5]]])
arr3d

In [None]:
arr3d[0]

In [None]:
arr3d[0,1]

In [None]:
arr3d[0,1,0]

熟悉索引很重要，多维度的数组都是一个视图，同时索引都是从0开始的；

In [None]:
arr3d[0,:2,1]

#### 布尔型索引

In [None]:
names = np.array(["bob","as","asassa","bob","eric"])
names

In [None]:
from numpy.random import randn
data = randn(5,4)
data

In [None]:
data[names=="bob"]

In [None]:
data[names=="bob",2:]

In [None]:
arr3d[1,1:,1:]

In [None]:
arr3d[1,1:,1:].shape

In [None]:
data[names!="bob",2:]

In [None]:
data[~(names=="bob")]     # 把-号改为了~

In [None]:
data[data < 0] = 0
data

+ 花式索引

In [None]:
data

In [None]:
data[[2,3,1,0]]

In [None]:
data[[2,3,1,0],[1,2,3,0]]

+ 从上面的结果可以知道，同时传入两个花式索引并不会像R那样选出一个矩形区域，而是选出一一对应位置上的元素；若想得到矩形区域，可以考虑使用下面的写法：

In [None]:
data[[2,3,1,0]][:,[1,2,3,0]]

**注意**：花式索引总是将数据复制到新数组之中，因此对其结果赋值不会改变原有的数据

In [None]:
data[[2,3,1,0]][0,1] = 1
data              #并没有发生改变

#### 数组转置和轴对换
+ 转置同样是一个视图，就是不会进行复制，会对原有对象产生直接变化

In [None]:
arr = np.arange(15).reshape((3,5))
arr

In [None]:
arr.T

In [None]:
arr.T[0,1] = 20
arr.T

In [None]:
arr

#### 通用函数ufunc :类似于lapply呗

In [None]:
arr = np.arange(10)
np.cos(arr)

In [None]:
points=np.arange(-5,5,0.01)
xs, ys = np.meshgrid(points, points)
ys

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
z=np.sqrt(xs**2 + ys ** 2)
plt.imshow(z,cmap=plt.cm.gray);
plt.colorbar()


#### ifelse的矢量化：np.where

In [None]:
xarr = np.array([1.1,1.2,1.3,1.4,1.5])
yarr = np.array([2.1,2.2,2.3,2.4,2.5])
cond = np.array([True,False,True,True,False])

result = np.where(cond, xarr, yarr)

In [None]:
result

In [None]:
arr = randn(4,4)
arr

In [None]:
np.where(arr > 0 , "正", "负")

In [None]:
np.where(arr < 0 ,"负", np.where(arr > 0.2, "0.2", "0"))

#### 数学和统计方法

In [None]:
arr = np.random.randn(5,4)
arr

In [None]:
arr.mean()

In [None]:
arr.mean(axis=1) # average across the columns: one mean per row

In [None]:
arr.mean(axis=0)  # average down the rows: one mean per column

In [None]:
arr.mean(0)

In [None]:
arr.cumsum()

#### 用于bool型的方法

In [None]:
arr = randn(100)
(arr>0).sum()

In [None]:
bools =np.array([False,True,False,True])
bools.any()

In [None]:
bools.all()

#### 排序
sort

In [None]:
arr.sort()
arr[:10]

In [80]:
import numpy as np
from numpy.random import randn
arr = randn(4,3)
arr

array([[-0.05614454, -0.77399023, -1.62068385],
       [-0.0606662 ,  1.46459721,  0.61282961],
       [ 0.44229106, -0.24570093,  2.81967163],
       [-2.06221973,  1.4367815 , -0.63169763]])

+ 在任意一个轴上进行排序

In [None]:
arr.sort(1)
arr

In [None]:
arr.sort(0)
arr

#### 唯一化 unique

In [None]:
names = np.array(["Bob","Joe","Will","Joe"])
names

In [None]:
np.unique(names)

#### 集合运算
+ intersect1d(x,y) 公共元素，有序输出

In [None]:
x = np.array([1,2,3,4,5])
y = np.array([2,3,4])
np.intersect1d(x,y)

+ union1d(x,y) 并集

In [None]:
np.union1d(x,y)

+ in1d: x的元素是否包含于y，类似于%in%

In [None]:
np.in1d(x,y)

+ setdiff1d : 集合的差，在x中且不在y中

In [None]:
np.setdiff1d(x,y)

+ setxor1d ： 对称差

In [None]:
np.setxor1d(x,y)

### 数组文件的输入输出
#### 二进制文件格式

In [None]:
arr

In [None]:
np.save("some_arr", arr)

In [None]:
np.load("some_arr.npy")

+ 多个文件保存到一个压缩文件

In [None]:
np.savez("zip",a=arr,b=arr)


In [None]:
arch = np.load("zip.npz")
# arch其实是个字典
arch['a']

#### 随机数生成

In [None]:
samples = np.random.normal(size=(4,4))
samples

In [None]:
np.random.permutation(range(10))

In [None]:
np.random.seed(1)
randn(10)

## 第五章 pandas入门

In [2]:
from pandas import Series, DataFrame
import pandas as pd

#### Series对象
类似于带name的vector：分values与index两项;
可以把它看成是一个定长的有序字典

In [50]:
obj = Series([9,10,2,3])
obj

0     9
1    10
2     2
3     3
dtype: int64

In [51]:
obj.values

array([ 9, 10,  2,  3], dtype=int64)

In [52]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [53]:
# 自定义index
obj1 = Series([1,2,3,4], index=["d","a","w","w"])
obj1

d    1
a    2
w    3
w    4
dtype: int64

In [54]:
obj1["w"]

w    3
w    4
dtype: int64

In [55]:
'w' in obj1

True

+ 既然可以被认为是字典，自然可以通过字典直接生成；

In [56]:
dict1 = {"Harden":13, "Anderson":30, "Ariza":31}
obj2 = Series(dict1)
obj2

Anderson    30
Ariza       31
Harden      13
dtype: int64

In [57]:
player = ["Eric","Harden","Ariza"]
obj3 = Series(dict1, index=player)
obj3

Eric       NaN
Harden    13.0
Ariza     31.0
dtype: float64

In [58]:
obj3.isnull()

Eric       True
Harden    False
Ariza     False
dtype: bool

In [59]:
obj3.notnull()

Eric      False
Harden     True
Ariza      True
dtype: bool

In [60]:
obj3 + obj2

Anderson     NaN
Ariza       62.0
Eric         NaN
Harden      26.0
dtype: float64

In [61]:
obj3.name = "Player"
obj3

Eric       NaN
Harden    13.0
Ariza     31.0
Name: Player, dtype: float64

In [62]:
obj3.index.name= "index"
obj3

index
Eric       NaN
Harden    13.0
Ariza     31.0
Name: Player, dtype: float64

In [63]:
obj3.index = ["Gordon","Harden","Ariza"]  #修改index
obj3

Gordon     NaN
Harden    13.0
Ariza     31.0
Name: Player, dtype: float64

## DataFrame
每一列其实是一个Series，这些列共用一个索引

In [64]:
data = {"Team":["Hou","SAC","PHX"], "Player":["James Harden","Cousins","Bledosoe"],"Num":[13,15,2]}

In [65]:
frame = DataFrame(data)
frame

Unnamed: 0,Num,Player,Team
0,13,James Harden,Hou
1,15,Cousins,SAC
2,2,Bledosoe,PHX


+ 自定义列的顺序

In [66]:
frame2 = DataFrame(data, columns=["Team","Player","Num"])
frame2

Unnamed: 0,Team,Player,Num
0,Hou,James Harden,13
1,SAC,Cousins,15
2,PHX,Bledosoe,2


In [67]:
frame2=DataFrame(data, columns=["Team","Player","Num","Score"], index = ["One","Two","Three"])
frame2

Unnamed: 0,Team,Player,Num,Score
One,Hou,James Harden,13,
Two,SAC,Cousins,15,
Three,PHX,Bledosoe,2,


In [68]:
frame2["Team"]

One      Hou
Two      SAC
Three    PHX
Name: Team, dtype: object

In [69]:
frame2.ix['Three']

Team           PHX
Player    Bledosoe
Num              2
Score          NaN
Name: Three, dtype: object

In [70]:
frame2["Score"] = 2
frame2

Unnamed: 0,Team,Player,Num,Score
One,Hou,James Harden,13,2
Two,SAC,Cousins,15,2
Three,PHX,Bledosoe,2,2


+ 注意用Series赋值给列时，index也必须匹配

In [71]:
frame2["Score"] = Series([20,15,15], index = ["One","Two", "Three"])
frame2

Unnamed: 0,Team,Player,Num,Score
One,Hou,James Harden,13,20
Two,SAC,Cousins,15,15
Three,PHX,Bledosoe,2,15


In [72]:
frame2["Score"] = Series([20,15],index=["Two","One"])
frame2

Unnamed: 0,Team,Player,Num,Score
One,Hou,James Harden,13,15.0
Two,SAC,Cousins,15,20.0
Three,PHX,Bledosoe,2,


+ 新增列：直接给一个不存在的列进行赋值；
+ 删除列：del

In [73]:
frame2["Star"] = frame2.Score > 15
frame2

Unnamed: 0,Team,Player,Num,Score,Star
One,Hou,James Harden,13,15.0,False
Two,SAC,Cousins,15,20.0,True
Three,PHX,Bledosoe,2,,False


In [74]:
del frame2["Star"]
frame2

Unnamed: 0,Team,Player,Num,Score
One,Hou,James Harden,13,15.0
Two,SAC,Cousins,15,20.0
Three,PHX,Bledosoe,2,


+ 另外一种方式，嵌套的字典

In [75]:
pop = {"Neveda":{2001:2.4,2002:2.9},"Ohio":{2000:1.5,2001:2.0,2002:3.7}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Neveda,Ohio
2000,,1.5
2001,2.4,2.0
2002,2.9,3.7


In [76]:
frame3.T

Unnamed: 0,2000,2001,2002
Neveda,,2.4,2.9
Ohio,1.5,2.0,3.7


In [77]:
DataFrame(pop, index=[2001,2002,2003])

Unnamed: 0,Neveda,Ohio
2001,2.4,2.0
2002,2.9,3.7
2003,,


In [81]:
arr

array([[-0.05614454, -0.77399023, -1.62068385],
       [-0.0606662 ,  1.46459721,  0.61282961],
       [ 0.44229106, -0.24570093,  2.81967163],
       [-2.06221973,  1.4367815 , -0.63169763]])

In [82]:
DataFrame(arr, columns=["a","b","c"], index = ["One","Two","Three","Four"])

Unnamed: 0,a,b,c
One,-0.056145,-0.77399,-1.620684
Two,-0.060666,1.464597,0.61283
Three,0.442291,-0.245701,2.819672
Four,-2.06222,1.436781,-0.631698


In [83]:
DataFrame([{2001:1,2002:3},{2002:5, 2004:10}])

Unnamed: 0,2001,2002,2004
0,1.0,3,
1,,5,10.0


In [84]:
frame2.index.name = "year"
frame2.columns.name = "Atrributes"
frame2

Atrributes,Team,Player,Num,Score
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
One,Hou,James Harden,13,15.0
Two,SAC,Cousins,15,20.0
Three,PHX,Bledosoe,2,


In [85]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  2. ],
       [ 2.9,  3.7]])

In [86]:
frame2.values

array([['Hou', 'James Harden', 13, 15.0],
       ['SAC', 'Cousins', 15, 20.0],
       ['PHX', 'Bledosoe', 2, nan]], dtype=object)

#### 索引对象:注意index是不可变的（immutable）
+ 每一次构建一个DataFrame时，序列的标签都会被转换成一个index

In [88]:
obj = Series(range(3), index=["a","b","c"])
obj

a    0
b    1
c    2
dtype: int32

In [89]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [90]:
index[0]

'a'

In [92]:
index[0] = "one"  #Index does not support mutable operations

TypeError: Index does not support mutable operations

+ index可以在多个数据结构之间共享

In [94]:
obj2 = Series([-1,2,3], index = index)
obj2

a   -1
b    2
c    3
dtype: int64

In [95]:
frame3

Unnamed: 0,Neveda,Ohio
2000,,1.5
2001,2.4,2.0
2002,2.9,3.7


+ 索引对象类似于一个固定长度的，不可修改的集合，同时index类定义了一系列methods和attributes；

In [96]:
2000 in frame3.index

True

In [97]:
index1 = frame3.index
index1

Int64Index([2000, 2001, 2002], dtype='int64')

In [99]:
index2 = frame2.index
index2

Index(['One', 'Two', 'Three'], dtype='object', name='year')

+ append:连接一个新的index，生成新的index

In [100]:
index1.append(index2)

Index([2000, 2001, 2002, 'One', 'Two', 'Three'], dtype='object')

+ isin 是否在给定的list之中

In [107]:
index1.isin([2000,2001])


array([ True,  True, False], dtype=bool)

+ delete : 将某一个特定的索引值删除，产生一个新的索引

In [112]:
index1.delete(1)

Int64Index([2000, 2002], dtype='int64')

+ drop：删除传入的值，并得到新的index

In [116]:
index1.drop(2000)

Int64Index([2001, 2002], dtype='int64')

+ insert：在loc位置插入传入的值，并得到新的index

In [118]:
index1.insert(1,2005)

Int64Index([2000, 2005, 2001, 2002], dtype='int64')

+ 判断一个index是否是递增的

In [120]:
index1.is_monotonic

True

+ 判断index中的值是否唯一（没有重复）

In [123]:
index1.is_unique

True

+ 计算index中unique的值

In [124]:
index1.unique()

array([2000, 2001, 2002], dtype=int64)

#### 基础功能
##### 重新索引 reindex

In [125]:
obj = Series([4,5,6,7], index = ["a","b","c","d"])
obj

a    4
b    5
c    6
d    7
dtype: int64

In [126]:
obj2 = obj.reindex(["b","c","a","e","d"])
obj2

b    5.0
c    6.0
a    4.0
e    NaN
d    7.0
dtype: float64

In [127]:
obj2 = obj.reindex(["b","c","a","e","d"], fill_value = "缺失值")
obj2

b      5
c      6
a      4
e    缺失值
d      7
dtype: object

In [130]:
obj3 = Series(["blue","purple","red"], index = [0,2,4])
obj3

0      blue
2    purple
4       red
dtype: object

+ 同时可以使用一些插值处理：

In [131]:
obj3.reindex(range(6), method="ffill")

0      blue
1      blue
2    purple
3    purple
4       red
5       red
dtype: object

In [132]:
obj3.reindex(range(6), method="bfill")

0      blue
1    purple
2    purple
3       red
4       red
5       NaN
dtype: object

In [134]:
frame =DataFrame(np.arange(9).reshape((3,3)), index = ['a','c','d'], columns=['Ohio',"Texas","California"])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [135]:
frame.reindex(index=["a","b","c","d"], columns=["Ohio","Texas","Washington","California"])


Unnamed: 0,Ohio,Texas,Washington,California
a,0.0,1.0,,2.0
b,,,,
c,3.0,4.0,,5.0
d,6.0,7.0,,8.0


#### 丢弃指定的轴上的项

In [137]:
obj = Series(np.arange(5.), index = ["a","b","c","d","e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [138]:
new_obj = obj.drop("c")
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [139]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [140]:
frame.drop("a")

Unnamed: 0,Ohio,Texas,California
c,3,4,5
d,6,7,8


In [141]:
frame.drop(["a","d"])

Unnamed: 0,Ohio,Texas,California
c,3,4,5


In [143]:
frame.drop("Ohio", axis = 1)

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


In [144]:
frame.drop("a", axis = 0)

Unnamed: 0,Ohio,Texas,California
c,3,4,5
d,6,7,8


#### 索引，选取与过滤

In [145]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [146]:
obj["a"]

0.0

In [147]:
obj[1]

1.0

In [148]:
obj[[1,2]]

b    1.0
c    2.0
dtype: float64

In [149]:
obj[["a","c"]]

a    0.0
c    2.0
dtype: float64

In [150]:
obj["b":"c"]

b    1.0
c    2.0
dtype: float64

In [151]:
obj["b":"c"] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
e    4.0
dtype: float64

In [152]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [153]:
frame["Ohio"]

a    0
c    3
d    6
Name: Ohio, dtype: int32

In [154]:
frame[["Ohio","Texas"]]

Unnamed: 0,Ohio,Texas
a,0,1
c,3,4
d,6,7


In [171]:
frame[:2]

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5


In [159]:
frame[[2,1]]

Unnamed: 0,California,Texas
a,2,1
c,5,4
d,8,7


#### 注意pandas里面的索引
若想要实现R里面的行列同时提取，可以使用ix（注意：ix已在pandas 1.0中被移除，新版本请改用按标签的loc或按位置的iloc）

In [161]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [162]:
frame.ix[1,2]

5

In [163]:
frame.ix[["a","d"],["Ohio","Texas"]]

Unnamed: 0,Ohio,Texas
a,0,1
d,6,7


In [164]:
frame.ix[0:1,[2,1]]

Unnamed: 0,California,Texas
a,2,1


In [166]:
frame.ix["a":"c",:"Texas"]

Unnamed: 0,Ohio,Texas
a,0,1
c,3,4


In [167]:
frame.ix[2]

Ohio          6
Texas         7
California    8
Name: d, dtype: int32

In [179]:
frame[[1,2]]

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


+ DataFrame[i]不对，DataFrame[[i]]表示的是取相关列，DataFrame[i:j]表示的是取相关行
+ 还是尽量多选用ix比较好（在新版pandas中ix已被移除，对应的写法是loc/iloc）

In [185]:
frame[:1]

Unnamed: 0,Ohio,Texas,California
a,0,1,2


In [183]:
frame2[[1]]

Atrributes,Player
year,Unnamed: 1_level_1
One,James Harden
Two,Cousins
Three,Bledosoe


##### ix

In [186]:
frame.ix[1,2]

5

In [188]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [189]:
frame.ix[0:2,1]

a    1
c    4
Name: Texas, dtype: int32

In [190]:
frame.ix["a":"c",[2,1]]

Unnamed: 0,California,Texas
a,2,1
c,5,4


+ DataFrame与array的计算同样遵循广播的原则
#### 函数应用和映射，其实就是apply方法

In [191]:
frame = DataFrame(np.random.randn(4,3), columns = list("bde"), index = ["Utah","Ohio","Texas","Oregon"])
frame

Unnamed: 0,b,d,e
Utah,-0.449776,-1.525782,1.193768
Ohio,-0.282743,-2.049467,0.18199
Texas,0.09441,-0.074407,1.475825
Oregon,-1.706569,0.984483,0.562119


In [192]:
f = lambda x : x.max()-x.min()
frame.apply(f)

b    1.800978
d    3.033949
e    1.293836
dtype: float64

In [193]:
# 在所有行上进行计算
frame.apply(f, axis = 1)

Utah      2.719550
Ohio      2.231456
Texas     1.550233
Oregon    2.691051
dtype: float64

In [194]:
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled "min"/"max"."""
    labels = ["min", "max"]
    extremes = [x.min(), x.max()]
    return Series(extremes, index=labels)

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.706569,-2.049467,0.18199
max,0.09441,0.984483,1.475825


##### applymap函数：对一个数据框之中所有的元素进行一个运算

In [195]:
format = lambda x: "%.2f" % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.45,-1.53,1.19
Ohio,-0.28,-2.05,0.18
Texas,0.09,-0.07,1.48
Oregon,-1.71,0.98,0.56


#### 排序和排名
+ sort_index:按照index排序

In [3]:
obj = Series(range(4), index = ['d','a','b','c'])
obj

d    0
a    1
b    2
c    3
dtype: int32

In [4]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

对于dataframe则可以根据任意一个轴上的索引进行排序

In [7]:
import numpy as np
frame = DataFrame(np.arange(8).reshape((2,4)), index = ["Three","One"], columns = ["d","a","b","c"])
frame

Unnamed: 0,d,a,b,c
Three,0,1,2,3
One,4,5,6,7


In [8]:
frame.sort_index()

Unnamed: 0,d,a,b,c
One,4,5,6,7
Three,0,1,2,3


In [9]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
Three,1,2,3,0
One,5,6,7,4


In [10]:
frame.sort_index(axis=1, ascending = False)

Unnamed: 0,d,c,b,a
Three,0,3,2,1
One,4,7,6,5


+ order： 按照值来排序,
+ 现在似乎已经是sort_values

In [12]:
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int32

In [13]:
obj.sort_values(ascending = False)

c    3
b    2
a    1
d    0
dtype: int32

In [14]:
obj = Series([4,np.nan,7,-3,2])
obj

0    4.0
1    NaN
2    7.0
3   -3.0
4    2.0
dtype: float64

In [16]:
obj.sort_values()

3   -3.0
4    2.0
0    4.0
2    7.0
1    NaN
dtype: float64

In [17]:
obj.sort_values(ascending = False)

2    7.0
0    4.0
4    2.0
3   -3.0
1    NaN
dtype: float64

**sort的时候，默认情况下NaN都排在最后（无论升序还是降序），可以通过na_position参数调整**

In [25]:
frame1 = DataFrame({"b":[4,7,-3,2],"a":[0,1,0,1]})
frame1

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [27]:
frame1.sort_values(by = "b")

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [28]:
frame1.sort_values(by=["a","b"])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


##### ranking：平均排名

In [30]:
obj = Series([7,-5,7,4,2,0,4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [31]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

+ 根据出现顺序进行排名：

In [32]:
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [33]:
frame.rank(axis=1)

Unnamed: 0,d,a,b,c
Three,1.0,2.0,3.0,4.0
One,1.0,2.0,3.0,4.0


#### 带有重复值的轴索引


In [34]:
obj = Series(range(5), index = ["a","a","b","b","c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [37]:
obj.index.is_unique

False

In [38]:
obj["a"]

a    0
a    1
dtype: int32

#### 汇总和计算描述统计


In [39]:
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]], index=["a","b","c","d"], columns=["one","two"])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [40]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [41]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [42]:
df.sum(axis=1,skipna=False)

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [45]:
#达到最小值的索引
df.idxmin()

one    d
two    b
dtype: object

In [47]:
# 累积性和
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [48]:
df.describe()



Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,,
50%,,
75%,,
max,7.1,-1.3


In [49]:
df.pct_change()

Unnamed: 0,one,two
a,,
b,4.071429,
c,,
d,-0.894366,-0.711111


#### 相关系数与协方差

In [1]:
import pandas as pd
from pandas import DataFrame,Series


#### 唯一值、值计数以及成员资格

In [3]:
obj = Series(["c","a","a",'b',"s","a"])
uniques = obj.unique()
uniques

array(['c', 'a', 'b', 's'], dtype=object)

+ 频数计算

In [4]:
obj.value_counts() 

a    3
c    1
b    1
s    1
dtype: int64

+ 是否在Series之中

In [7]:
mask = obj.isin(["b"])
mask 

0    False
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [8]:
obj[mask]

3    b
dtype: object

In [9]:
obj.isin(("b","c"))

0     True
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [11]:
data = DataFrame({"Qu1":[1,2,3,4,6], "Qu2":[2,3,1,2,3], "Qu3":[1,5,2,4,4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,2,3,5
2,3,1,2
3,4,2,4
4,6,3,4


In [14]:
data.apply(pd.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,1.0,2.0,1.0
3,1.0,2.0,0.0
4,1.0,0.0,2.0
5,0.0,0.0,1.0
6,1.0,0.0,0.0


#### missing data

In [16]:
import numpy as np
string_data = Series(["asa","asa",np.nan,"asnjkanjk"])
string_data

0          asa
1          asa
2          NaN
3    asnjkanjk
dtype: object

In [17]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [18]:
string_data.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [19]:
string_data.fillna("哈哈")

0          asa
1          asa
2           哈哈
3    asnjkanjk
dtype: object

In [20]:
string_data.dropna()

0          asa
1          asa
3    asnjkanjk
dtype: object

#### 滤除缺失数据

In [22]:
from numpy import nan as NA
data = Series([1,NA, 3.5,4.5])
data

0    1.0
1    NaN
2    3.5
3    4.5
dtype: float64

In [23]:
data.dropna()

0    1.0
2    3.5
3    4.5
dtype: float64

In [24]:
data[data.notnull()]

0    1.0
2    3.5
3    4.5
dtype: float64

In [60]:
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,3.,5.]] )
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,3.0,5.0


In [31]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


+ 只消除那些整行都是NA的数据

In [34]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,3.0,5.0


+ 只消除那些整列都是NA的数据

In [35]:
data.dropna(how="all",axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,3.0,5.0


+ thresh参数表示一行中至少要有thresh个非NA值才会被保留

In [40]:
data.dropna(thresh=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,3.0,5.0


#### 填充缺失数据 


In [44]:
data.fillna("哈哈")

Unnamed: 0,0,1,2
0,1,6.5,3
1,1,哈哈,哈哈
2,哈哈,哈哈,哈哈
3,哈哈,3,5


+ 用一个字典来调用fillna，可以使得每一列用给定的字符来代替

In [45]:
data.fillna({0:"第一列",1:"第二列",2:"第三列"})

Unnamed: 0,0,1,2
0,1,6.5,3
1,1,第二列,第三列
2,第一列,第二列,第三列
3,第一列,3,5


+ fillna默认会返回新对象，但是也可以自定义就地修改

In [54]:
data.fillna(0, inplace = True)


In [55]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,3.0,5.0


In [61]:
data.fillna(method="ffill")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,3.0,5.0


In [66]:

df = DataFrame(np.random.randn(6,3))
# .ix was removed in pandas 1.0.  On this default integer index it selected by
# label, so .loc is the direct replacement.
df.loc[2:,1] = NA
df.loc[4:,2] = NA
df

Unnamed: 0,0,1,2
0,1.210457,0.455409,-0.405739
1,1.486043,-1.104979,-0.928385
2,0.141098,,-0.734534
3,-0.227863,,-1.551969
4,-0.846499,,
5,-1.431357,,


In [67]:
df.fillna(method="ffill",limit=1)

Unnamed: 0,0,1,2
0,1.210457,0.455409,-0.405739
1,1.486043,-1.104979,-0.928385
2,0.141098,-1.104979,-0.734534
3,-0.227863,,-1.551969
4,-0.846499,,-1.551969
5,-1.431357,,


#### 层次化索引

In [68]:
data = DataFrame(np.random.randn(10),
                index=[["a","a","a","b","b","b","c","c","d","d"], [1,2,3,1,2,3,1,2,1,2]])
data

Unnamed: 0,Unnamed: 1,0
a,1,-0.054115
a,2,-0.232676
a,3,-0.88347
b,1,0.404246
b,2,1.408182
b,3,-0.812905
c,1,-1.916982
c,2,1.70112
d,1,-1.435357
d,2,0.566579


In [69]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 0, 1]])

In [77]:
data["a":"b"]

Unnamed: 0,Unnamed: 1,0
a,1,-0.054115
a,2,-0.232676
a,3,-0.88347
b,1,0.404246
b,2,1.408182
b,3,-0.812905


In [79]:
data.ix["b"]

Unnamed: 0,0
1,0.404246
2,1.408182
3,-0.812905


In [81]:
data.ix["a":"b"]

Unnamed: 0,Unnamed: 1,0
a,1,-0.054115
a,2,-0.232676
a,3,-0.88347
b,1,0.404246
b,2,1.408182
b,3,-0.812905


In [85]:
data.unstack()

Unnamed: 0_level_0,0,0,0
Unnamed: 0_level_1,1,2,3
a,-0.054115,-0.232676,-0.88347
b,0.404246,1.408182,-0.812905
c,-1.916982,1.70112,
d,-1.435357,0.566579,


In [86]:
data.unstack().stack()

Unnamed: 0,Unnamed: 1,0
a,1,-0.054115
a,2,-0.232676
a,3,-0.88347
b,1,0.404246
b,2,1.408182
b,3,-0.812905
c,1,-1.916982
c,2,1.70112
d,1,-1.435357
d,2,0.566579


In [87]:
frame = DataFrame(np.arange(12).reshape((4,3)),
                 index=[["a","a","b","b"],[1,2,1,2]],
                 columns = [["Ohio","Ohio","Colorado"],["Green","Red","Green"]])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [90]:
frame.ix["a"]

Unnamed: 0_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Green,Red,Green
1,0,1,2
2,3,4,5


In [92]:
frame.index.names = ["key1","key2"]
frame.columns.names = ["state","color"]
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


#### 重排分级顺序

In [94]:
frame.swaplevel("key1","key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [96]:
frame.sortlevel(1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [97]:
frame.swaplevel(0,1).sortlevel(0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


#### 根据级别汇总统计

In [98]:
frame.sum(level = "key2")

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [100]:
frame.sum(level='color', axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10
