## 1、索引操作

#### 1-1、直接使用行列索引（先列后行）
    data['open']['2018-02-27']
**必须先列后行，否则报错，同时不支持切片**

#### 1-2、结合loc或者iloc使用索引
**使用行列名字进行切片**

    data.loc['2018-02-27':'2018-02-14', "open":"close"]

**使用行列索引进行切片**

    data.iloc[:5, :3]
    
#### 1-3、使用ix组合索引（pandas 1.0 起已移除，仅作了解）
获取0～3行，'open', 'close', 'high', 'low'四列数据
    
    data.ix[0:4, ['open', 'close', 'high', 'low']]

**推荐使用loc和iloc**
注意loc的参数是行/列的标签（值）列表
    
    data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]

注意iloc的参数是位置索引（从0开始的整数）列表

    data.iloc[0:4, data.columns.get_indexer(['open', 'close', 'high', 'low'])]

#### 1-4、rename修改行名列名
#### 1-5、删除/插入一列数据 drop、insert
#### 1-6、数据预处理 df.info() df.describe() df.describe(include=object)
#### 1-7、获取最大的n条 df.nlargest ； 获取最小的n条 df.nsmallest()
#### 1-8、去重 df.drop_duplicates(subset=['columns'], keep='first')：subset 指定按哪些字段去重，keep 指定保留第一条还是最后一条（'first'、'last'）
#### 1-9、groupby后边统计数量用size，然后可以用sort_values排序

In [3]:
import pandas as pd
import numpy as np

In [None]:
### 1、修改行列索引
**不能单个修改，必须全部修改**

In [None]:
# Replace the whole row index at once: build one new label per row of `df`,
# the DataFrame whose index is reassigned in the next cell.
new_index = [f'同学-{i}' for i in range(df.shape[0])]
new_index

In [None]:
df.index = new_index
df

In [None]:
### 2、重设索引
reset_index(drop=False)
drop: 是否删除原来索引，默认False

In [None]:
# Reset to the default integer index (with the default drop=False the old
# index is kept as a column)
df.reset_index()

In [None]:
# drop=True discards the old index instead of keeping it as a column
df.reset_index(drop=True)

In [None]:
### 3、以某列值设为新的索引
- set_index(keys, drop=True)
- keys: 列索引名或列索引名称的列表
- drop: 当作新的索引，删除原来的列，默认True

In [None]:
# Build a small sales table, one (month, year, sale) record per row.
data_df = pd.DataFrame(
    [(1, 2012, 55), (4, 2014, 40), (7, 2013, 84), (10, 2014, 31)],
    columns=['month', 'year', 'sale'],
)
data_df

In [None]:
# Use the 'month' column as the new index
data_df.set_index('month')

In [None]:
# Pass a list of column names to index by both 'year' and 'month'
data_df.set_index(['year', 'month'])

In [3]:
# Column labels of the demo table.
columns = ['time', 'open', 'close', 'number', 'high', 'low']
# Row labels are 1-based: 1..10.
index = list(range(1, 11))
# Random integers drawn from [1, 6): one row per label, one column per name.
basic_data = np.random.randint(low=1, high=6, size=(len(index), len(columns)))

data = pd.DataFrame(data=basic_data, index=index, columns=columns)
data

Unnamed: 0,time,open,close,number,high,low
1,3,4,2,3,3,5
2,3,1,3,3,2,2
3,5,2,1,4,3,5
4,4,1,4,3,2,3
5,4,5,5,5,2,1
6,5,2,5,4,3,2
7,1,1,4,4,2,4
8,5,2,2,3,2,2
9,5,2,4,2,1,3
10,4,1,5,4,2,4


In [2]:
# Goal for the cells below: fetch the value at row label 3 in the 4th column ('number')

In [3]:
# Direct indexing: select the column first, then the row label
data['number'][3]

2

In [4]:
# loc takes labels (the index / column key values), not positions
data.loc[[3], ['number']]

Unnamed: 0,number
3,2


In [5]:
# iloc takes integer positions, counted from 0: row position 2, column position 3
data.iloc[[2], [3]]

Unnamed: 0,number
3,2


In [6]:
# ix (mixed label/position indexing) has been removed; use loc / iloc instead
data.index  # the row labels

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

In [7]:
data.loc[data.index[2:3], ['number']]

Unnamed: 0,number
3,2


In [8]:
data.iloc[[2], data.columns.get_indexer(['number'])]

Unnamed: 0,number
3,2


## 2、赋值操作
    
    data['close'] = 1
    data.close = 1

In [16]:
data

Unnamed: 0,time,open,close,number,high,low
1,3,4,10,3,4,1
2,2,5,10,1,4,3
3,3,3,10,2,5,2
4,4,2,10,4,5,5
5,1,1,10,4,4,2
6,4,4,10,1,1,4
7,2,2,10,1,1,4
8,5,1,10,1,5,3
9,2,1,10,1,3,2
10,1,1,10,1,5,3


In [25]:
# Set one cell with a single .loc call (row label 1, column 'time').
# The chained form data['time'][1] = 9 assigns into an intermediate Series:
# it triggers SettingWithCopyWarning and, with Copy-on-Write enabled, leaves
# `data` unchanged.
data.loc[1, 'time'] = 9
data

Unnamed: 0,time,open,close,number,high,low
1,9,4,10,3,4,1
2,2,5,10,1,4,3
3,3,3,10,2,5,2
4,4,2,10,4,5,5
5,1,1,10,4,4,2
6,4,4,10,1,1,4
7,2,2,10,1,1,4
8,5,1,10,1,5,3
9,2,1,10,1,3,2
10,1,1,10,1,5,3


In [9]:
data['close'] = 1
data

Unnamed: 0,time,open,close,number,high,low
1,3,4,1,3,4,1
2,2,5,1,1,4,3
3,3,3,1,2,5,2
4,4,2,1,4,5,5
5,1,1,1,4,4,2
6,4,4,1,1,1,4
7,2,2,1,1,1,4
8,5,1,1,1,5,3
9,2,1,1,1,3,2
10,1,1,1,1,5,3


In [10]:
# Attribute-style assignment: sets every value of the existing 'close' column to 10.
# NOTE(review): this form only targets columns that already exist; the bracket
# form data['close'] = 10 is the general one.
data.close = 10
data

Unnamed: 0,time,open,close,number,high,low
1,3,4,10,3,4,1
2,2,5,10,1,4,3
3,3,3,10,2,5,2
4,4,2,10,4,5,5
5,1,1,10,4,4,2
6,4,4,10,1,1,4
7,2,2,10,1,1,4
8,5,1,10,1,5,3
9,2,1,10,1,3,2
10,1,1,10,1,5,3


## 3、排序操作
      
内容排序
- by：根据哪列排序，可以多列，按顺序排列，如果前面列相同，用后面列排
- ascending：True升序，False降序, 默认升序 
    
         data.sort_values(by=, ascending=)
索引排序

         data.sort_index()

In [11]:
data.sort_values(by=['time','open'])

Unnamed: 0,time,open,close,number,high,low
5,1,1,10,4,4,2
10,1,1,10,1,5,3
9,2,1,10,1,3,2
7,2,2,10,1,1,4
2,2,5,10,1,4,3
3,3,3,10,2,5,2
1,3,4,10,3,4,1
4,4,2,10,4,5,5
6,4,4,10,1,1,4
8,5,1,10,1,5,3


In [12]:
data.sort_index()

Unnamed: 0,time,open,close,number,high,low
1,3,4,10,3,4,1
2,2,5,10,1,4,3
3,3,3,10,2,5,2
4,4,2,10,4,5,5
5,1,1,10,4,4,2
6,4,4,10,1,1,4
7,2,2,10,1,1,4
8,5,1,10,1,5,3
9,2,1,10,1,3,2
10,1,1,10,1,5,3


## 4、series排序

    series.sort_values(ascending=)
    series.sort_index()

In [13]:
df = data['number']
# Selecting a single column yields a Series (so `df` here is not a DataFrame)
print(type(df))
df

<class 'pandas.core.series.Series'>


1     3
2     1
3     2
4     4
5     4
6     1
7     1
8     1
9     1
10    1
Name: number, dtype: int64

In [14]:
df.sort_values()

2     1
6     1
7     1
8     1
9     1
10    1
3     2
1     3
4     4
5     4
Name: number, dtype: int64

In [15]:
df.sort_index()

1     3
2     1
3     2
4     4
5     4
6     1
7     1
8     1
9     1
10    1
Name: number, dtype: int64

## 数据序列化，比如生成的模型

In [1]:
import os

# Create the output directory used by the write cells below. exist_ok=True
# makes re-running this cell harmless, and plain Python does not depend on
# IPython's automagic shell commands.
os.makedirs('test_output', exist_ok=True)

In [4]:
# Load the movie dataset and keep its first rows as a small sample to write
# out in the following cells. NOTE: this rebinds `data`, replacing the demo
# table used earlier.
data = pd.read_csv('data/movie.csv')
head = data.head()

### 写

In [5]:
head.to_pickle('test_output/movie.pkl')

In [6]:
# No index=False here, so the row index is written as the first column
head.to_csv('test_output/movie.csv')

In [7]:
# Tab-separated variant; index=False keeps the row index out of the file
head.to_csv('test_output/movie.tsv', sep='\t', index=False)

In [9]:
head.to_excel('test_output/movie.xlsx')

### 读

In [10]:
pd.read_pickle('test_output/movie.pkl')

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [11]:
# The file was written with its row index, so the saved index comes back as an
# extra unnamed column; pass index_col=0 to read it back as the index instead.
pd.read_csv('test_output/movie.csv')

Unnamed: 0.1,Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,4,,Doug Walker,,,131.0,,Rob Walker,131.0,,...,,,,,,,12.0,7.1,,0


In [13]:
pd.read_csv('test_output/movie.tsv', sep='\t')

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [14]:
# As with the CSV written with its index, the saved index shows up as an extra
# unnamed column; pass index_col=0 to read it back as the index instead.
pd.read_excel('test_output/movie.xlsx')

Unnamed: 0.1,Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,0,Color,James Cameron,723.0,178.0,0,855.0,Joel David Moore,1000,760505847.0,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936,7.9,1.78,33000
1,1,Color,Gore Verbinski,302.0,169.0,563,1000.0,Orlando Bloom,40000,309404152.0,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000,7.1,2.35,0
2,2,Color,Sam Mendes,602.0,148.0,0,161.0,Rory Kinnear,11000,200074175.0,...,994.0,English,UK,PG-13,245000000.0,2015.0,393,6.8,2.35,85000
3,3,Color,Christopher Nolan,813.0,164.0,22000,23000.0,Christian Bale,27000,448130642.0,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000,8.5,2.35,164000
4,4,,Doug Walker,,,131,,Rob Walker,131,,...,,,,,,,12,7.1,,0


In [4]:
# Two rows by three columns; no labels are given, so pandas assigns default
# integer row and column labels.
df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]])
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [11]:
df.at[0, 0] = 9
df

Unnamed: 0,0,1,2
0,9,2,3
1,4,5,6


In [16]:
# Give every row and every column a readable 1-based label (row_1, col_1, ...).
df.index = [f'row_{i + 1}' for i, _ in enumerate(df.index)]
df.columns = [f'col_{i + 1}' for i, _ in enumerate(df.columns)]
df

Unnamed: 0,col_1,col_2,col_3
row_1,9,2,3
row_2,4,5,6


In [21]:
# at: label-based access to a single cell (row label, column label)
df.at['row_1', 'col_2'] = 10
df

Unnamed: 0,col_1,col_2,col_3
row_1,9,10,3
row_2,4,11,6


In [24]:
# loc: label-based access, used here with scalar labels to set one cell
df.loc['row_1', 'col_2'] = 12
df

Unnamed: 0,col_1,col_2,col_3
row_1,9,12,3
row_2,4,11,6


In [25]:
# iat: integer-position access to a single cell (row position, column position)
df.iat[1, 1] = 11
df

Unnamed: 0,col_1,col_2,col_3
row_1,9,12,3
row_2,4,11,6


In [26]:
# iloc: integer-position access, used here with scalar positions to set one cell
df.iloc[1, 1] = 99
df

Unnamed: 0,col_1,col_2,col_3
row_1,9,12,3
row_2,4,99,6


In [28]:
# Negate every element: unary minus is applied element-wise
fdf = -df
fdf

Unnamed: 0,col_1,col_2,col_3
row_1,-9,-12,-3
row_2,-4,-99,-6
