In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Plot configuration, applied before the style sheet is selected:
#   - 'font.sans-serif': use SimHei so Chinese labels are displayed properly
#   - 'axes.unicode_minus': turned off so the minus sign is displayed properly
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
plt.style.use('ggplot')

# Load the movie table used by every cell below.
df = pd.read_excel('./movie_data.xls')

# 查看

## 数据

In [2]:
df.head(1)  # first n rows (here n = 1)

Unnamed: 0,mid,name,release_year,country,type,myRate,myTag,href
0,4864908,影,2018,中国大陆,电影,4,"['古装', '动作']",https://movie.douban.com/subject/4864908/


In [3]:
df.sample(1) # n randomly chosen rows (here n = 1)

Unnamed: 0,mid,name,release_year,country,type,myRate,myTag,href
438,25908042,横冲直撞好莱坞,2015,中国大陆,电影,2,"['喜剧', '冒险']",https://movie.douban.com/subject/25908042/


In [4]:
df.tail(1)  # last n rows (here n = 1)

Unnamed: 0,mid,name,release_year,country,type,myRate,myTag,href
585,1972724,斯坦福监狱实验,2015,美国,电影,5,"['真实事件改编', '人性', '惊悚']",https://movie.douban.com/subject/1972724/


## 基本信息

In [5]:
# Print the frame's structural metadata, one labelled entry per line:
# row index, column labels, per-column dtypes and (rows, columns) shape.
for _label, _value in (
    ('索引: \t', df.index),
    ('列名: \t', list(df.columns)),
    ('各列类型: \n', df.dtypes),
    ('表格大小: \t', df.shape),
):
    print(_label, _value)

索引: 	 RangeIndex(start=0, stop=586, step=1)
列名: 	 ['mid', 'name', 'release_year', 'country', 'type', 'myRate', 'myTag', 'href']
各列类型: 
 mid              int64
name            object
release_year     int64
country         object
type            object
myRate           int64
myTag           object
href            object
dtype: object
表格大小: 	 (586, 8)


# 修改

## 类型转换

In [6]:
# Build a new frame whose myRate column is cast to float64; every other
# column keeps its dtype and `df` itself is not modified.
df1 = df.astype({'myRate': np.float64})
df1.dtypes

mid               int64
name             object
release_year      int64
country          object
type             object
myRate          float64
myTag            object
href             object
dtype: object

## 列别名

In [7]:
# `columns` accepts either a dict mapping old labels to new ones, or a
# function applied to each label. rename() returns a new frame, so `df`
# keeps its original column names.
df1 = df.rename(
    columns={
        'mid': 'movieId',
        'name': 'movieName',
        'release_year': 'releaseYear',
    }
)
df1.head(2)

Unnamed: 0,movieId,movieName,releaseYear,country,type,myRate,myTag,href
0,4864908,影,2018,中国大陆,电影,4,"['古装', '动作']",https://movie.douban.com/subject/4864908/
1,3878007,海王,2018,美国,电影,5,"['奇幻', '动作']",https://movie.douban.com/subject/3878007/


## 末尾插入一行

In [8]:
df1 = df.copy()
# The dict being inserted may omit some columns; any column it leaves out
# is filled with NaN in the new row.
insert_dict = {
    'mid': 111111,
    'name': '你好之华',
    'release_year': 2018,
    'country': '中国大陆',
    'type': '电影',
    'myRate': 5,
    'myTag': '[\'爱情\']',
    'href': 'https://movie.douban.com/subject/111111/',
}
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Wrap the dict in a one-row frame and concatenate instead;
# ignore_index=True renumbers the rows so the new one gets the next label.
df1 = pd.concat([df1, pd.DataFrame([insert_dict])], ignore_index=True)
df1.tail(2)

Unnamed: 0,mid,name,release_year,country,type,myRate,myTag,href
585,1972724,斯坦福监狱实验,2015,美国,电影,5,"['真实事件改编', '人性', '惊悚']",https://movie.douban.com/subject/1972724/
586,111111,你好之华,2018,中国大陆,电影,5,['爱情'],https://movie.douban.com/subject/111111/


In [9]:
df1 = df.copy()
# When inserting a list, its values must line up one-to-one with the columns.
insert_list = [
    111111,
    '你好之华',
    2018,
    '中国大陆',
    '电影',
    5,
    '[\'爱情\']',
    'https://movie.douban.com/subject/111111/',
]
# The current row count is used as the label of the new row, so assigning
# to it adds the list as one more row at the end.
df1.loc[df1.shape[0]] = insert_list
df1.tail(2)

Unnamed: 0,mid,name,release_year,country,type,myRate,myTag,href
585,1972724,斯坦福监狱实验,2015,美国,电影,5,"['真实事件改编', '人性', '惊悚']",https://movie.douban.com/subject/1972724/
586,111111,你好之华,2018,中国大陆,电影,5,['爱情'],https://movie.douban.com/subject/111111/


## 指定位置数据

In [10]:
# Work on a copy so the edits below do not touch `df`.
df1 = df.copy()
df1.loc[585, 'mid'] = 111111   # modify by label: row label 585, column 'mid'
df1.iloc[585, 2] = 2018        # modify by integer position: row 585, column 2 (release_year)
df1.iloc[585, 5:8] = [5, '[\'爱情\']', 'https://movie.douban.com/subject/111111/']  # bulk modify: columns at positions 5-7 (myRate, myTag, href)
df1.tail(2)

Unnamed: 0,mid,name,release_year,country,type,myRate,myTag,href
584,26384741,湮灭,2018,美国,电影,4,"['科幻', '悬疑']",https://movie.douban.com/subject/26384741/
585,111111,斯坦福监狱实验,2018,美国,电影,5,['爱情'],https://movie.douban.com/subject/111111/
