In [3]:
from pathlib import Path

import numpy as np
import pandas as pd


# 1.DataFrame的常用属性

In [None]:
# Load the scientists data set into a DataFrame and display it.
df = pd.read_csv('scientists.csv')
df

In [None]:
# Walk through the most commonly used DataFrame attributes and print each one.
for attribute in (
    'ndim',     # number of axes (dimensions)
    'shape',    # (rows, columns)
    'size',     # total number of elements
    'index',    # row index
    'columns',  # column names
):
    print(getattr(df, attribute))

# 2. DataFrame对象的常用函数

In [None]:
# Number of rows.
row_count = len(df)
print(row_count)

# Previews: first 5 rows (default), first 3 rows, last 5 rows (default), last 2 rows.
for preview in (df.head(), df.head(n=3), df.tail(), df.tail(n=2)):
    print(preview)

# keys() returns the column names.
print(df.keys())


In [None]:
# Per-column summary (dtype, non-null count, memory usage).
# info() prints the report itself and returns None, so it must not be wrapped
# in print() -- that would add a stray "None" line to the output.
df.info()


In [None]:
# Default summary statistics: only the numeric columns are described.
numeric_summary = df.describe()
# Summary of everything except the integer and float columns.
non_numeric_summary = df.describe(exclude=['int','float'])
# Summary of every column, whatever its dtype.
full_summary = df.describe(include='all')

for summary in (numeric_summary, non_numeric_summary, full_summary):
    print(summary)

In [None]:
# Mean and maximum of the Age column.
# Only the last expression of a cell is displayed, so both statistics are
# computed in a single call instead of two separate statements (the first of
# which would be evaluated and silently discarded).
df['Age'].agg(['mean', 'max'])

In [None]:
df.count()  # number of non-null values in each column

# 3.DataFrame对象的布尔索引操作

In [None]:
# Goal: find the movies whose duration is longer than the average duration.
# Step 1: read the source file into a DataFrame and preview the first rows.
movie_df = pd.read_csv('movie.csv')
movie_df.head()

In [None]:
# Build a boolean mask (True where the duration exceeds the mean duration)
# and use it to keep only the matching rows.
is_longer_than_average = movie_df['duration'] > movie_df['duration'].mean()
movie_df[is_longer_than_average]

In [None]:
# A hand-written list of booleans can be used as a row mask as well:
# one flag per row of the five-row preview, True meaning "keep this row".
first_five = movie_df.head()
keep_row = [True, False, True, True, False]
first_five[keep_row]

# 4.DataFrame对象的计算

In [None]:
# df['Age'] is a Series; arithmetic with a scalar is applied to every element,
# so each age is multiplied by 2.
df['Age']*2

In [None]:
# Arithmetic between two DataFrames works element by element on matching
# labels; positions whose labels are missing from one operand become NaN.
doubled = df + df
print(doubled)

# Only the first four rows exist on the left-hand side, so every other row of
# the result is filled with NaN.
first_four_rows = df[:4]
first_four_rows + df

# 5. 更改Series和DataFrame对象的行索引，列名

## 5.1 读取文件后，设置行索引

In [None]:
# 1. Read the source file into a DataFrame (default integer row index).
movie = pd.read_csv('movie.csv')
movie.head()

In [None]:
# 2. Use movie_title as the row index.
# Most pandas methods work on a copy of the data and return that copy.
# With the default inplace=False the original is left untouched; passing
# inplace=True modifies the original DataFrame instead.
# The returned copy is not assigned here, so movie.head() below still shows
# the default integer index.
movie.set_index('movie_title')
movie.head()

In [None]:
# Same call with inplace=True: movie itself is modified, so the preview now
# shows movie_title as the row index.
movie.set_index('movie_title',inplace = True)
movie.head()

## 5.2 读取文件时设置行索引

In [None]:
# index_col makes movie_title the row index directly while the file is read.
movie2 = pd.read_csv('movie.csv', index_col='movie_title' )
movie2.head()

## 5.3 取消设置的行索引，恢复为系统自动提供的默认整数索引（0~n-1）

In [None]:
# Move movie_title back into a regular column and restore the default integer index.
# Do not run this cell repeatedly: every extra run inserts the current index
# as yet another column.
movie2.reset_index(inplace= True)
movie2.head()

## 5.4 修改DataFrame对象的行索引和列名

In [None]:
# 1. Read the source file into a DataFrame, using movie_title as the row index.
movie = pd.read_csv('movie.csv',index_col='movie_title')
movie.head()

### 5.4.1 rename()直接修改

In [None]:
# Preview the labels that are about to be renamed. Both previews are printed
# because a bare expression in the middle of a cell is never displayed, and
# both read from `movie` (the frame indexed by movie_title), not `movie2`.
# First five row-index labels (movie titles).
print(movie.index[:5])
# First five column names.
print(movie.columns[:5])

# Mapping of old label -> new label for the row index and for the columns.
idx_name = {'Avatar':'阿凡达',"Pirates of the Caribbean: At World's End": '世界的尽头'}
col_name = {'color':'颜色','director_name':'导演名'}
# rename() replaces only the labels listed in the mappings; inplace=True
# applies the change to movie itself.
movie.rename(index=idx_name, columns=col_name,inplace=True)
movie.head()

### 5.4.2 将index和column属性提取出来，修改之后再放回去

In [None]:
# Alternative to rename(): pull the labels out as plain Python lists,
# edit the lists by position, then assign them back.
idx_list = list(movie.index)
col_list = list(movie.columns)

# New labels for the 1st and 3rd rows and for the 1st and 2nd columns.
idx_list[0], idx_list[2] = '阿凡达', '007幽灵'
col_list[0], col_list[1] = '颜色', '导演'

# Assigning a full list of labels replaces the index / the column names.
movie.columns = col_list
movie.index = idx_list
movie.head()

## 5.5 添加、删除、插入列

In [None]:
# New constant column: 0 = not seen yet, 1 = seen.
movie['has_seen'] = 0

# New derived column: director likes plus the likes of the three lead actors,
# accumulated one column at a time in the order director, actor 3, actor 2, actor 1.
total_likes = movie['director_facebook_likes']
for actor_column in ('actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes'):
    total_likes = total_likes + movie[actor_column]
movie['dafb'] = total_likes

# Inspect the result.
movie.head()


In [None]:
# Remove the has_seen column from movie itself.
# columns='has_seen' is the keyword form of drop('has_seen', axis=1) /
# drop('has_seen', axis='columns').
movie.drop(columns='has_seen', inplace=True)
movie.head()

In [None]:
# profit = gross - budget, placed as the second column (position 1).
# insert() has no inplace parameter: it always modifies the DataFrame it is called on.
profit = movie['gross'] - movie['budget']
movie.insert(1, 'profit', profit)
movie.head()

# 6.导入和导出数据

## 6.1 导出数据

In [29]:
# The export directory must exist before anything is written into it; create
# it here so the cell also works on a fresh checkout.
Path('output').mkdir(parents=True, exist_ok=True)

df = pd.read_csv('scientists.csv')
# Keep only the scientists whose age is above the average age.
newdf = df[df.Age>df.Age.mean()]
print(newdf)
# DataFrames are exported with df.to_<format>(path).
# Pickle is a convenient format for intermediate data.
newdf.to_pickle('output/scientist.pkl')


                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


In [35]:
# Excel export. The import cell reads output/scientist.xlsx back, so the file
# has to be written here for the notebook to run from a fresh kernel.
# index=False keeps the row index out of the file; without it the index is
# exported as an extra data column.
newdf.to_excel('output/scientist.xlsx', index=False, sheet_name='ai20')
# Values separated by commas -> CSV.
newdf.to_csv('output/scientist.csv',index=False)
# Values separated by tabs -> TSV.
newdf.to_csv('output/scientist.tsv',index=False, sep='\t')

In [41]:
# Read the exported files back in.
# The pickle can be restored with pd.read_pickle('output/scientist.pkl')
# (only unpickle files you trust).
excel_df = pd.read_excel('output/scientist.xlsx')
csv_df = pd.read_csv('output/scientist.csv')
tsv_df = pd.read_csv('output/scientist.tsv',sep='\t')

# Only the last expression of a cell is displayed, so the first two frames
# are printed explicitly instead of being read and thrown away.
print(excel_df)
print(csv_df)
tsv_df

Unnamed: 0,Name,Born,Died,Age,Occupation
0,William Gosset,1876-06-13,1937-10-16,61,Statistician
1,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
2,Marie Curie,1867-11-07,1934-07-04,66,Chemist
3,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician
