In [5]:
import numpy as np
import pandas as pd
import sqlite3

# 1. 数据组合 - concat() 函数

In [7]:
# 1. Load the three CSV sources; each file becomes its own DataFrame
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/concat_1.csv",
        "data/concat_2.csv",
        "data/concat_3.csv",
    )
)
# Echo the first frame as the cell output
df1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3


In [8]:
# 2. pd.concat() joins DataFrame objects together
# Usage: pd.concat([df_a, df_b, ...], axis=...)
# Detail: row-wise concatenation aligns the frames on column names;
#         column-wise concatenation aligns them on row-index labels.
# Only the last expression of a cell is echoed automatically, so the row-wise
# result is shown explicitly with display() (the IPython/Jupyter built-in);
# otherwise it would be computed and silently discarded.
display(pd.concat([df1, df2, df3], axis='index'))   # stack the frames row-wise
pd.concat([df1, df2, df3], axis='columns')          # place the frames side by side

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,a0,b0,c0,d0,a4,b4,c4,d4,a8,b8,c8,d8
1,a1,b1,c1,d1,a5,b5,c5,d5,a9,b9,c9,d9
2,a2,b2,c2,d2,a6,b6,c6,d6,a10,b10,c10,d10
3,a3,b3,c3,d3,a7,b7,c7,d7,a11,b11,c11,d11


In [None]:
# 3. Concatenating a DataFrame with a Series
# Detail: concat() appends rows by default, but an unnamed Series has no column
# label that matches df1's columns, so its values land in a new column and
# every cell without data is filled with NaN.
# pd.concat([df1,pd.Series(['a','b','c'])])

In [10]:
# 4. Append ['n1','n2','n3','n4'] as one extra row after df2.
# The row is wrapped in a single-row DataFrame that reuses df2's column labels
# so that the row-wise concat lines the values up under the existing columns.
df5 = pd.DataFrame(
    data=[['n1', 'n2', 'n3', 'n4']],
    columns=df2.columns,
)
# ignore_index=True discards the incoming row labels and renumbers from 0
pd.concat(objs=[df2, df5], ignore_index=True)


Unnamed: 0,A,B,C,D
0,a4,b4,c4,d4
1,a5,b5,c5,d5
2,a6,b6,c6,d6
3,a7,b7,c7,d7
4,n1,n2,n3,n4


In [11]:
# 5. Demo: DataFrame.append(). Note: this method is deprecated and has been
#    removed in newer pandas releases; use pd.concat() instead.
# df1.append(df2,ignore_index=True)

# Appending a single row supplied as a Python dict
# (NOTE(review): the old API reportedly needed ignore_index=True for a dict — confirm)
# df1.append({"A":"你"})

In [68]:
# 6. Add columns: put df5 to the right of df2, aligned on row-index labels
pd.concat(objs=[df2, df5], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,a4,b4,c4,d4,n1,n2,n3,n4
1,a5,b5,c5,d5,,,,
2,a6,b6,c6,d6,,,,
3,a7,b7,c7,d7,,,,


## 2. 数据组合 - merge() 函数
### 2.1 一对一合并

In [70]:
# Background: concat vs. merge vs. join
# - pd.concat() takes join="inner" / join="outer" to choose how the frames
#   are aligned when they are concatenated.
# - Databases combine two or more tables on the values they share (a JOIN);
#   DataFrames support the same kind of operation.
# - pandas offers DataFrame.join() as well as pd.merge() / DataFrame.merge()
#   for this (there is no top-level pd.join function).
# - merge() is the more flexible option; join() is worth considering when the
#   DataFrames should be combined on their row index.

In [90]:
# Read the source tables from the SQLite database file
# NOTE(review): the connection is not closed in this cell — close it once no
# later cell needs `data_db`.
data_db = sqlite3.connect('data/chinook.db')
tracks, genres = (
    pd.read_sql_query(sql=query, con=data_db)
    for query in ("select * from tracks;", "select * from genres")
)
# Echo the genre table as the cell output
genres

Unnamed: 0,GenreId,Name
0,1,Rock
1,2,Jazz
2,3,Metal
3,4,Alternative & Punk
4,5,Rock And Roll
5,6,Blues
6,7,Latin
7,8,Reggae
8,9,Pop
9,10,Soundtrack


In [87]:
# Take a slice of the tracks (songs) table in which no GenreId value repeats;
# the hand-picked row labels below select one track for each of ten genres
tracks_subset = tracks.loc[
    [0, 62, 76, 98, 110, 193, 204, 281, 322, 359],
    :,
]
tracks_subset

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
0,1,For Those About To Rock (We Salute You),1,1,1,"Angus Young, Malcolm Young, Brian Johnson",343719,11170334,0.99
62,63,Desafinado,8,1,2,,185338,5990473,0.99
76,77,Enter Sandman,9,1,3,Apocalyptica,221701,7286305,0.99
98,99,Your Time Has Come,11,1,4,"Cornell, Commerford, Morello, Wilk",255529,8273592,0.99
110,111,Money,12,1,5,"Berry Gordy, Jr./Janie Bradford",147591,2365897,0.99
193,194,First Time I Met The Blues,20,1,6,Eurreal Montgomery,140434,4604995,0.99
204,205,Jorge Da Capadócia,21,1,7,Jorge Ben,177397,5842196,0.99
281,282,Girassol,26,1,8,Bino Farias/Da Gama/Lazão/Pedro Luis/Toni Garrido,249808,8327676,0.99
322,323,"Dig-Dig, Lambe-Lambe (Ao Vivo)",29,1,9,Cassiano Costa/Cintia Maviane/J.F./Lucas Costa,205479,6892516,0.99
359,360,Vai-Vai 2001,32,1,10,,276349,9402241,0.99


In [91]:
# 3. Join tracks_subset (subset of the songs table) with genres (the genre table) via merge()
# Usage: left_df.merge(right_df, on="key column", how="join type")
# Detail 1: when both frames use the same key column name, pass it as on=...;
#           when the names differ, use left_on="left column" and right_on="right column"
# how="left" is a left join: every row of the left frame plus the matches from the right
pd.merge(
    genres,
    tracks_subset[['TrackId', "GenreId", 'Milliseconds']],
    how="left",
    on="GenreId",
)

Unnamed: 0,GenreId,Name,TrackId,Milliseconds
0,1,Rock,1.0,343719.0
1,2,Jazz,63.0,185338.0
2,3,Metal,77.0,221701.0
3,4,Alternative & Punk,99.0,255529.0
4,5,Rock And Roll,111.0,147591.0
5,6,Blues,194.0,140434.0
6,7,Latin,205.0,177397.0
7,8,Reggae,282.0,249808.0
8,9,Pop,323.0,205479.0
9,10,Soundtrack,360.0,276349.0


In [96]:
# how="right" is a right join: every row of the right frame plus the matches from the left
pd.merge(
    genres,
    tracks_subset[['TrackId', "GenreId", 'Milliseconds']],
    how="right",
    on="GenreId",
)

Unnamed: 0,GenreId,Name,TrackId,Milliseconds
0,1,Rock,1,343719
1,2,Jazz,63,185338
2,3,Metal,77,221701
3,4,Alternative & Punk,99,255529
4,5,Rock And Roll,111,147591
5,6,Blues,194,140434
6,7,Latin,205,177397
7,8,Reggae,282,249808
8,9,Pop,323,205479
9,10,Soundtrack,360,276349


In [95]:
# how="inner" is an inner join: only the keys present in both frames
pd.merge(
    genres,
    tracks_subset[['TrackId', "GenreId", 'Milliseconds']],
    how="inner",
    on="GenreId",
)

Unnamed: 0,GenreId,Name,TrackId,Milliseconds
0,1,Rock,1,343719
1,2,Jazz,63,185338
2,3,Metal,77,221701
3,4,Alternative & Punk,99,255529
4,5,Rock And Roll,111,147591
5,6,Blues,194,140434
6,7,Latin,205,177397
7,8,Reggae,282,249808
8,9,Pop,323,205479
9,10,Soundtrack,360,276349


In [99]:
# Detail 2: when both frames have a column with the same name, merge() tells
# them apart with suffixes; the default suffixes=("_x", "_y") marks the left
# and the right frame respectively.
# how="outer" is a full outer join: all rows from both frames.
# display() is needed here because only a cell's final expression is echoed;
# without it this default-suffix result would be computed and thrown away.
display(genres.merge(tracks_subset, on="GenreId", how="outer"))

# The suffixes used for clashing column names can be customised
genres.merge(tracks_subset, on="GenreId", how="outer", suffixes=('_左表', '_右表'))

Unnamed: 0,GenreId,Name_左表,TrackId,Name_右表,AlbumId,MediaTypeId,Composer,Milliseconds,Bytes,UnitPrice
0,1,Rock,1.0,For Those About To Rock (We Salute You),1.0,1.0,"Angus Young, Malcolm Young, Brian Johnson",343719.0,11170334.0,0.99
1,2,Jazz,63.0,Desafinado,8.0,1.0,,185338.0,5990473.0,0.99
2,3,Metal,77.0,Enter Sandman,9.0,1.0,Apocalyptica,221701.0,7286305.0,0.99
3,4,Alternative & Punk,99.0,Your Time Has Come,11.0,1.0,"Cornell, Commerford, Morello, Wilk",255529.0,8273592.0,0.99
4,5,Rock And Roll,111.0,Money,12.0,1.0,"Berry Gordy, Jr./Janie Bradford",147591.0,2365897.0,0.99
5,6,Blues,194.0,First Time I Met The Blues,20.0,1.0,Eurreal Montgomery,140434.0,4604995.0,0.99
6,7,Latin,205.0,Jorge Da Capadócia,21.0,1.0,Jorge Ben,177397.0,5842196.0,0.99
7,8,Reggae,282.0,Girassol,26.0,1.0,Bino Farias/Da Gama/Lazão/Pedro Luis/Toni Garrido,249808.0,8327676.0,0.99
8,9,Pop,323.0,"Dig-Dig, Lambe-Lambe (Ao Vivo)",29.0,1.0,Cassiano Costa/Cintia Maviane/J.F./Lucas Costa,205479.0,6892516.0,0.99
9,10,Soundtrack,360.0,Vai-Vai 2001,32.0,1.0,,276349.0,9402241.0,0.99


### 2.2 多对一合并

In [129]:
# Goal: average track duration for each genre
# 1. Attach every track to its genre (left join from genres on GenreId)
gener_track = pd.merge(genres, tracks, how="left", on="GenreId")
# 2. Group by genre id and genre name, then average the duration in milliseconds
#    ("Name_x" is the genre name: merge suffixed the clashing "Name" columns)
tmp_series = gener_track.groupby(by=["GenreId", "Name_x"])['Milliseconds'].mean()

# 3. Convert the millisecond means to timedeltas (durations, not dates),
#    truncate them to whole seconds, and list them from shortest to longest
(
    pd.to_timedelta(tmp_series, unit='ms')
    .dt.floor('s')
    .sort_values()
)


GenreId  Name_x            
5        Rock And Roll        0 days 00:02:14
25       Opera                0 days 00:02:54
17       Hip Hop/Rap          0 days 00:02:58
12       Easy Listening       0 days 00:03:09
11       Bossa Nova           0 days 00:03:39
14       R&B/Soul             0 days 00:03:40
16       World                0 days 00:03:44
9        Pop                  0 days 00:03:49
7        Latin                0 days 00:03:52
4        Alternative & Punk   0 days 00:03:54
10       Soundtrack           0 days 00:04:04
8        Reggae               0 days 00:04:07
23       Alternative          0 days 00:04:24
6        Blues                0 days 00:04:30
1        Rock                 0 days 00:04:43
2        Jazz                 0 days 00:04:51
24       Classical            0 days 00:04:53
13       Heavy Metal          0 days 00:04:57
15       Electronica/Dance    0 days 00:05:02
3        Metal                0 days 00:05:09
22       Comedy               0 days 00:26:25
19    