In [1]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Demo of DataFrame Operations

In [2]:
import bigframes.pandas as bpd

# Read the public BigQuery table `bigquery-public-data.baseball.schedules` into `df`.
df = bpd.read_gbq("bigquery-public-data.baseball.schedules")

### Select a subset of the DF

In [3]:
# Narrow the frame to the five columns used in the rest of the demo.
df = df[
    [
        "gameId",
        "year",
        "homeTeamName",
        "awayTeamName",
        "duration_minutes",
    ]
]

In [4]:
# Displaying the DataFrame is where real execution starts, so this cell may take a while.
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


In [5]:
# (number of rows, number of columns)
df.shape

(2431, 5)

### Retrieve properties of the DF.

In [6]:
# Data type of each column.
df.dtypes

gameId              string[pyarrow]
year                          Int64
homeTeamName        string[pyarrow]
awayTeamName        string[pyarrow]
duration_minutes              Int64
dtype: object

In [7]:
# Column labels of the DataFrame.
df.columns

Index(['gameId', 'year', 'homeTeamName', 'awayTeamName', 'duration_minutes'], dtype='object')

### Add a new column

In [8]:
# Attach a "title" column built as "<home team> vs <away team>" for every game.
df = df.assign(
    title=df["homeTeamName"] + " vs " + df["awayTeamName"],
)
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes,title
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191,Rockies vs Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174,Phillies vs Dodgers
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168,White Sox vs Braves
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255,Mets vs Twins
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167,Pirates vs Brewers
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178,Yankees vs Angels
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214,Diamondbacks vs Phillies
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187,Red Sox vs Athletics
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165,Red Sox vs Twins
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157,Orioles vs Royals


### Rename and drop a column

In [9]:
# Rename the "title" column to "headline"; all other columns keep their names.
df = df.rename(columns={"title": "headline"})
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes,headline
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191,Rockies vs Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174,Phillies vs Dodgers
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168,White Sox vs Braves
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255,Mets vs Twins
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167,Pirates vs Brewers
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178,Yankees vs Angels
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214,Diamondbacks vs Phillies
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187,Red Sox vs Athletics
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165,Red Sox vs Twins
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157,Orioles vs Royals


In [10]:
# Remove the "headline" column again, leaving the original five columns.
df = df.drop(columns="headline")

In [11]:
# Show the frame after the column was dropped.
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


### Drop NaN values

In [12]:
# Drop the rows that contain missing values (dropna called with its default arguments).
df = df.dropna()
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


### Join two DFs

In [13]:
# Left-hand frame for the joins below: game id and home team for every game.
df1 = df[["gameId", "homeTeamName"]]
df1

Unnamed: 0,gameId,homeTeamName
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,Rockies
1,4f086577-3575-4b8c-a7c9-e6a05d674367,Phillies
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,White Sox
3,8606d487-9de5-436b-afaf-02773cf900f0,Mets
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,Pirates
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,Yankees
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,Diamondbacks
7,4119beed-2d02-4d80-bf73-7c4a5099de37,Red Sox
8,32a3546f-dde0-428f-aee2-8983d72ea67e,Red Sox
9,9c736289-013b-4324-bf65-e764b35f921e,Orioles


In [14]:
# Right-hand frame for the joins below: game id and away team for the first two games only.
df2 = df[["gameId", "awayTeamName"]].head(2)
df2

Unnamed: 0,gameId,awayTeamName
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,Dodgers


In [15]:
# Inner join on gameId: only games present in both df1 and df2 are kept.
df1.merge(df2, on="gameId", how="inner")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,Rockies,Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,Phillies,Dodgers


In [16]:
# Outer join on gameId: games from either frame are kept; awayTeamName is empty where df2 has no match.
df1.merge(df2, on="gameId", how="outer")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,Rockies,Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,Phillies,Dodgers
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,White Sox,
3,8606d487-9de5-436b-afaf-02773cf900f0,Mets,
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,Pirates,
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,Yankees,
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,Diamondbacks,
7,4119beed-2d02-4d80-bf73-7c4a5099de37,Red Sox,
8,32a3546f-dde0-428f-aee2-8983d72ea67e,Red Sox,
9,9c736289-013b-4324-bf65-e764b35f921e,Orioles,


In [17]:
# Left join on gameId: every row of df1 is kept; awayTeamName is empty where df2 has no match.
df1.merge(df2, on="gameId", how="left")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,Rockies,Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,Phillies,Dodgers
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,White Sox,
3,8606d487-9de5-436b-afaf-02773cf900f0,Mets,
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,Pirates,
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,Yankees,
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,Diamondbacks,
7,4119beed-2d02-4d80-bf73-7c4a5099de37,Red Sox,
8,32a3546f-dde0-428f-aee2-8983d72ea67e,Red Sox,
9,9c736289-013b-4324-bf65-e764b35f921e,Orioles,


In [18]:
# Right join on gameId: every row of df2 is kept, with the matching homeTeamName from df1.
df1.merge(df2, on="gameId", how="right")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,Rockies,Mets
1,4f086577-3575-4b8c-a7c9-e6a05d674367,Phillies,Dodgers


### Concat two DFs

In [19]:
# Concatenate df with itself.
bpd.concat([df, df])

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


### Access a column as an attribute

In [20]:
# Attribute-style access to the homeTeamName column.
df.homeTeamName

0          Rockies
1         Phillies
2        White Sox
3             Mets
4          Pirates
5          Yankees
6     Diamondbacks
7          Red Sox
8          Red Sox
9          Orioles
10    Diamondbacks
11            Cubs
12         Orioles
13           Twins
14         Yankees
15          Tigers
16            Mets
17            Mets
18         Rangers
19           Twins
20          Astros
21          Astros
22         Red Sox
23       Nationals
24       Nationals
Name: homeTeamName, dtype: string

### Retrieve SQL

In [21]:
# `.sql` holds the SQL text for the merged DataFrame; print it so it renders as plain text.
print(df1.merge(df2, on="gameId", how="inner").sql)

SELECT
  `t4`.`bfuid_col_18` AS `gameId`,
  `t4`.`bfuid_col_20` AS `homeTeamName`,
  `t5`.`bfuid_col_21` AS `awayTeamName`
FROM (
  SELECT
    `t0`.`bfuid_col_18`,
    `t0`.`bfuid_col_20`
  FROM `bigframes-dev._76f0f906c2e04e83c3496619541347a5922c80ee._1492e3bc_b2fc_40fe_b1a0_7f0335e759e6_bqdf_6d56c24f-bccc-4661-80a2-347fa9731384` AS `t0`
) AS `t4`
INNER JOIN (
  SELECT
    `t1`.`bfuid_col_18` AS `bfuid_col_141`,
    `t1`.`bfuid_col_21`
  FROM `bigframes-dev._76f0f906c2e04e83c3496619541347a5922c80ee._1492e3bc_b2fc_40fe_b1a0_7f0335e759e6_bqdf_c9ed6466-9c5c-42e4-8385-36f07073dcf2` AS `t1`
) AS `t5`
  ON COALESCE(`t4`.`bfuid_col_18`, '0') = COALESCE(`t5`.`bfuid_col_141`, '0')
  AND COALESCE(`t4`.`bfuid_col_18`, '1') = COALESCE(`t5`.`bfuid_col_141`, '1')


### Special Column Names

In [22]:
# Column names may contain whitespace (here, several consecutive spaces).
df.rename(columns={"homeTeamName": "HOME    TEAM"})

Unnamed: 0,gameId,year,HOME TEAM,awayTeamName,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


In [23]:
# Column names may contain punctuation and other special characters.
df.rename(columns={"homeTeamName": "homeTeam!@#$%col"})

Unnamed: 0,gameId,year,homeTeam!@#$%col,awayTeamName,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


In [24]:
# Two columns may share the same name: both team columns are renamed to "team".
df3 = df.rename(columns={"homeTeamName": "team", "awayTeamName": "team"})
df3

Unnamed: 0,gameId,year,team,team.1,duration_minutes
0,7283fe5e-7b8b-4d3a-b729-476cda6e4c45,2016,Rockies,Mets,191
1,4f086577-3575-4b8c-a7c9-e6a05d674367,2016,Phillies,Dodgers,174
2,28cac4ff-bec0-4ae6-94f2-0dbc763afff2,2016,White Sox,Braves,168
3,8606d487-9de5-436b-afaf-02773cf900f0,2016,Mets,Twins,255
4,fabb7d45-7929-4f83-ae04-0e6c308eaf7b,2016,Pirates,Brewers,167
5,d12c179f-f52e-401d-8453-58bfca1d0d4f,2016,Yankees,Angels,178
6,62b6c3a3-4d0e-4c3a-b1fe-b0a6bcbbcbb1,2016,Diamondbacks,Phillies,214
7,4119beed-2d02-4d80-bf73-7c4a5099de37,2016,Red Sox,Athletics,187
8,32a3546f-dde0-428f-aee2-8983d72ea67e,2016,Red Sox,Twins,165
9,9c736289-013b-4324-bf65-e764b35f921e,2016,Orioles,Royals,157


In [25]:
# Selecting a duplicated column name returns every column carrying that label.
df3["team"]

Unnamed: 0,team,team.1
0,Rockies,Mets
1,Phillies,Dodgers
2,White Sox,Braves
3,Mets,Twins
4,Pirates,Brewers
5,Yankees,Angels
6,Diamondbacks,Phillies
7,Red Sox,Athletics
8,Red Sox,Twins
9,Orioles,Royals


### Binary Operation

In [26]:
# Keep only the two integer columns to demonstrate arithmetic on a whole frame.
df4 = df[["year", "duration_minutes"]]
df4

Unnamed: 0,year,duration_minutes
0,2016,191
1,2016,174
2,2016,168
3,2016,255
4,2016,167
5,2016,178
6,2016,214
7,2016,187
8,2016,165
9,2016,157


In [27]:
# Adding a scalar applies the operation to every value in every column.
df4 + 1

Unnamed: 0,year,duration_minutes
0,2017,192
1,2017,175
2,2017,169
3,2017,256
4,2017,168
5,2017,179
6,2017,215
7,2017,188
8,2017,166
9,2017,158


### Download the result as an (in-memory) pandas DataFrame

In [28]:
# Download the result of df4 into a local, in-memory pandas DataFrame.
dfp = df4.to_pandas()
dfp

Unnamed: 0,year,duration_minutes
0,2016,191
1,2016,174
2,2016,168
3,2016,255
4,2016,167
...,...,...
2426,2016,238
2427,2016,205
2428,2016,233
2429,2016,172
