# Demo to Show DataFrame Operations

In [1]:
# Prerequisite: on the machine running Jupyter, authenticate with gcloud
# before executing this notebook:
#
#     gcloud auth application-default login

import bigframes.pandas as bpd

# Set the BigQuery location option to "US" before any table is read.
bpd.options.bigquery.location = "US"

In [2]:
# Load the public baseball schedules table through bigframes.pandas.
schedules_table = "bigquery-public-data.baseball.schedules"
df = bpd.read_gbq(schedules_table)

### Select a subset of columns

In [3]:
# Narrow the frame to the five columns the following cells work with.
selected_columns = [
    "gameId",
    "year",
    "homeTeamName",
    "awayTeamName",
    "duration_minutes",
]
df = df[selected_columns]

In [4]:
# Displaying the frame is where real execution starts; it may take a while.
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


In [5]:
# (rows, columns) of the selected frame.
df.shape

(2431, 5)

### Retrieve properties of the DF

In [6]:
# Data type of each column.
df.dtypes

gameId              string[pyarrow]
year                          Int64
homeTeamName        string[pyarrow]
awayTeamName        string[pyarrow]
duration_minutes              Int64
dtype: object

In [7]:
# Column labels, returned as an Index.
df.columns

Index(['gameId', 'year', 'homeTeamName', 'awayTeamName', 'duration_minutes'], dtype='object')

### Add a new column

In [8]:
# Build a "<home> vs <away>" string column and attach it as `title`.
home_team = df["homeTeamName"]
away_team = df["awayTeamName"]
df = df.assign(title=home_team + " vs " + away_team)
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes,title
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187,Marlins vs Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189,Marlins vs Cubs
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165,Braves vs Cubs
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222,Braves vs Cubs
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164,Phillies vs Cubs
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201,Diamondbacks vs Cubs
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173,Athletics vs Cubs
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176,Athletics vs Cubs
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180,Rockies vs Cubs
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157,Cardinals vs Cubs


### Rename and drop a column

In [9]:
# Rename the `title` column to `headline`, then display the result.
column_renames = {"title": "headline"}
df = df.rename(columns=column_renames)
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes,headline
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187,Marlins vs Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189,Marlins vs Cubs
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165,Braves vs Cubs
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222,Braves vs Cubs
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164,Phillies vs Cubs
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201,Diamondbacks vs Cubs
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173,Athletics vs Cubs
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176,Athletics vs Cubs
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180,Rockies vs Cubs
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157,Cardinals vs Cubs


In [10]:
# Remove the `headline` column again, leaving the five selected columns.
# NOTE(review): re-running this cell alone presumably fails because
# `headline` is already gone — confirm before relying on re-execution.
df = df.drop(columns="headline")

In [11]:
# Display the frame to confirm `headline` is no longer present.
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


### Drop NaN values

In [12]:
# Drop rows with missing values (dropna() defaults) and rebind df to the
# result; later cells use this filtered frame.
df = df.dropna()
df

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


### Join two DFs

In [13]:
# Left side of the joins below: game id plus home team name.
home_columns = ["gameId", "homeTeamName"]
df1 = df[home_columns]
df1

Unnamed: 0,gameId,homeTeamName
0,e14b6493-9e7f-404f-840a-8a680cc364bf,Marlins
1,1f32b347-cbcb-4c31-a145-0e685306d168,Marlins
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,Braves
3,8fbec734-a15a-42ab-8d51-60790de7750b,Braves
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,Phillies
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,Diamondbacks
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,Athletics
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,Athletics
8,d977367c-cf0c-4687-95a0-eb4542efcb01,Rockies
9,a87070ff-1084-43ca-a7ba-69278f93ecba,Cardinals


In [14]:
# Right side of the joins below: game id plus away team name, limited to
# the first two rows so the join types give visibly different results.
away_columns = ["gameId", "awayTeamName"]
df2 = df[away_columns].head(2)
df2

Unnamed: 0,gameId,awayTeamName
0,e14b6493-9e7f-404f-840a-8a680cc364bf,Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,Cubs


In [15]:
# Inner join on gameId: only games present in both df1 and df2 are kept.
df1.merge(df2, on="gameId", how="inner")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,e14b6493-9e7f-404f-840a-8a680cc364bf,Marlins,Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,Marlins,Cubs


In [16]:
# Outer join on gameId: games from either frame are kept; awayTeamName is
# missing for games that have no match in df2.
df1.merge(df2, on="gameId", how="outer")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,e14b6493-9e7f-404f-840a-8a680cc364bf,Marlins,Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,Marlins,Cubs
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,Braves,
3,8fbec734-a15a-42ab-8d51-60790de7750b,Braves,
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,Phillies,
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,Diamondbacks,
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,Athletics,
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,Athletics,
8,d977367c-cf0c-4687-95a0-eb4542efcb01,Rockies,
9,a87070ff-1084-43ca-a7ba-69278f93ecba,Cardinals,


In [17]:
# Left join on gameId: every df1 row is kept; awayTeamName is missing where
# df2 has no matching game.
df1.merge(df2, on="gameId", how="left")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,e14b6493-9e7f-404f-840a-8a680cc364bf,Marlins,Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,Marlins,Cubs
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,Braves,
3,8fbec734-a15a-42ab-8d51-60790de7750b,Braves,
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,Phillies,
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,Diamondbacks,
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,Athletics,
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,Athletics,
8,d977367c-cf0c-4687-95a0-eb4542efcb01,Rockies,
9,a87070ff-1084-43ca-a7ba-69278f93ecba,Cardinals,


In [18]:
# Right join on gameId: every df2 row is kept, so only its two games appear.
df1.merge(df2, on="gameId", how="right")

Unnamed: 0,gameId,homeTeamName,awayTeamName
0,e14b6493-9e7f-404f-840a-8a680cc364bf,Marlins,Cubs
1,1f32b347-cbcb-4c31-a145-0e685306d168,Marlins,Cubs


### Concat two DFs

In [19]:
# Concatenate df with itself using bpd.concat's default arguments.
bpd.concat([df, df])

Unnamed: 0,gameId,year,homeTeamName,awayTeamName,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


### Access a column as an attribute

In [20]:
# Attribute-style access to the `homeTeamName` column; yields a Series.
df.homeTeamName

0               Marlins
1               Marlins
2                Braves
3                Braves
4              Phillies
             ...       
2426            Dodgers
2427            Dodgers
2428               Mets
2429               Mets
2430    American League
Name: homeTeamName, Length: 2431, dtype: string

### Retrieve SQL

In [21]:
# Show the SQL that bigframes compiles for the inner join of df1 and df2.
# The public `DataFrame.sql` property is used rather than the private
# `_to_sql_query` helper, whose arguments and tuple return value are not a
# supported interface. The generated SQL text may differ in detail (for
# example, index columns) from what the private helper produced.
joined = df1.merge(df2, on="gameId", how="inner")
print(joined.sql)

WITH t0 AS (
  SELECT t17.*, t17.`bigframes_ordering_id` AS `bigframes_index_0`
  FROM (
    SELECT * FROM `7150072e4dc649849bc421d3a492eee1`
  ) t17
),
t1 AS (
  SELECT coalesce(`bigframes_ordering_id`, `bigframes_ordering_id`) AS `index_2`,
         CONCAT(CONCAT(t0.`homeTeamName`, ' vs '), t0.`awayTeamName`) AS `col_3`,
         t0.`bigframes_ordering_id` AS `bigframes_ordering_id`
  FROM t0
),
t2 AS (
  SELECT `bigframes_ordering_id` AS `bigframes_index_0`, t0.`gameId`, t0.`year`,
         t0.`homeTeamName`, t0.`awayTeamName`, t0.`duration_minutes`,
         t0.`bigframes_ordering_id`
  FROM t0
),
t3 AS (
  SELECT t2.`bigframes_index_0`, t2.`gameId`, t2.`year`, t2.`homeTeamName`,
         t2.`awayTeamName`, t2.`duration_minutes`,
         t2.`bigframes_ordering_id` AS `bigframes_ordering_id_x`,
         t1.`index_2`, t1.`col_3`,
         t1.`bigframes_ordering_id` AS `bigframes_ordering_id_y`
  FROM t2
    LEFT OUTER JOIN t1
      ON IFNULL(CAST(t2.`bigframes_index_0` AS STRING), '

### Special Column Names

In [22]:
# Column labels may contain (multiple) spaces; df itself is not rebound here.
df.rename(columns={"homeTeamName": "HOME    TEAM"})

Unnamed: 0,gameId,year,HOME TEAM,awayTeamName,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


In [23]:
# Column labels may also contain punctuation and other special characters.
df.rename(columns={"homeTeamName": "homeTeam!@#$%col"})

Unnamed: 0,gameId,year,homeTeam!@#$%col,awayTeamName,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


In [24]:
# Rename two different columns to the same label, so df3 ends up with two
# columns that are both named "team".
df3 = df.rename(columns={"homeTeamName": "team", "awayTeamName": "team"})
df3

Unnamed: 0,gameId,year,team,team.1,duration_minutes
0,e14b6493-9e7f-404f-840a-8a680cc364bf,2016,Marlins,Cubs,187
1,1f32b347-cbcb-4c31-a145-0e685306d168,2016,Marlins,Cubs,189
2,0c2292d1-7398-48be-bf8e-b41dad5e1a43,2016,Braves,Cubs,165
3,8fbec734-a15a-42ab-8d51-60790de7750b,2016,Braves,Cubs,222
4,89e514d5-fbf5-4b9d-bdac-6ca45bfd18dd,2016,Phillies,Cubs,164
5,6a83e76c-dc0d-4924-9d3d-a2e7e0ab5b52,2016,Diamondbacks,Cubs,201
6,76ea8662-c7e6-4c38-8f2a-efe373e428ce,2016,Athletics,Cubs,173
7,66fad23d-6e89-4f99-be29-d49b6e94f95d,2016,Athletics,Cubs,176
8,d977367c-cf0c-4687-95a0-eb4542efcb01,2016,Rockies,Cubs,180
9,a87070ff-1084-43ca-a7ba-69278f93ecba,2016,Cardinals,Cubs,157


In [25]:
# Selecting a duplicated label returns every column carrying that label
# (both "team" columns here), not a single Series.
df3["team"]

Unnamed: 0,team,team.1
0,Marlins,Cubs
1,Marlins,Cubs
2,Braves,Cubs
3,Braves,Cubs
4,Phillies,Cubs
5,Diamondbacks,Cubs
6,Athletics,Cubs
7,Athletics,Cubs
8,Rockies,Cubs
9,Cardinals,Cubs


### Binary Operation

In [26]:
# Keep only the two Int64 columns for the arithmetic example.
numeric_columns = ["year", "duration_minutes"]
df4 = df[numeric_columns]
df4

Unnamed: 0,year,duration_minutes
0,2016,187
1,2016,189
2,2016,165
3,2016,222
4,2016,164
5,2016,201
6,2016,173
7,2016,176
8,2016,180
9,2016,157


In [27]:
# Element-wise binary operation: adds 1 to every value in both columns.
df4 + 1

Unnamed: 0,year,duration_minutes
0,2017,188
1,2017,190
2,2017,166
3,2017,223
4,2017,165
5,2017,202
6,2017,174
7,2017,177
8,2017,181
9,2017,158
