
# **BigQuery SQL query 2**

This notebook practices some basic SQL commands and functions on BigQuery.

The tables are subsets of the Kaggle FIFA 19 dataset,
available at https://www.kaggle.com/karangadiya/fifa19

In [0]:
# Colab only: authenticate the current user. Presumably this is what lets the
# BigQuery client created later in the notebook pick up credentials.

from google.colab import auth

auth.authenticate_user()
print('Authenticated')

Authenticated


In [0]:
from google.cloud import bigquery as bq
import humanize
import time
import pandas as pd

## Exploring the dataset

In [0]:
# Create a "Client" object bound to the project used for the queries.
# NOTE(review): the project id ends with a dash and looks truncated/redacted --
# replace it with a real GCP project id before running.
project_id = "sql-python-"
client = bq.Client(project=project_id)

# Dataset that holds the FIFA19 tables.
# `client.dataset()` is deprecated in google-cloud-bigquery; build the
# DatasetReference directly. It resolves to the same project (the client's)
# and the same dataset id as before.
dataset_id = 'kaggle_fifa19'
dataset_ref = bq.DatasetReference(client.project, dataset_id)

# Fetch the dataset resource itself (an API call).
dataset = client.get_dataset(dataset_ref)

In [0]:
# Enumerate the dataset's tables and print each table id on its own line
tables = list(client.list_tables(dataset))

for table_item in tables:
    print(table_item.table_id)

data19_3
data19_3_view
data19_basic
data19_basic_dup
data19_bio
data19_new
data19_new1
data19_new2
data19_value


In [0]:
# Look up one object of the dataset and inspect its metadata
table_id = 'data19_3_view'
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)

# NOTE(review): the recorded output reports 0 rows although querying this view
# returns data -- presumably num_rows is not meaningful for a view; confirm.
row_msg = "table has {} rows".format(table.num_rows)
print(row_msg)

# Last expression of the cell: display the schema (list of SchemaField)
table.schema

table has 0 rows


[SchemaField('ID', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Name', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Age', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Nationality', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Club', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Preferred_Foot', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Position', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Jersey_Number', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('Joined', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Contract_Valid_Until', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Height_CM', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Weight_KG', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Value_ME', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('Wage_ME', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('Release_Clause_ME', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('List_ID', 'INTEGER', 'NULLABLE', None, ())]

In [0]:
# Preview rows through the BigQuery client library (no SQL): request every
# column of the schema and at most 10 rows, then convert to a DataFrame.
# NOTE(review): the recorded output's columns (Height, Weight, ...) differ from
# the schema printed for data19_3_view -- the output was presumably captured
# while `table` pointed at another table; confirm on a fresh run.
preview_rows = client.list_rows(
    table,
    selected_fields=list(table.schema),
    max_results=10,
)
preview_rows.to_dataframe()

Unnamed: 0,List_ID,ID,Name,Nationality,Club,Age,Height,Weight,Value_ME,Wage_ME
0,1,158023,L. Messi,Argentina,FC Barcelona,31,170,72,110.5,0.565
1,2,20801,Cristiano Ronaldo,Portugal,Juventus,33,188,83,77.0,0.405
2,3,190871,Neymar Jr,Brazil,Paris Saint-Germain,26,175,68,118.5,0.29
3,4,193080,De Gea,Spain,Manchester United,27,193,76,72.0,0.26
4,5,192985,K. De Bruyne,Belgium,Manchester City,27,180,70,102.0,0.355
5,6,183277,E. Hazard,Belgium,Chelsea,27,173,74,93.0,0.34
6,7,177003,L. Modrić,Croatia,Real Madrid,32,173,66,67.0,0.42
7,8,176580,L. Suárez,Uruguay,FC Barcelona,31,183,86,80.0,0.455
8,9,155862,Sergio Ramos,Spain,Real Madrid,32,183,82,51.0,0.38
9,10,200389,J. Oblak,Slovenia,Atlético Madrid,25,188,87,68.0,0.094


In [0]:
# Read the first 100 rows of the view with a SQL query

query = """

    SELECT *
    FROM `kaggle_fifa19.data19_3_view`
    LIMIT 100
"""

view_job = client.query(query, project=project_id)
view_job.result().to_dataframe()
# Alternative kept for reference (pandas reader):
#pd.read_gbq(query, project_id=project_id, dialect='standard')


Unnamed: 0,ID,Name,Age,Nationality,Club,Preferred_Foot,Position,Jersey_Number,Joined,Contract_Valid_Until,Height_CM,Weight_KG,Value_ME,Wage_ME,Release_Clause_ME,List_ID
0,202605,A. Milošević,26,Sweden,AIK,Right,RCB,6.0,"Mar 1, 2018",2018,190,82,2.5,0.006,3.4,3803
1,230407,Miltisco Paulino,30,Brazil,Santos,Right,LCB,4.0,"Jan 1, 2018",2018,180,77,1.8,0.011,3.3,3555
2,200845,P. Hoban,26,Republic of Ireland,Dundalk,Right,ST,9.0,"Dec 1, 2017",2018,180,84,1.3,0.002,2.0,5126
3,212715,S. Palacios,26,Argentina,Pachuca,Right,RM,34.0,"Jul 1, 2017",2018,170,69,2.8,0.018,4.9,4115
4,230202,Jailson Caeiro,26,Brazil,Botafogo,Right,RB,2.0,"Jan 1, 2018",2018,178,75,4.8,0.018,9.1,2002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,214700,M. Navarrete,26,Chile,Unión La Calera,Right,LCB,3.0,"Jul 1, 2017",2018,178,73,1.7,0.002,2.5,4659
96,230258,Rosberto Dourado,30,Brazil,Atlético Mineiro,Right,RCM,13.0,"Jan 1, 2018",2018,175,70,16.5,0.046,31.4,226
97,230230,Eltildo Correia,26,Brazil,Atlético Mineiro,Right,RCB,3.0,"Jan 1, 2018",2018,188,78,6.5,0.030,12.4,1254
98,168567,A. Ramírez,37,Colombia,Atlético Nacional,Right,RCM,29.0,"Jan 10, 2017",2018,175,76,1.8,0.004,2.5,2115


## Query example 1

- WHERE: filters rows before aggregation
- GROUP BY: groups rows for aggregation
- HAVING: filters groups after the aggregate functions are computed
- ORDER BY: sorts the result of the steps above

In [0]:
# COUNT: players per nationality among rows with Value_ME > 10.
# HAVING keeps only nationalities with more than 10 such players;
# the ten largest groups are shown.

query="""
SELECT Nationality, COUNT(Nationality) Num_Player 
FROM kaggle_fifa19.data19_3
WHERE Value_ME > 10
GROUP BY Nationality
HAVING Num_Player > 10
ORDER BY Num_Player DESC
LIMIT 10
"""
count_job = client.query(query, project=project_id)
count_job.result().to_dataframe()

Unnamed: 0,Nationality,Num_Player
0,Spain,116
1,France,78
2,Brazil,77
3,Germany,64
4,Argentina,50
5,England,49
6,Italy,45
7,Portugal,36
8,Netherlands,30
9,Belgium,24


In [0]:
# SUM / AVG: per club, total wage, player count and average wage over rows
# with Value_ME > 1; the ten clubs with the largest wage total are shown.

query="""
SELECT Club, SUM(Wage_ME) as Sum_wage,  COUNT(Club) Num_Player, AVG(Wage_ME) as Avg_wage 
FROM kaggle_fifa19.data19_3
WHERE Value_ME > 1
GROUP BY Club
ORDER BY Sum_wage DESC
LIMIT 10
"""
club_wage_rows = client.query(query, project=project_id).result()
club_wage_rows.to_dataframe()

Unnamed: 0,Club,Sum_wage,Num_Player,Avg_wage
0,Real Madrid,4.949,27,0.183296
1,FC Barcelona,4.788,28,0.171
2,Manchester City,3.7,26,0.142308
3,Manchester United,3.349,29,0.115483
4,Juventus,3.287,24,0.136958
5,Chelsea,2.99,25,0.1196
6,Liverpool,2.836,25,0.11344
7,Tottenham Hotspur,2.581,27,0.095593
8,Arsenal,2.491,25,0.09964
9,Paris Saint-Germain,2.128,25,0.08512


## Query example 2
The "maximum salary" query

In [0]:
# Player(s) whose wage equals the overall maximum wage (scalar subquery)
query = """
SELECT CLub, Name, Wage_ME
FROM kaggle_fifa19.data19_3
WHERE Wage_ME = 
(SELECT MAX(Wage_ME) FROM kaggle_fifa19.data19_3)

"""

top_wage_job = client.query(query, project=project_id)
top_wage_job.result().to_dataframe()

Unnamed: 0,CLub,Name,Wage_ME
0,FC Barcelona,L. Messi,0.565


In [0]:
# Highest-paid player(s) of each club, CTE version:
# t1 holds the maximum wage per club; the comma join plus the WHERE clause
# keeps the players (t2) whose wage equals their own club's maximum.
# Only the ten clubs with the largest maximum are shown.
query = """
WITH t1 AS(
SELECT Club, MAX(Wage_ME) AS max_wage
FROM kaggle_fifa19.data19_3
GROUP BY Club
) 

SELECT t2.Club, t2.Name, t2.Wage_ME
FROM kaggle_fifa19.data19_3 as t2, t1
WHERE t2.Wage_ME = t1.max_wage and t2.Club = t1.Club
ORDER BY max_wage DESC
LIMIT 10
"""

club_top_rows = client.query(query, project=project_id).result()
club_top_rows.to_dataframe()

Unnamed: 0,Club,Name,Wage_ME
0,FC Barcelona,L. Messi,0.565
1,Real Madrid,L. Modrić,0.42
2,Juventus,Cristiano Ronaldo,0.405
3,Manchester City,K. De Bruyne,0.355
4,Chelsea,E. Hazard,0.34
5,Paris Saint-Germain,Neymar Jr,0.29
6,Arsenal,P. Aubameyang,0.265
7,Manchester United,De Gea,0.26
8,Liverpool,M. Salah,0.255
9,FC Bayern München,R. Lewandowski,0.205


In [0]:
# Same "highest wage per club" result, written as an explicit INNER JOIN
# between the per-club maximum (subquery t2) and the player table (t1).
query = """
SELECT t1.Club, t1.Name, t1.Wage_ME 
FROM
(SELECT max(Wage_ME) as max_wage, Club 
FROM kaggle_fifa19.data19_3 
GROUP BY Club) as t2
Inner Join 
kaggle_fifa19.data19_3 t1 
on t2.Club=t1.Club and t2.max_wage=t1.Wage_ME
ORDER BY Wage_ME DESC
LIMIT 10

"""

join_job = client.query(query, project=project_id)
join_job.result().to_dataframe()

Unnamed: 0,Club,Name,Wage_ME
0,FC Barcelona,L. Messi,0.565
1,Real Madrid,L. Modrić,0.42
2,Juventus,Cristiano Ronaldo,0.405
3,Manchester City,K. De Bruyne,0.355
4,Chelsea,E. Hazard,0.34
5,Paris Saint-Germain,Neymar Jr,0.29
6,Arsenal,P. Aubameyang,0.265
7,Manchester United,De Gea,0.26
8,Liverpool,M. Salah,0.255
9,FC Bayern München,R. Lewandowski,0.205


## Query example 3
The typical "second highest salary" query

In [0]:
# Approach 1: the second-highest wage is the maximum of all wages that are
# strictly below the overall maximum.
query = """

SELECT MAX(Wage_ME) as second_highest
FROM kaggle_fifa19.data19_3
WHERE Wage_ME < 
(SELECT MAX(Wage_ME) FROM kaggle_fifa19.data19_3  )

"""

second_max_rows = client.query(query, project=project_id).result()
second_max_rows.to_dataframe()

Unnamed: 0,second_highest
0,0.455


In [0]:
# Approach 2: sort by wage descending and take the second row (OFFSET 1).
# This returns the second *row*, so it matches the second-highest wage only
# when the top wage is not shared by several players.
query = """

SELECT CLub, Name, Wage_ME  #distinct
FROM kaggle_fifa19.data19_3
ORDER BY Wage_ME DESC
LIMIT 1 OFFSET 1
"""

offset_job = client.query(query, project=project_id)
offset_job.result().to_dataframe()

Unnamed: 0,CLub,Name,Wage_ME
0,FC Barcelona,L. Suárez,0.455


In [0]:
# Approach 3: number the rows by descending wage with ROW_NUMBER() in a
# subquery, then keep the row numbered 2.
query = """
SELECT Club, Name, Wage
FROM 
(     
 SELECT CLub, Name, Wage_ME wage, 
 ROW_NUMBER() OVER(ORDER BY Wage_ME DESC) AS Wage_rank,
 FROM kaggle_fifa19.data19_3 
 ORDER BY Wage DESC
 )
 WHERE wage_rank = 2
 """

ranked_rows = client.query(query, project=project_id).result()
ranked_rows.to_dataframe()

Unnamed: 0,Club,Name,Wage
0,FC Barcelona,L. Suárez,0.455


In [0]:
# Rank players inside EACH club by wage and keep the top 3 per club.
#
# The inner query adds two window columns per player:
#   Wage_rank - ROW_NUMBER() within the club, highest wage first
#   CLub_max  - the club's maximum wage, used only to order the clubs
# The outer query keeps ranks 1-3 and lists clubs by their maximum wage.
#
# Fix: the select list previously contained `substring(Wage_ME,1,4)` with no
# trailing comma, so the query could not be parsed (and SUBSTR does not accept
# a FLOAT column). The plain `Wage_ME` column is selected instead, matching
# the recorded output and the following "pick #2" cell.

query = """

SELECT *
FROM (
SELECT Club, Name, Wage_ME,
      ROW_NUMBER() OVER(PARTITION BY Club ORDER BY Wage_ME DESC) AS Wage_rank,
      MAX(Wage_ME) OVER(PARTITION BY Club) AS CLub_max
FROM kaggle_fifa19.data19_3
)
WHERE Wage_rank < 4      
ORDER BY Club_max DESC, Club, Wage_rank
LIMIT 20
"""

client.query(query,project=project_id).result().to_dataframe()

Unnamed: 0,Club,Name,Wage_ME,Wage_rank,CLub_max
0,FC Barcelona,L. Messi,0.565,1,0.565
1,FC Barcelona,L. Suárez,0.455,2,0.565
2,FC Barcelona,Coutinho,0.34,3,0.565
3,Real Madrid,L. Modrić,0.42,1,0.42
4,Real Madrid,Sergio Ramos,0.38,2,0.42
5,Real Madrid,T. Kroos,0.355,3,0.42
6,Juventus,Cristiano Ronaldo,0.405,1,0.405
7,Juventus,G. Chiellini,0.215,2,0.405
8,Juventus,P. Dybala,0.205,3,0.405
9,Manchester City,K. De Bruyne,0.355,1,0.355


In [0]:
# Second-highest-paid player of each club: same per-club ranking as the
# top-3 query, but only the row with Wage_rank = 2 is kept. Clubs are listed
# by their maximum wage; ten rows are shown.

query = """

SELECT Club, Name, Wage_ME, Wage_rank
FROM (
SELECT Club, Name, Wage_ME, 
      ROW_NUMBER() OVER(PARTITION BY Club ORDER BY Wage_ME DESC) AS Wage_rank,
      MAX(Wage_ME) OVER(PARTITION BY Club) AS CLub_max
FROM kaggle_fifa19.data19_3
)
WHERE Wage_rank = 2        # not, like, between % _
ORDER BY Club_max DESC, Club
LIMIT 10
"""

runner_up_job = client.query(query, project=project_id)
runner_up_job.result().to_dataframe()

Unnamed: 0,Club,Name,Wage_ME,Wage_rank
0,FC Barcelona,L. Suárez,0.455,2
1,Real Madrid,Sergio Ramos,0.38,2
2,Juventus,G. Chiellini,0.215,2
3,Manchester City,S. Agüero,0.3,2
4,Chelsea,N. Kanté,0.225,2
5,Paris Saint-Germain,E. Cavani,0.2,2
6,Arsenal,M. Özil,0.19,2
7,Manchester United,R. Lukaku,0.23,2
8,Liverpool,S. Mané,0.195,2
9,FC Bayern München,M. Hummels,0.16,2


## UNION, INTERSECT

In [0]:
# UNION ALL of two result sets: the five most valuable FC Barcelona players
# and the five most valuable Argentinian players. ALL keeps duplicates, so a
# player present in both sets appears twice.
query = """

(SELECT CLub, Name, Nationality, Value_ME
FROM kaggle_fifa19.data19_3
WHERE Club = "FC Barcelona"
ORDER BY Value_ME DESC
LIMIT 5)

UNION ALL  # ALL|DISTINCT,  INTERSECT

(SELECT CLub, Name, Nationality, Value_ME
FROM kaggle_fifa19.data19_3
WHERE Nationality = "Argentina"
ORDER BY Value_ME DESC
LIMIT 5)

"""

union_rows = client.query(query, project=project_id).result()
union_rows.to_dataframe()

Unnamed: 0,CLub,Name,Nationality,Value_ME
0,FC Barcelona,L. Messi,Argentina,110.5
1,FC Barcelona,L. Suárez,Uruguay,80.0
2,FC Barcelona,Coutinho,Brazil,69.5
3,FC Barcelona,M. ter Stegen,Germany,58.0
4,FC Barcelona,S. Umtiti,France,57.0
5,FC Barcelona,L. Messi,Argentina,110.5
6,Juventus,P. Dybala,Argentina,89.0
7,Manchester City,S. Agüero,Argentina,64.5
8,Inter,M. Icardi,Argentina,64.5
9,Atalanta,A. Gómez,Argentina,30.0


## DISTINCT and repeat values

In [0]:
# Players sharing their age with another player: self-join of the table on
# Age with a different Name. Only t1 is restricted to the two clubs; t2 ranges
# over the whole table. DISTINCT collapses the repeated matches per player.
query = """
SELECT DISTINCT t1.Age, t1.Name, t1.Club
FROM kaggle_fifa19.data19_3 t1, kaggle_fifa19.data19_3 t2
WHERE t2.Age = t1.Age and t1.Name != t2.Name and t1.Club in ("FC Barcelona", "Real Madrid")
ORDER BY t1.Age DESC, t1.Club
LIMIT 10
"""

same_age_job = client.query(query, project=project_id)
same_age_job.result().to_dataframe()

Unnamed: 0,Age,Name,Club
0,32,T. Vermaelen,FC Barcelona
1,32,Sergio Ramos,Real Madrid
2,32,L. Modrić,Real Madrid
3,31,Piqué,FC Barcelona
4,31,L. Messi,FC Barcelona
5,31,A. Vidal,FC Barcelona
6,31,L. Suárez,FC Barcelona
7,31,K. Navas,Real Madrid
8,31,Kiko Casilla,Real Madrid
9,30,I. Rakitić,FC Barcelona


In [0]:
# Number of FC Barcelona / Real Madrid players at each age, oldest first
query = """
SELECT DISTINCT Age, COUNT(Age) as Num_Players
FROM kaggle_fifa19.data19_3 
WHERE Club in ("FC Barcelona", "Real Madrid")
GROUP BY Age
ORDER BY Age DESC
"""

age_count_rows = client.query(query, project=project_id).result()
age_count_rows.to_dataframe()

Unnamed: 0,Age,Num_Players
0,32,3
1,31,6
2,30,3
3,29,3
4,28,3
5,27,1
6,26,7
7,25,2
8,24,4
9,23,3


## DATE / TIME

In [0]:
# Today's date and its week number, as reported by BigQuery
query = """
SELECT CURRENT_DATE() as the_date, EXTRACT(WEEK FROM CURRENT_DATE()) as weeks
"""

today_job = client.query(query, project=project_id)
today_job.result().to_dataframe()

Unnamed: 0,the_date,weeks
0,2020-02-18,7


In [0]:
# Parse the text column `Joined` into a DATE and derive, for the 20
# highest-paid FC Barcelona players, the number of years since joining and
# the age at joining (Age - years). Rows are shown by joining date.
# The result depends on CURRENT_DATE(), so it changes with the day it is run.
# DATE_DIFF(..., YEAR) counts calendar-year boundaries, so `years` and
# `joined_age` are approximate.
query = """
SElECT Name, joined_date, Age, (Age-years) as joined_age, years
FROM(
SELECT Name, Age, PARSE_DATE('%B %e, %Y', Joined) joined_date, 
DATE_DIFF(
    CURRENT_DATE(), 
    PARSE_DATE('%B %e, %Y', Joined), 
    YEAR
  ) years
FROM kaggle_fifa19.data19_3
WHERE Club = "FC Barcelona"
ORDER BY Wage_ME DESC
LIMIT 20
)
ORDER BY joined_date
"""

joined_rows = client.query(query, project=project_id).result()
joined_rows.to_dataframe()

# Statement kept for reference (not executed): store the parsed date in the table
#update kaggle_fifa19.data19_3 
#set joined_date = parse_date('%B %e, %Y', Joined)

Unnamed: 0,Name,joined_date,Age,joined_age,years
0,L. Messi,2004-07-01,31,15,16
1,Piqué,2008-07-01,31,19,12
2,Sergio Busquets,2008-09-01,29,17,12
3,Jordi Alba,2012-07-01,29,21,8
4,Sergi Roberto,2013-07-01,26,19,7
5,Rafinha,2013-07-01,25,18,7
6,I. Rakitić,2014-07-01,30,24,6
7,M. ter Stegen,2014-07-01,26,20,6
8,L. Suárez,2014-07-11,31,25,6
9,Munir,2015-07-01,22,17,5


## **Reference**

getting started colab bq https://colab.research.google.com/notebooks/bigquery.ipynb

standard SQL query syntax https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax


