
#  BigQuery: managing tables with Python

This notebook is a practice exercise in managing BigQuery tables and importing a CSV data file into a table using Python.

Original source of the imported data (edited): https://www.kaggle.com/karangadiya/fifa19

In [0]:
# on local computer
# set the Google service account credential for using BigQuery

#import os

#os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "C:/Users/---/---.json"

In [2]:
# In Colab (instead of the service-account key file used on a local computer):
# authenticate the notebook user before the BigQuery client is created.

from google.colab import auth

auth.authenticate_user()
# Confirmation message, printed once authenticate_user() has returned.
print('Authenticated')

Authenticated


In [0]:
from google.cloud import bigquery as bq
import humanize
import time

## BigQuery data

client

|___project

  |___dataset

   |____table


In [0]:
# Create a "Client" object for the Google Cloud project used by the rest of
# this notebook.

project_id = "sql-python-265823"

client = bq.Client(project=project_id)

# Optional: get job details via the Google Cloud Python client for BigQuery.
# Sometimes you might want more details about a job, such as the amount of
# data processed and billed. The commented-out snippet below prints both
# figures (formatted with humanize) for a given job ID, together with the time
# elapsed between the job's creation and its end.

#job = client.get_job("job id returned from previous step")
#bytes_processed = humanize.naturalsize(job.total_bytes_processed, binary=True)
#bytes_billed = humanize.naturalsize(job.total_bytes_billed, binary=True)
#print('%s processed, %s billed (%s)' % (bytes_processed, bytes_billed, job.ended - job.created))


## Creating a dataset

In [5]:
# Look up the dataset, creating it only when it does not exist yet.
from google.cloud.exceptions import NotFound

dataset_id = 'kaggle_fifa19'

# Construct a reference to the dataset. DatasetReference replaces the
# deprecated client.dataset() helper; it is built from the client's own
# project, which is what client.dataset() used by default.
dataset_ref = bq.DatasetReference(client.project, dataset_id)

try:
    dataset = client.get_dataset(dataset_ref)
    print('Dataset exists')
except NotFound:
    # Only a missing dataset triggers creation. Authentication, permission or
    # network errors propagate instead of being mistaken for "not found".
    dataset = client.create_dataset(bq.Dataset(dataset_ref))
    print('Dataset {} created.'.format(dataset.dataset_id))


Dataset exists


## Creating a table

In [27]:
# Look up the table, creating it with an explicit schema only when it is
# missing.
from google.cloud.exceptions import NotFound

table_ref = dataset_ref.table('data19_basic_dup')

try:
    table = client.get_table(table_ref)
    print('Table exists')
except NotFound:
    # Only a missing table triggers creation. Any other API failure
    # (credentials, permissions, network) surfaces instead of being swallowed.
    schema = [
        bq.SchemaField('Name', 'STRING', mode='REQUIRED'),
        bq.SchemaField('Nationality', 'STRING', mode='REQUIRED'),
        # NOTE(review): 'Clue' may be a typo for 'Club' (the table loaded from
        # the CSV later in this notebook has a 'Club' column). Left unchanged
        # because the copy job below combines this table with 'data19_basic',
        # whose schema is not visible here -- confirm before renaming.
        bq.SchemaField('Clue', 'STRING', mode='REQUIRED'),
        bq.SchemaField('Age', 'INTEGER', mode='REQUIRED'),
    ]
    table = client.create_table(bq.Table(table_ref, schema=schema))
    print('table {} created.'.format(table.table_id))

# Last expression of the cell: the notebook displays the table's schema.
table.schema



table data19_basic_dup created.


[SchemaField('Name', 'STRING', 'REQUIRED', None, ()),
 SchemaField('Nationality', 'STRING', 'REQUIRED', None, ()),
 SchemaField('Clue', 'STRING', 'REQUIRED', None, ()),
 SchemaField('Age', 'INTEGER', 'REQUIRED', None, ())]

## Listing tables

In [28]:
# List all the tables in the dataset
tables = list(client.list_tables(dataset))

# Print the ID of every table in the dataset, one per line.
# NOTE: at notebook top level the loop variable rebinds the global name
# `table`, which previously held the Table object from the cell above.
for table in tables:
    print(table.table_id)



data19_1
data19_2
data19_3
data19_basic
data19_basic_dup


## Copying multiple tables

In [29]:
# Copy two source tables into one destination table with a single copy job.
table_ref1 = dataset_ref.table('data19_basic')
table_ref2 = dataset_ref.table('data19_basic_dup')
# Despite the name, this list holds TableReference objects, not string IDs.
table_ids=[table_ref1,table_ref2]

# Destination of the copy (also a TableReference).
dest_table_id = dataset_ref.table('data19_basic_dup2')

job = client.copy_table(table_ids, dest_table_id)  # Make an API request.
job.result()  # Wait for the job to complete.

# Prints the reprs of the source list and the destination reference.
print("Copy {} to to {} ".format(table_ids, dest_table_id))

Copy [TableReference(DatasetReference('sql-python-265823', 'kaggle_fifa19'), 'data19_basic'), TableReference(DatasetReference('sql-python-265823', 'kaggle_fifa19'), 'data19_basic_dup')] to to TableReference(DatasetReference('sql-python-265823', 'kaggle_fifa19'), 'data19_basic_dup2') 


## Delete a table

In [30]:
# Delete the table 'data19_basic_dup2' from the dataset.
table_ref_del= dataset_ref.table('data19_basic_dup2')
# not_found_ok=True is passed so that a missing table is not treated as an
# error (going by the parameter name -- see the client documentation).
client.delete_table(table_ref_del, not_found_ok=True)  # Make an API request.
print("Deleted table '{}'.".format(table_ref_del))



Deleted table 'TableReference(DatasetReference('sql-python-265823', 'kaggle_fifa19'), 'data19_basic_dup2')'.


## Restoring a deleted table

In [31]:
# Record the current time, in milliseconds since the epoch, as the snapshot
# point. It is taken BEFORE the delete below so that the snapshot refers to a
# moment at which the table still existed.
snapshot_epoch = int(time.time() * 1000)

# Delete the table that is going to be restored.
table_id_del= 'sql-python-265823.kaggle_fifa19.data19_basic_dup'
client.delete_table(table_id_del) 

# Construct the restore-from table ID using a snapshot decorator
# ("<table_id>@<epoch_ms>").
snapshot_table_id = "{}@{}".format(table_id_del, snapshot_epoch)

# The recovered data is written to a new table name, not the deleted one.
table_id_rec= 'sql-python-265823.kaggle_fifa19.data19_basic_rec'

# Construct and run a copy job.
# NOTE(review): location="US" is hard-coded -- presumably the dataset's
# location; confirm.
job = client.copy_table(snapshot_table_id,table_id_rec, location="US") 

job.result()  # Wait for the copy job to complete.

print(
    "Copied data from deleted table {} to {}".format(table_id_del, table_id_rec)
)

Copied data from deleted table sql-python-265823.kaggle_fifa19.data19_basic_dup to sql-python-265823.kaggle_fifa19.data19_basic_rec


In [32]:
# List all the tables in the dataset
tables = list(client.list_tables(dataset))

# Print the ID of every table in the dataset, one per line.
for table in tables:
    print(table.table_id)

data19_1
data19_2
data19_3
data19_basic
data19_basic_rec


## Importing a csv file to a table

In [0]:
# Import a local CSV file to create (or expand) a table.
table_id = 'data19_2'
table_ref = dataset_ref.table(table_id)

# The BigQuery module is bound to the alias `bq` in this notebook, so the
# load-job classes must be reached through that alias (the bare name
# `bigquery` is not defined and would raise NameError).
job_config = bq.LoadJobConfig()
job_config.source_format = bq.SourceFormat.CSV
job_config.skip_leading_rows = 1  # skip the first row (assumed to be the header)
job_config.autodetect = True  # let BigQuery infer the schema from the file

file_name = '/content/data19_2.csv'

# Binary mode: the client uploads the raw bytes of the file.
with open(file_name, "rb") as source_file:
    job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

job.result()  # Waits for table load to complete.

print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))


Loaded 18207 rows into kaggle_fifa19:data19_2.


In [0]:
# Fetch the table's metadata after the load and report its row count and ID.
new_table = client.get_table(table_ref)
print("Loaded {} rows.".format(new_table.num_rows))
print(new_table.table_id)
# Last expression of the cell: the notebook displays the table's schema.
new_table.schema

Loaded 18159 rows.
data19_3


[SchemaField('ID', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Name', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Age', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Nationality', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Club', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Preferred_Foot', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Position', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Jersey_Number', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('Joined', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Contract_Valid_Until', 'STRING', 'NULLABLE', None, ()),
 SchemaField('Height_CM_', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Weight_KG_', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('Value_EUR_', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('Wage_EUR_', 'FLOAT', 'NULLABLE', None, ()),
 SchemaField('Release_Clause_EUR_', 'FLOAT', 'NULLABLE', None, ())]

In [0]:
# Display the row count held by the `new_table` metadata object.
new_table.num_rows

18159

In [0]:
# List all the tables in the dataset
tables = list(client.list_tables(dataset))

# Print the ID of every table in the dataset, one per line.
for table in tables:
    print(table.table_id)


## **Reference**

Quick start
https://www.kaggle.com/learn/intro-to-sql

Python Client for Google BigQuery
https://googleapis.dev/python/bigquery/latest/index.html

Managing tables https://cloud.google.com/bigquery/docs/managing-tables

Importing tables
https://hevodata.com/blog/api-to-bigquery/


