In [1]:
import json
import pandas as pd
import time
import gcsfs

In [2]:
from google.cloud import storage
from google.cloud import bigquery
from pathlib import Path
import os

In [3]:
# Locate the service-account key relative to the directory the notebook
# runs from: the project root is taken to be the parent of the current
# working directory, and the key lives in its `keys/` sub-folder.
PROJ_ROOT = Path('.').resolve().parent
KEYS_DIR = PROJ_ROOT.joinpath('keys')
keys = KEYS_DIR.joinpath('keys-for-smart-science-muse-data.json')

In [4]:
# Point the Google client libraries at the service-account key file
# (they read this variable when a client is constructed).
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=str(keys))

In [5]:
# Identifiers of the BigQuery source table and the GCS destination bucket.
project = 'smart-science'
dataset_id = 'Learners_Questions'
table_id = 'Combined_tables_39k'
bucket_name = 'muse-data'

# Setup Storage vars
storage_client = storage.Client(project=project)
# Look the bucket up through `bucket_name` so this handle and
# `destination_uri` below cannot drift apart if the name is changed.
bucket = storage_client.get_bucket(bucket_name)

# Setup BigQuery vars
# Pass the project explicitly (as the storage client does) instead of
# relying on whatever default project the credentials imply.
bq_client = bigquery.Client(project=project)
# `Client.dataset()` is deprecated in google-cloud-bigquery; build the
# reference directly.
dataset_ref = bigquery.DatasetReference(project, dataset_id)
table_ref = dataset_ref.table(table_id)

#destination
destination_uri = "gs://{}/{}".format(bucket_name, "combined_tables_raw.json")

In [7]:
# Configure the extract job to write newline-delimited JSON.
#
# No `write_disposition` is set here: ExtractJobConfig does not define
# that property (it belongs to load/query/copy job configs). Older client
# releases silently ignored the assignment; recent releases reject unknown
# properties with AttributeError.
job_config = bigquery.ExtractJobConfig()
job_config.destination_format = bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON

In [8]:
# Kick off the table -> GCS export (this is the API request).
# The job location must match that of the source table.
extract_job = bq_client.extract_table(
    table_ref, destination_uri, location="US", job_config=job_config
)

# Block until the export job has finished.
extract_job.result()

export_message = "Exported {}:{}.{} to {}".format(
    project, dataset_id, table_id, destination_uri
)
print(export_message)

Exported smart-science:Learners_Questions.Combined_Tables to gs://muse-data/combined_tables_raw.json


In [9]:
# Load the exported newline-delimited JSON (one record per line) straight
# from the gs:// URI into a DataFrame.
df = pd.read_json(path_or_buf=destination_uri, lines=True)

In [10]:
# (rows, columns) of the frame loaded from the export.
df.shape

(32687, 11)

In [11]:
# Preview only the first rows rather than rendering the whole frame: the
# table holds tens of thousands of rows and includes learner names, so a
# full dump bloats the notebook and exposes more personal data than needed.
df.head(10)

Unnamed: 0,answer,answer_type,last_modified,learner,learner_name,metatag,name,question,tag,text,who
0,,binary,2017-06-04 15:56:53.69455 UTC,4535845151309824,Carolina,[answer],UFqE8BiENzhk07YcA0oD8Eaxrb50LSts,368,[yes],Do you have reliable childcare for {kid_name}?,parent
1,,binary,2017-06-04 15:56:57.46436 UTC,4535845151309824,Carolina,[answer],kLHlljhhWAnDEAixy2iaR9QkNhPYgdiw,371,[yes],Do you work outside of the home?,parent
2,,binary,2017-08-31 11:44:55.22537 UTC,4535845151309824,Carolina,[answer],AI9iuqtEMakSt0EqPasH4B9MxQuDgice,38,[definitely],Did you graduate from high school?,parent
3,1.0,binary,2017-01-15 01:09:13.02664 UTC,4536217521618944,Katie,[answer],zUXTth86bHSuv5tzk3HZP3pyt2cj7Zhp,22,[yes],Do you know where the nearest playground is fr...,parent
4,1.0,binary,2017-01-15 01:11:47.43045 UTC,4536217521618944,Katie,[answer],cjTk7pkKA37b4TAOuDl1gO5n0JUxOW4f,22,[yes],Do you know where the nearest playground is fr...,parent
5,1.0,scale,2017-01-15 01:09:10.63448 UTC,4536217521618944,Katie,[answer],fF1jcJpeFa0QNrMZDLDDYfvHT3iT5OeD,236,[yes],"Does {kid_name} complete familiar books, espec...",child
6,1.0,scale,2017-01-15 01:11:49.84119 UTC,4536217521618944,Katie,[answer],iFDQCB2EFajl0ZOJRWxVFb2Stg7trWHb,283,[yes],Does {kid_name} enjoy trying new things?,child
7,1.0,scale,2017-01-15 01:08:00.33935 UTC,4536217521618944,Katie,[answer],qiLwtrCvLnM6nDyuHMyV0SieyuYxFtlq,285,[yes],Does {kid_name} enjoy creating elaborate prete...,child
8,0.0,binary,2017-01-15 01:13:38.63336 UTC,4536217521618944,Katie,[answer],P5apuN9tKCPat5dhGPvWCz6WIEeofC0s,41,[no],Are you the first person in your family to gra...,parent
9,1.0,scale,2017-01-15 01:07:52.71889 UTC,4536217521618944,Katie,[answer],wlbqIQAZcupMEcczJxbCGwX2QiNKJogQ,434,[yes],Does {kid_name} usually share with others?,child


In [14]:
# Distinct learner ids, keeping the first row index at which each appears.
df.loc[:, 'learner'].drop_duplicates(keep='first')

0        4535845151309824
3        4536217521618944
23       4536398325481472
25       4551846551093248
30       4554084669456384
42       4563324725362688
44       4570824568733696
48       4576288907984896
54       4594617454428160
60       4597232309370880
62       4611123475120128
65       4613797645910016
67       4619529606922240
70       4623601126866944
75       4628968502198272
77       4631671437524992
79       4631909271339008
81       4640195836116992
147      4643024344711168
163      4643216242507776
166      4646135276765184
168      4649037902905344
183      4649666666823680
219      4651483148582912
222      4654768731455488
224      4657658128760832
226      4657999083732992
229      4659795990675456
239      4664326778519552
241      4666370654470144
               ...       
32550    6577171304808448
32554    6577703167721472
32558    6579174856720384
32560    6580381545398272
32562    6582805790195712
32564    6583070106845184
32565    6592382661296128
32575    659

2734 in 'Learners' SQL

In [17]:
# Parse the `last_modified` strings into pandas timestamps (replacing the
# column), then record the earliest and latest modification times.
df['last_modified'] = pd.to_datetime(df['last_modified'])
least_recent_date, recent_date = (
    df['last_modified'].min(),
    df['last_modified'].max(),
)

In [18]:
# Most recent `last_modified` value in the data.
recent_date

Timestamp('2018-11-28 03:55:30.195524+0000', tz='UTC')

In [21]:
# For every `name`, keep the single row whose `last_modified` is greatest.
latest = df.loc[
    df.groupby(by='name')['last_modified'].idxmax()
]

In [23]:
# (rows, columns) after keeping one row per `name`.
latest.shape

(16120, 11)

In [28]:
# Second derivation of "one row per `name`": order rows by `last_modified`
# and keep the final row of each `name` group.
# NOTE(review): `lastest` looks like a typo of `latest`; the name is kept
# because later cells reference it.
lastest = (
    df.sort_values(by='last_modified')
      .groupby('name')
      .tail(n=1)
)

In [26]:
# (rows, columns) of the sort-and-tail result, for comparison with `latest.shape`.
lastest.shape

(16120, 11)