#### 1) Download college dataset from bucket:
#### `gsutil cp gs://cs327e-open-access/college_normalized.zip .`

#### 2) Load college dataset into BQ
#### Refer to https://cloud.google.com/bigquery/docs/reference/bq-cli-reference#top_of_page for documentation on bq tool and bq load command

In [3]:
dataset_id = "college"

In [4]:
!bq --location=US mk --dataset {dataset_id}

BigQuery error in mk operation: Dataset 'authentic-light-303018:college' already
exists.


In [5]:
!bq --location=US load --autodetect --skip_leading_rows=1 \
--source_format=CSV {dataset_id}.Class '/home/jupyter/college_normalized/class.csv'

Upload complete.
Waiting on bqjob_r794edffcc9f0d241_00000178b8deacef_1 ... (1s) Current status: DONE   


In [6]:
!bq --location=US load --autodetect --skip_leading_rows=1 \
--source_format=CSV {dataset_id}.Student '/home/jupyter/college_normalized/student.csv'

Upload complete.
Waiting on bqjob_r35a8e6b3f47c89f2_00000178b8dec89c_1 ... (1s) Current status: DONE   


In [7]:
!bq --location=US load --autodetect --skip_leading_rows=1 \
--source_format=CSV {dataset_id}.Instructor '/home/jupyter/college_normalized/instructor.csv'

Upload complete.
Waiting on bqjob_r3480090028a2e7d_00000178b8dee545_1 ... (1s) Current status: DONE   


In [8]:
!bq --location=US load --autodetect --skip_leading_rows=1 \
--source_format=CSV {dataset_id}.Takes '/home/jupyter/college_normalized/takes.csv'

Upload complete.
Waiting on bqjob_r15758ca68943f872_00000178b8df0aaf_1 ... (3s) Current status: DONE   


In [9]:
!bq --location=US load --autodetect --skip_leading_rows=1 \
--source_format=CSV {dataset_id}.Teaches '/home/jupyter/college_normalized/teaches.csv'

Upload complete.
Waiting on bqjob_r4927e62601174aa2_00000178b8df297e_1 ... (1s) Current status: DONE   


#### Suppose that Student had not loaded correctly with the --autodetect option

In [10]:
%%bigquery
drop table college.Student

In [11]:
!bq --location=US load --skip_leading_rows=1 --allow_jagged_rows=true --source_format=CSV {dataset_id}.Student \
/home/jupyter/college_normalized/student.csv \
sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING

Upload complete.
Waiting on bqjob_r23e04a3fd1dc33fe_00000178b8df5466_1 ... (1s) Current status: DONE   


#### Get record count for each table

In [12]:
%%bigquery
select count(*) as count from college.Student

Unnamed: 0,count
0,12


In [13]:
%%bigquery
select count(*) as count from college.Class

Unnamed: 0,count
0,11


In [14]:
%%bigquery
select count(*) as count from college.Instructor

Unnamed: 0,count
0,10


In [15]:
%%bigquery
select count(*) as count from college.Takes

Unnamed: 0,count
0,16


In [16]:
%%bigquery
select count(*) as count from college.Teaches

Unnamed: 0,count
0,12


#### Get sample records

In [17]:
%%bigquery
select * from college.Class
limit 5

Unnamed: 0,cno,cname,credits
0,CS329E,Elements of Web Programming,3
1,CS327E,Elements of Databases,3
2,CS313E,Elements of Software Engineering,3
3,CS326E,Elements of Networking,3
4,M358K,Applied Statistics,3


In [18]:
%%bigquery
select * from college.Student
limit 5

Unnamed: 0,sid,fname,lname,dob,status
0,kev18,Kevin,Lin,1999-10-05,CUR
1,sudeepa4,Sudeepa,Roy,2001-10-01,CUR
2,jerryh,Jerry,Hargrove,1999-01-03,CUR
3,jc,James,Cowe,2000-04-22,CUR
4,paulg,Paul,Gore,2000-09-17,CUR


In [19]:
%%bigquery
select * from college.Instructor
limit 5

Unnamed: 0,string_field_0,string_field_1,string_field_2
0,neeman,Joe Neeman,Mathematics
1,koch,Hans Koch,Mathematics
2,mueller,Peter Mueller,Mathematics
3,tran,Ngoc Tran,Mathematics
4,scohen,Shirley Cohen,Computer Science


In [21]:
%%bigquery
select * from college.Takes
limit 5

Unnamed: 0,sid,cno,cname,credits,grade
0,kev18,M362K,Probability I,3,A
1,kev18,CS329E,Elements of Web Programming,3,A
2,paulg,CS329E,Elements of Web Programming,3,A
3,paulg,CS313E,Elements of Software Engineering,3,A
4,jc,CS327E,Elements of Databases,3,B


In [20]:
%%bigquery
select * from college.Teaches
limit 5

Unnamed: 0,tid,cno,cname,credits
0,neeman,M362K,Probability I,3
1,mueller,M362K,Probability I,3
2,cannata,CS347,Data Management,3
3,tran,M358K,Applied Statistics,3
4,downing,CS373,Software Engineering,3


#### How should we fix the Instructor's schema?

In [21]:
%%bigquery
drop table college.Instructor

In [22]:
!bq --location=US load --skip_leading_rows=1 --allow_jagged_rows=true --source_format=CSV {dataset_id}.Instructor \
/home/jupyter/college_normalized/instructor.csv \
tid:STRING,instructor:STRING,dept:STRING

Upload complete.
Waiting on bqjob_r1a0bb2efd83decd7_00000178b8e536bc_1 ... (0s) Current status: DONE   


In [24]:
%%bigquery 
select * from college.Instructor
limit 10

Unnamed: 0,tid,instructor,dept
0,neeman,Joe Neeman,Mathematics
1,koch,Hans Koch,Mathematics
2,mueller,Peter Mueller,Mathematics
3,tran,Ngoc Tran,Mathematics
4,scohen,Shirley Cohen,Computer Science
5,bulko,Bill Bulko,Computer Science
6,fares,Fares Fraij,Computer Science
7,cannata,Phil Cannata,Computer Science
8,downing,Glenn Downing,Computer Science
9,mitra,Shyamal Mitra,Computer Science


**Practice Problems**

For each class, how many students are enrolled in the class?
Return the cno and enrollment count for each class.

In [27]:
%%bigquery
SELECT c.cno, count(*)
FROM college.Class c LEFT JOIN college.Takes t
ON c.cno = t.cno
GROUP BY c.cno

Unnamed: 0,cno,enrollment
0,M362K,1
1,CS329E,3
2,CS313E,3
3,CS327E,2
4,M328K,1
5,CS303E,1
6,M358K,1
7,CS326E,1
8,CS331E,1
9,CS373,1


For each class which has at least two students enrolled, how many 
students are taking the class?

In [32]:
%%bigquery
SELECT cno, count(*) as enrollment from college.Takes group by cno having count(*) >= 2

Unnamed: 0,cno,enrollment
0,CS329E,3
1,CS313E,3
2,CS327E,2
