# ETL Process on Weather Dataset:

# The dataset had been provided by the instructor as an .xlsx file
# The file was converted from an .xlsx file to a .csv file

In [None]:
# Uploading 311 complaint .csv from Google Drive to Google Colab
# Source:https://www.youtube.com/watch?v=BuuH0wsJ8-k&t=167s
! gdown --id 10dG5nWRL8rnqPvK2cYRf89d19EMRjTds

Downloading...
From: https://drive.google.com/uc?id=10dG5nWRL8rnqPvK2cYRf89d19EMRjTds
To: /content/weather_data_5_boroughs_daily.csv
100% 1.40M/1.40M [00:00<00:00, 65.7MB/s]


In [None]:
import pandas as pd
from google.cloud import bigquery
import os

In [None]:
df = pd.read_csv(r'weather_data_5_boroughs_daily.csv')
df.head()

Unnamed: 0,Station,latitude,longitude,Borough,City,State,ZipCode,WDate,Temperature_Max,Temperature_Avg,...,Dewpot_M,Humidity_Max,Humidity_Avg,Humidity_M,Wdspeed_Max,Wdspeed_Avg,Wdspeed_M,Pressure_Max,Pressure_M,Precipitation_Total
0,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/1/2016,41.2,38.1,...,16.9,60,51.0,45,28.4,11.3,1.3,30.1,29.96,0.0
1,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/2/2016,39.4,35.2,...,14.0,56,48.0,42,25.3,10.1,0.0,30.11,29.95,0.0
2,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/3/2016,44.7,38.6,...,19.5,60,49.0,36,28.4,10.8,1.6,29.97,29.78,0.0
3,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/4/2016,35.6,25.9,...,-0.8,62,48.0,31,36.5,10.2,1.1,30.39,29.86,0.0
4,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/5/2016,28.8,18.5,...,-6.9,64,45.0,21,27.5,6.2,0.0,30.64,30.37,0.0


In [None]:
# Data Profiling:
# Thankfully, this data set has been pre-cleaned and there
# does not appear to be any N/A values on the .csv file provided
df.isna().any

<bound method NDFrame._add_numeric_operations.<locals>.any of        Station  latitude  longitude  Borough   City  State  ZipCode  WDate  \
0        False     False      False    False  False  False    False  False   
1        False     False      False    False  False  False    False  False   
2        False     False      False    False  False  False    False  False   
3        False     False      False    False  False  False    False  False   
4        False     False      False    False  False  False    False  False   
...        ...       ...        ...      ...    ...    ...      ...    ...   
10521    False     False      False    False  False  False    False  False   
10522    False     False      False    False  False  False    False  False   
10523    False     False      False    False  False  False    False  False   
10524    False     False      False    False  False  False    False  False   
10525    False     False      False    False  False  False    False  False   

 

In [None]:
master_columns = df[['Station', 'latitude', 'longitude', 'Borough', 'City', 'State', 'ZipCode', 'WDate', 'Temperature_Max', 'Temperature_Avg', 'Temperature_M', 'Humidity_Max', 'Humidity_Avg', 'Humidity_M', 'Wdspeed_Max', 'Wdspeed_Avg', 'Wdspeed_M', 'Precipitation_Total']]
print(len(master_columns))
master_columns.head()

10526


Unnamed: 0,Station,latitude,longitude,Borough,City,State,ZipCode,WDate,Temperature_Max,Temperature_Avg,Temperature_M,Humidity_Max,Humidity_Avg,Humidity_M,Wdspeed_Max,Wdspeed_Avg,Wdspeed_M,Precipitation_Total
0,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/1/2016,41.2,38.1,33.9,60,51.0,45,28.4,11.3,1.3,0.0
1,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/2/2016,39.4,35.2,32.4,56,48.0,42,25.3,10.1,0.0,0.0
2,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/3/2016,44.7,38.6,34.5,60,49.0,36,28.4,10.8,1.6,0.0
3,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/4/2016,35.6,25.9,12.1,62,48.0,31,36.5,10.2,1.1,0.0
4,KNYBRONX14,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458,1/5/2016,28.8,18.5,10.0,64,45.0,21,27.5,6.2,0.0,0.0


In [None]:
# Data Profiling:
master_columns.isna().any()

Station                False
latitude               False
longitude              False
Borough                False
City                   False
State                  False
ZipCode                False
WDate                  False
Temperature_Max        False
Temperature_Avg        False
Temperature_M          False
Humidity_Max           False
Humidity_Avg           False
Humidity_M             False
Wdspeed_Max            False
Wdspeed_Avg            False
Wdspeed_M              False
Precipitation_Total    False
dtype: bool

# Creating dataframe for the Date Dimension Table and Transforming/Cleaning it

In [None]:
date_dim_tbl = master_columns[['WDate']]
print(len(date_dim_tbl))
date_dim_tbl.head()

10526


Unnamed: 0,WDate
0,1/1/2016
1,1/2/2016
2,1/3/2016
3,1/4/2016
4,1/5/2016


In [None]:
date_dim_tbl.insert(1, 'Year', pd.DatetimeIndex(master_columns['WDate']).year)
date_dim_tbl.insert(2, 'Month', pd.DatetimeIndex(master_columns['WDate']).month)
date_dim_tbl.insert(3, 'Day', pd.DatetimeIndex(master_columns['WDate']).day)


In [None]:
print(len(date_dim_tbl))
date_dim_tbl.head()

10526


Unnamed: 0,WDate,Year,Month,Day
0,1/1/2016,2016,1,1
1,1/2/2016,2016,1,2
2,1/3/2016,2016,1,3
3,1/4/2016,2016,1,4
4,1/5/2016,2016,1,5


In [None]:
date_dim_tbl2 = date_dim_tbl.copy()
date_dim_tbl2 = date_dim_tbl2.drop_duplicates()

In [None]:
date_dim_tbl2.insert(0, 'Date_ID', range(1, 1+len(date_dim_tbl2)))

In [None]:
print(len(date_dim_tbl2))
date_dim_tbl2.head()

2107


Unnamed: 0,Date_ID,WDate,Year,Month,Day
0,1,1/1/2016,2016,1,1
1,2,1/2/2016,2016,1,2
2,3,1/3/2016,2016,1,3
3,4,1/4/2016,2016,1,4
4,5,1/5/2016,2016,1,5


In [None]:
dataset_name = 'weather'
table_id_date = dataset_name + 'date_dim_tbl2'

# Creating dataframe for the Location Dimension Table and Transforming/Cleaning it

In [None]:
location_dim_tbl = master_columns[['latitude', 'longitude', 'Borough', 'City', 'State', 'ZipCode']]
print(len(location_dim_tbl))
location_dim_tbl.head()

10526


Unnamed: 0,latitude,longitude,Borough,City,State,ZipCode
0,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
1,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
2,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
3,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
4,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458


In [None]:
# Need to convert ZipCode into string. It is currently set as an int value
location_dim_tbl['ZipCode'] = location_dim_tbl['ZipCode'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_dim_tbl['ZipCode'] = location_dim_tbl['ZipCode'].astype('str')


In [None]:
# Creating the composite key
# Staten Island has two separate weather stations with different zipcodes
# Need to make a composite key with Borough and ZipCode
location_dim_tbl.insert(0, 'Composite_Key', location_dim_tbl['Borough'] + '-' + location_dim_tbl['ZipCode'])

In [None]:
print(len(location_dim_tbl))
location_dim_tbl.head()

10526


Unnamed: 0,Composite_Key,latitude,longitude,Borough,City,State,ZipCode
0,Bronx-10458,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
1,Bronx-10458,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
2,Bronx-10458,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
3,Bronx-10458,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
4,Bronx-10458,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458


In [None]:
location_dim_tbl2 = location_dim_tbl.copy()
location_dim_tbl2 = location_dim_tbl.drop_duplicates()

In [None]:
print(len(location_dim_tbl2))
location_dim_tbl2

6


Unnamed: 0,Composite_Key,latitude,longitude,Borough,City,State,ZipCode
0,Bronx-10458,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
2107,Brooklyn-11228,40.6215,-74.0096,Brooklyn,Dyker Heights,NY,11228
4207,Manhattan-10018,40.7638,-73.9918,Manhattan,New York,NY,10018
6313,Queens-11372,40.7557,-73.8831,Queens,Jackson Heights,NY,11372
8419,Staten Island-10306,40.5674,-74.1343,Staten Island,Richmondtown,NY,10306
10247,Staten Island-10308,40.5674,-74.1343,Staten Island,Richmondtown,NY,10308


In [None]:
location_dim_tbl2.insert(0, 'Location_ID', range(1, 1+len(location_dim_tbl2)))

In [None]:
location_dim_tbl2 = location_dim_tbl2.drop('Composite_Key', axis = 1) 

In [None]:
print(len(location_dim_tbl2))
location_dim_tbl2

6


Unnamed: 0,Location_ID,latitude,longitude,Borough,City,State,ZipCode
0,1,40.8616,-73.8809,Bronx,Botanical Garden,NY,10458
2107,2,40.6215,-74.0096,Brooklyn,Dyker Heights,NY,11228
4207,3,40.7638,-73.9918,Manhattan,New York,NY,10018
6313,4,40.7557,-73.8831,Queens,Jackson Heights,NY,11372
8419,5,40.5674,-74.1343,Staten Island,Richmondtown,NY,10306
10247,6,40.5674,-74.1343,Staten Island,Richmondtown,NY,10308


In [None]:
dataset_name = 'weather'
table_id_location = dataset_name + 'location_dim_tbl2'

# Creating dataframe for the Weather Parameter Dimension Table and Transforming/Cleaning it

In [None]:
weather_param_tbl = master_columns[['Temperature_Max', 'Temperature_Avg', 'Temperature_M', 'Humidity_Max', 'Humidity_Avg', 'Humidity_M', 'Wdspeed_Max', 'Wdspeed_Avg', 'Wdspeed_M', 'Precipitation_Total']]
print(len(weather_param_tbl))
weather_param_tbl.head()

10526


Unnamed: 0,Temperature_Max,Temperature_Avg,Temperature_M,Humidity_Max,Humidity_Avg,Humidity_M,Wdspeed_Max,Wdspeed_Avg,Wdspeed_M,Precipitation_Total
0,41.2,38.1,33.9,60,51.0,45,28.4,11.3,1.3,0.0
1,39.4,35.2,32.4,56,48.0,42,25.3,10.1,0.0,0.0
2,44.7,38.6,34.5,60,49.0,36,28.4,10.8,1.6,0.0
3,35.6,25.9,12.1,62,48.0,31,36.5,10.2,1.1,0.0
4,28.8,18.5,10.0,64,45.0,21,27.5,6.2,0.0,0.0


In [None]:
# We will need to account for each of the columns of the weather parameter table to distinguish one entry from another
# The composite key will be presented as an integer value, which is the sum of all the columns in the table
weather_param_tbl.insert(0, 'Composite_Key', weather_param_tbl['Temperature_Max'] + weather_param_tbl['Temperature_Avg'] + weather_param_tbl['Temperature_M'] + weather_param_tbl['Humidity_Max'] + weather_param_tbl['Humidity_Avg'] + weather_param_tbl['Humidity_M'] + weather_param_tbl['Wdspeed_Max'] + weather_param_tbl['Wdspeed_Avg'] + weather_param_tbl['Temperature_M'] + weather_param_tbl['Humidity_Max'] + weather_param_tbl['Humidity_Avg'] + weather_param_tbl['Humidity_M'] + weather_param_tbl['Wdspeed_Max'] + weather_param_tbl['Wdspeed_Avg'] + weather_param_tbl['Wdspeed_M'] + weather_param_tbl['Precipitation_Total'])

In [None]:
print(len(weather_param_tbl))
weather_param_tbl.head()

10526


Unnamed: 0,Composite_Key,Temperature_Max,Temperature_Avg,Temperature_M,Humidity_Max,Humidity_Avg,Humidity_M,Wdspeed_Max,Wdspeed_Avg,Wdspeed_M,Precipitation_Total
0,539.8,41.2,38.1,33.9,60,51.0,45,28.4,11.3,1.3,0.0
1,502.2,39.4,35.2,32.4,56,48.0,42,25.3,10.1,0.0,0.0
2,522.3,44.7,38.6,34.5,60,49.0,36,28.4,10.8,1.6,0.0
3,462.2,35.6,25.9,12.1,62,48.0,31,36.5,10.2,1.1,0.0
4,394.7,28.8,18.5,10.0,64,45.0,21,27.5,6.2,0.0,0.0


In [None]:
weather_param_tbl2 = weather_param_tbl.copy()
weather_param_tbl2 = weather_param_tbl2.drop_duplicates()
print(len(weather_param_tbl2))
weather_param_tbl2.head()

10430


Unnamed: 0,Composite_Key,Temperature_Max,Temperature_Avg,Temperature_M,Humidity_Max,Humidity_Avg,Humidity_M,Wdspeed_Max,Wdspeed_Avg,Wdspeed_M,Precipitation_Total
0,539.8,41.2,38.1,33.9,60,51.0,45,28.4,11.3,1.3,0.0
1,502.2,39.4,35.2,32.4,56,48.0,42,25.3,10.1,0.0,0.0
2,522.3,44.7,38.6,34.5,60,49.0,36,28.4,10.8,1.6,0.0
3,462.2,35.6,25.9,12.1,62,48.0,31,36.5,10.2,1.1,0.0
4,394.7,28.8,18.5,10.0,64,45.0,21,27.5,6.2,0.0,0.0


In [None]:
weather_param_tbl2.insert(0, 'Weather_Param_ID', range(1, 1+len(weather_param_tbl2)))

In [None]:
weather_param_tbl2 = weather_param_tbl2.drop('Composite_Key', axis = 1) 

In [None]:
print(len(weather_param_tbl2))
weather_param_tbl2.head()

10430


Unnamed: 0,Weather_Param_ID,Temperature_Max,Temperature_Avg,Temperature_M,Humidity_Max,Humidity_Avg,Humidity_M,Wdspeed_Max,Wdspeed_Avg,Wdspeed_M,Precipitation_Total
0,1,41.2,38.1,33.9,60,51.0,45,28.4,11.3,1.3,0.0
1,2,39.4,35.2,32.4,56,48.0,42,25.3,10.1,0.0,0.0
2,3,44.7,38.6,34.5,60,49.0,36,28.4,10.8,1.6,0.0
3,4,35.6,25.9,12.1,62,48.0,31,36.5,10.2,1.1,0.0
4,5,28.8,18.5,10.0,64,45.0,21,27.5,6.2,0.0,0.0


In [None]:
dataset_name = 'weather'
table_id_weather_param = dataset_name + 'weather_param_tbl2'

# Creating the fact table

In [None]:
weather_fact_tbl = pd.DataFrame(date_dim_tbl['WDate'], columns = ['WDate'])

In [None]:
weather_fact_tbl.insert(1, "Location", location_dim_tbl['Composite_Key'])
weather_fact_tbl.insert(2, "Weather_Param_ID", weather_param_tbl['Composite_Key'])

In [None]:
print(len(weather_fact_tbl))
weather_fact_tbl.head()

10526


Unnamed: 0,WDate,Location,Weather_Param_ID
0,1/1/2016,Bronx-10458,539.8
1,1/2/2016,Bronx-10458,502.2
2,1/3/2016,Bronx-10458,522.3
3,1/4/2016,Bronx-10458,462.2
4,1/5/2016,Bronx-10458,394.7


In [None]:
weather_fact_tbl.insert(0, 'Unique_Key', range(1, 1+len(weather_fact_tbl)))

In [None]:
weather_fact_tbl.head()

Unnamed: 0,Unique_Key,WDate,Location,Weather_Param_ID
0,1,1/1/2016,Bronx-10458,539.8
1,2,1/2/2016,Bronx-10458,502.2
2,3,1/3/2016,Bronx-10458,522.3
3,4,1/4/2016,Bronx-10458,462.2
4,5,1/5/2016,Bronx-10458,394.7


In [None]:
dataset_name = 'weather'
table_id_weather_fact = dataset_name + 'weather_fact_tbl'

# Reading/Writing into Google Cloud Project

In [None]:
# Feeding in Json file generated from GBQ:
# How to generate the key file: 
# http://holowczak.com/creating-a-service-account-and-key-file-for-google-bigquery/3/?doing_wp_cron=1671419520.2419290542602539062500
# Uploading the JSON files is the cell below

In [None]:
! gdown --id 1BFPm2cptd9EWD_187KSlYQrMUVy9sNzf

Downloading...
From: https://drive.google.com/uc?id=1BFPm2cptd9EWD_187KSlYQrMUVy9sNzf
To: /content/cis4400proj-370416-74e9d8391c22.json
100% 2.32k/2.32k [00:00<00:00, 3.07MB/s]


In [None]:
# Reading/Writing into Google Cloud project
# https://data.cityofnewyork.us/resource/erm2-nwe9.json

# Setting up the environment variable
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'cis4400proj-370416-74e9d8391c22.json'
# How to generate the JSON key file: 
# http://holowczak.com/creating-a-service-account-and-key-file-for-google-bigquery/3/?doing_wp_cron=1671419520.2419290542602539062500

client = bigquery.Client()

# Creating a dataframe for the Agency Dimension Table and Cleaning/ Transforming it

# Loading data into Google Big Query

# Loading Date Dimension Table 

In [None]:
# Client Object: How you read and write data into Google Big Query 
client = bigquery.Client()

In [None]:
job_config = bigquery.LoadJobConfig(
    
  # Specify a partial schema. All columns are always written to the table
  # The schema is used to assist in data type definitions

  schema = [
      
      bigquery.SchemaField("Date_ID", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("WDate", bigquery.enums.SqlTypeNames.DATE),
      bigquery.SchemaField("Year", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Month", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Day", bigquery.enums.SqlTypeNames.INTEGER)  
  ],

  write_disposition = "WRITE_TRUNCATE",

)

In [None]:
job = client.load_table_from_dataframe(
    date_dim_tbl2, table_id_date, job_config= job_config
)
job.result()

ArrowTypeError: ignored

In [None]:
job2 = client.load_table_from_dataframe(
df_fact, table_id2, job_config=job_config2)

job.result()

# Loading Location Dimension Table

In [None]:
# Client Object: How you read and write data into Google Big Query 
client = bigquery.Client()

In [None]:
job_config = bigquery.LoadJobConfig(
    
  # Specify a partial schema. All columns are always written to the table
  # The schema is used to assist in data type definitions

  schema = [
      
      bigquery.SchemaField("Location_ID", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("latitude", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("longitude", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("Borough", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("City", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("State", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("ZipCode", bigquery.enums.SqlTypeNames.INTEGER)
  ],

  write_disposition = "WRITE_TRUNCATE",

)



In [None]:
job = client.load_table_from_dataframe(
    location_dim_tbl2, table_id_location, job_config= job_config
)
job.result()

ArrowTypeError: ignored

# Loading Weather Parameter Dimension

In [None]:
# Client Object: How you read and write data into Google Big Query 
client = bigquery.Client()

In [None]:
job_config = bigquery.LoadJobConfig(
    
  # Specify a partial schema. All columns are always written to the table
  # The schema is used to assist in data type definitions

  schema = [
      
      bigquery.SchemaField("Weather_Param_ID", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("Temperature_Max", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Temperature_Avg", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Temperature_M", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Humidity_Max", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Humidity_Avg", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Humidity_M", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Wdspeed_Max", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Wdspeed_Avg", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Wdspeed_M", bigquery.enums.SqlTypeNames.INTEGER),
      bigquery.SchemaField("Precipitation_Total", bigquery.enums.SqlTypeNames.INTEGER),
  ],

  write_disposition = "WRITE_TRUNCATE",

)



In [None]:
job = client.load_table_from_dataframe(
    weather_param_tbl2, table_id_weather_param, job_config= job_config
)
job.result()

ArrowTypeError: ignored

# Loading Weather Fact Table

In [None]:
# Client Object: How you read and write data into Google Big Query 
client = bigquery.Client()

In [None]:
job_config = bigquery.LoadJobConfig(
    
  # Specify a partial schema. All columns are always written to the table
  # The schema is used to assist in data type definitions

  schema = [
      
      bigquery.SchemaField("Unique_Key", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("WDate", bigquery.enums.SqlTypeNames.DATE),
      bigquery.SchemaField("Location", bigquery.enums.SqlTypeNames.STRING),
      bigquery.SchemaField("Weather_Param_ID", bigquery.enums.SqlTypeNames.INTEGER)
  ],

  write_disposition = "WRITE_TRUNCATE",

)



In [None]:
job = client.load_table_from_dataframe(
    weather_fact_tbl, table_id_weather_fact, job_config= job_config
)
job.result()

ArrowTypeError: ignored