In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot
from google.cloud import bigquery  # BigQuery client library
# Let pandas show up to 100 characters per column value when rendering frames.
pd.options.display.max_colwidth = 100

In [4]:
#
def estimate_gigabytes_scanned(query, bq_client, dry_run=False):
    """Submit ``query`` through ``bq_client`` and return the resulting query job.

    By default the query is really executed, which is what the notebook relies
    on when it calls ``.to_dataframe()`` on the returned job. Pass
    ``dry_run=True`` to have BigQuery only validate the query and report how
    much data it would scan, without running it.

    Args:
        query: SQL text to submit.
        bq_client: object exposing ``query(sql, job_config=...)``; in this
            notebook a ``bigquery.Client``.
        dry_run: when True, attach a job config with ``dry_run`` enabled.
            A dry-run job has no rows, so do not call ``.to_dataframe()`` on
            it; read the job's byte statistics instead.

    Returns:
        Whatever ``bq_client.query`` returns (a query job).
    """
    if not dry_run:
        # Unchanged default path: run the query for real.
        return bq_client.query(query)

    # see https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.dryRun
    my_job_config = bigquery.job.QueryJobConfig()
    my_job_config.dry_run = True
    # The config must be handed to the client, otherwise it has no effect.
    return bq_client.query(query, job_config=my_job_config)

In [5]:
# Create the BigQuery client and fetch handles for the public Chicago crime table.
client = bigquery.Client()

# NOTE(review): the "hn_" prefix does not match the data; these names point at
# the 'chicago_crime' dataset and its 'crime' table in 'bigquery-public-data'.
hn_dataset_ref = client.dataset('chicago_crime', project='bigquery-public-data')
hn_dset = client.get_dataset(hn_dataset_ref)
hn_full = client.get_table(hn_dset.table('crime'))

In [6]:
# Daily crime counts: truncate each crime timestamp to its day, then count
# unique_key values per day, ordered by date.
# NOTE(review): despite the variable name, the query reads the Chicago crime
# table and has no year filter, so it covers every date in the table.
accidents_query_2015_all = """
                           SELECT 
                               CAST(TIMESTAMP_TRUNC(crime.date, DAY) AS date) AS date, 
                               COUNT(crime.unique_key) AS crime_count
                           FROM 
                               `bigquery-public-data.chicago_crime.crime` AS crime
                           GROUP BY 
                               date
                           ORDER BY 
                               date;
                           """ 

In [194]:
# Submit the daily crime-count query; to_dataframe() materialises the result,
# so the query is actually executed here.
accidents_2015 = estimate_gigabytes_scanned(accidents_query_2015_all, client)
df15 = accidents_2015.to_dataframe()

# Only one frame is fetched, so no concatenation is needed.
# NOTE: crime_results is the same object as df15 (no copy is made), so any
# in-place edit of one is visible through the other.
crime_results = df15


In [16]:
# NOTE(review): weather_ds is not referenced again in the visible cells.
weather_ds = client.dataset('sample', project='accuweather-com')

# Fetch at most 1000 rows from the AccuWeather sample table and save them to CSV
# (without the DataFrame index).
di_query = "SELECT * FROM `accuweather-com.sample.sample` LIMIT 1000"
acu_weather = estimate_gigabytes_scanned(di_query, client)
df_acu_weather = acu_weather.to_dataframe()
df_acu_weather.to_csv("acu_weather_date.csv", index=False)

In [197]:
# NOTE(review): the three query strings below are not referenced by any visible
# cell; they look like earlier drafts of the per-year weather query.

# Per-day average/max/min temperature from the 2016 GSOD table for the station
# named 'CENTRAL PARK' in NY, US.
query = """
SELECT 
 CAST(CONCAT(w.year,'-',w.mo,'-',w.da) AS date) AS date,
 AVG(w.temp) AS avg_temp,
 MAX(w.max) AS max_temp,
 MIN(w.min) AS min_temp
FROM        `bigquery-public-data.noaa_gsod.gsod2016`  w
INNER JOIN  `bigquery-public-data.noaa_gsod.stations`  s
 ON w.stn=s.usaf
 AND w.wban=s.wban
WHERE
 s.country='US'
 AND s.state = 'NY'
 AND s.name='CENTRAL PARK'
GROUP BY date
ORDER BY date
"""

# Raw (un-aggregated) 2016 readings for stations inside the same lat/lon box
# that the per-year weather loop uses.
basic_query = """
                SELECT
                w.stn,
                w.max,
w.min,
w.prcp,
w.dewp
                FROM
                  `bigquery-public-data.noaa_gsod.gsod2016` AS w
                INNER JOIN
                   `bigquery-public-data.noaa_gsod.stations` AS s
                  ON w.stn = s.usaf
                  AND w.wban = s.wban
                WHERE
                  lat > 41.7
                  AND lat < 42
                  AND lon > -87.7
                  AND lon < -87.5

                """

# All 2016 rows for a single station id.
ugh = """
SELECT * FROM `bigquery-public-data.noaa_gsod.gsod2016` AS w WHERE w.stn = '998497'"""
# One DataFrame of daily weather aggregates per yearly GSOD table (2001-2018);
# the list is concatenated by a later cell.
frames = []

for i in range(2001,2019):

    # GSOD marks a missing observation with a sentinel value instead of NULL
    # (9999.9 for temp/max/min, 999.9 for wdsp/mxpsd). NULLIF turns those into
    # NULL so AVG ignores them instead of averaging them into the daily mean.
    # The lat/lon bounds select the stations.lat / stations.lon range used for
    # the Chicago area throughout this notebook.
    weather_query = """
                SELECT
                  CAST(CONCAT(w.year,'-',w.mo,'-',w.da) AS date) AS date,
                    AVG(NULLIF(w.temp, 9999.9)) AS avg_temp,
                     AVG(NULLIF(w.max, 9999.9)) AS max_temp,
                     AVG(NULLIF(w.min, 9999.9)) AS min_temp,
                     AVG(NULLIF(CAST(w.wdsp AS FLOAT64), 999.9)) AS mean_wind_speed,
                     AVG(NULLIF(CAST(w.mxpsd AS FLOAT64), 999.9)) AS max_sus_wind_speed
                FROM
                  `bigquery-public-data.noaa_gsod.gsod""" + str(i) + """` AS w
                INNER JOIN
                   `bigquery-public-data.noaa_gsod.stations` AS s
                  ON w.stn = s.usaf
                  AND w.wban = s.wban
                WHERE
                  lat > 41.7
                  AND lat < 42
                  AND lon > -87.7
                  AND lon < -87.5
                GROUP BY date
                ORDER BY date;
                """
    # to_dataframe() materialises the rows, so each yearly query is really run.
    chicago_weather = estimate_gigabytes_scanned(weather_query, client)
    dfweather = chicago_weather.to_dataframe()
    frames.append(dfweather)

    

In [198]:
# Stack the per-year weather frames into one frame keyed by calendar date.
results = pd.concat(frames)
results = results.set_index('date')

# Key the crime counts by date as well. The guard keeps this cell re-runnable:
# once 'date' is the index, a second set_index('date') would raise KeyError.
if crime_results.index.name != 'date':
    crime_results = crime_results.set_index('date')

# Replace newline characters in any string values with spaces before export.
results = results.replace('\\n',' ', regex=True)
# 'date' is the index now, so the index must be written; index=False would
# drop the dates from the CSV.
results.to_csv("weather_date.csv")

In [202]:
# Place the weather and crime frames side by side, aligned on their indexes.
# pd.concat with axis=1 keeps the union of index values, so a date present in
# only one frame gets NaN in the other frame's columns; sort=True orders the
# combined index.
final_frames = [results, crime_results]

merge=pd.concat(final_frames, axis=1, sort=True)


# The index is written to the CSV (to_csv default).
merge.to_csv("all_crime_weather.csv")

In [None]:
# Render the daily crime-count frame for inspection.
# NOTE(review): the previous comment here spoke of cleaning up newline
# characters, but this cell performs no cleanup; it only displays df15.

from IPython.display import display
display(df15)

In [195]:
# Export the daily crime counts to CSV.
# Replace newline characters in any string values with spaces first.
crime_results = crime_results.replace('\\n',' ', regex=True)

# Depending on which cells have run, 'date' is either a column or the index.
# Move it back to a column when it is the index, so that index=False never
# drops the dates from the file.
crime_export = crime_results.reset_index() if crime_results.index.name == 'date' else crime_results
crime_export.to_csv("crime_date.csv", index=False)
