In [None]:
!pip install fasttext
import fasttext
import pandas as pd
import numpy as np
import spacy
import gc

In [None]:
# BigQuery / Cloud Storage helper functions (Ruben's utilities)
from google.cloud import bigquery
from google.oauth2 import service_account
import json, os

# Google Cloud services
# The service-account key file must be uploaded to the root directory of the
# Google Colab file system before this cell is run.
gcp_service_account_credentials_json_filename = 'epfl-course-f41b0ed796f9.json'
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_service_account_credentials_json_filename
credentials = service_account.Credentials.from_service_account_file(
    gcp_service_account_credentials_json_filename,
    scopes=[
        'https://www.googleapis.com/auth/bigquery',
        'https://www.googleapis.com/auth/drive',
    ],
)
project_id = 'epfl-course'
# Single shared client used by the helper functions below. It is built from the
# explicitly scoped credentials; a second, argument-less bigquery.Client() used
# to overwrite it here, silently dropping the scopes and project set above.
bigquery_client = bigquery.Client(credentials=credentials, project=project_id)

def bq_execute_query(query, mode="INTERACTIVE", wait=False, to_dataframe=False):
    """Submit a SQL query to BigQuery through the shared ``bigquery_client``.

    Args:
        query: SQL text to execute.
        mode: Job priority, "INTERACTIVE" (default) or "BATCH". BATCH jobs do
            not count toward the concurrent rate limit. Case-insensitive.
        wait: When true, block until the job finishes and print its result.
        to_dataframe: When true, return the result as a DataFrame instead of
            the query job.

    Returns:
        ``query_job.to_dataframe()`` if ``to_dataframe`` is true, otherwise the
        query job returned by ``bigquery_client.query``.

    Raises:
        ValueError: If ``mode`` is neither "INTERACTIVE" nor "BATCH".
    """
    priority = str(mode).upper()
    if priority not in ("INTERACTIVE", "BATCH"):
        raise ValueError(
            "mode must be 'INTERACTIVE' or 'BATCH', got {!r}".format(mode))

    # The priority must be the bare value ("INTERACTIVE"/"BATCH"); the previous
    # code passed the literal text "bigquery.QueryPriority.<mode>", which is
    # not a valid priority.
    job_config = bigquery.QueryJobConfig(priority=priority)
    query_job = bigquery_client.query(query, job_config=job_config)
    if wait:
        print("Executed BQ query: ", query_job.result())
    if to_dataframe:
        return query_job.to_dataframe()
    return query_job

def upload_df_to_bq(df, bq_destination_table, write_disposition="WRITE_APPEND"):
    """Load a DataFrame into a BigQuery table and wait for the job to finish.

    Args:
        df: DataFrame handed to ``bigquery_client.load_table_from_dataframe``.
        bq_destination_table: Destination table, e.g.
            "epfl-course.dataset.table". It is created if it does not exist.
        write_disposition: "WRITE_APPEND" (default) adds rows to the table;
            "WRITE_TRUNCATE" deletes the old data before inserting the new.
    """
    load_settings = bigquery.LoadJobConfig(
        create_disposition="CREATE_IF_NEEDED",
        write_disposition=write_disposition,
    )
    load_job = bigquery_client.load_table_from_dataframe(
        df,
        bq_destination_table,
        job_config=load_settings,
    )
    # result() blocks until the load job has completed.
    outcome = load_job.result()
    print("Uploaded DF to BQ: ", outcome)

def upload_json_to_bq(json_object, bq_table, schema=None):
    """Append newline-delimited JSON to a BigQuery table (best effort).

    Failures are reported with a printed message instead of being raised, so a
    failed upload does not stop the notebook.

    Args:
        json_object: Source passed to ``bigquery_client.load_table_from_file``
            (presumably a file object opened in binary mode - TODO confirm
            against callers).
        bq_table: Destination table.
        schema: Optional explicit schema for the load job. When omitted, no
            schema is set and autodetect stays off, so the destination table
            is expected to exist already.
    """
    try:
        job_config = bigquery.LoadJobConfig()
        job_config.autodetect = False  # Change to True if the table on BQ does not exist
        job_config.max_bad_records = 0
        job_config.ignore_unknown_values = True
        job_config.source_format = 'NEWLINE_DELIMITED_JSON'
        job_config.create_disposition = "CREATE_IF_NEEDED"
        job_config.write_disposition = "WRITE_APPEND"
        # The previous code called a non-existent job_config.schema_to_json()
        # with an undefined name; an explicit schema is now an optional argument.
        if schema is not None:
            job_config.schema = schema
        job = bigquery_client.load_table_from_file(json_object, bq_table, job_config=job_config)
        outcome = job.result()  # blocks until the load job finishes
        if job.state != 'DONE':
            raise RuntimeError("load job ended in state {}".format(job.state))
        print("Loaded JSON to BQ table {} as job {}".format(bq_table, outcome))
    except Exception as error:
        # `job` may not exist when the failure happened before or during the
        # load call, so the handler reports the exception instead.
        print("ERROR Could not load JSON to BQ table {}: {}".format(bq_table, error))

def upload_file_to_gcs(filename, new_filename, folder=''):
    """Upload a local file to the bucket named by ``CLOUD_STORAGE_BUCKET``.

    NOTE(review): ``storage_client`` and ``CLOUD_STORAGE_BUCKET`` are not
    defined in the visible part of this notebook - confirm they exist before
    calling this helper.

    Args:
        filename: Path of the local file to upload.
        new_filename: Name the object gets inside the bucket.
        folder: Optional prefix; when non-empty the object is stored as
            "<folder>/<new_filename>".
    """
    # Only a non-empty folder gets the trailing separator.
    prefix = folder + '/' if folder != '' else folder
    target_bucket = storage_client.get_bucket(CLOUD_STORAGE_BUCKET)
    object_name = '{folder}{file}'.format(folder=prefix, file=new_filename)
    target_bucket.blob(object_name).upload_from_filename(filename)

In [None]:
import calendar
import datetime
from datetime import timedelta

# Years to process; switch to range(2016, 2021) for the remaining years.
years = range(2015, 2016)

# Build one "'<first day>' and '<last day>'" clause per calendar month. Each
# clause is later appended to the query prefix that ends in "between ".
# Every month, December included, goes through the same code path so all
# clauses use the same (single) quoting.
dates = []
for year in years:
  for month in range(1, 13):
    # monthrange()[1] is the number of days in the month, which also covers
    # December and leap-year Februaries without any special case.
    last_day = calendar.monthrange(year, month)[1]
    startDate = datetime.date(year, month, 1).strftime('%Y-%m-%d')
    stopDate = datetime.date(year, month, last_day).strftime('%Y-%m-%d')
    dates.append("'" + startDate + "' and '" + stopDate + "'")

# Load the fastText model from ./lid.176.bin (path relative to the working
# directory); it is used further down to predict each quotation's language.
fmodel = fasttext.load_model('./lid.176.bin')

# Query prefix shared by all periods: selects quoteId and quotation from the
# quotes table. It deliberately ends with "between " so that one entry of
# `dates` can be appended to complete the WHERE clause. The date being
# filtered on is parsed from the first 10 characters of the quote id.
trunc_query = """
SELECT 
  quoteId,
  quotation,
FROM
  `epfl-course.ada_project.quotes`
WHERE
  DATE(LEFT(quoteid, 10)) between """

In [None]:
# Display the generated date-range clauses to check them before querying
dates

In [None]:
# Destination table, client and load configuration are identical for every
# period, so they are created once instead of on every iteration.
table_id = "epfl-course.ada_project.lang_detect"
project_id = 'epfl-course'
bigquery_client = bigquery.Client(credentials=credentials, project=project_id)

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("quoteId", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("quotation", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("language", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("score", bigquery.enums.SqlTypeNames.FLOAT64),
        bigquery.SchemaField("date", bigquery.enums.SqlTypeNames.DATETIME),
    ],
)

# fastText labels look like "__label__en"; stripping the prefix (instead of
# keeping the last two characters) also preserves three-letter language codes.
label_prefix = '__label__'

for dat in dates:
  # Complete the query prefix with this period's date range and run it
  period_query = trunc_query + dat + "\n"
  df = bq_execute_query(period_query, to_dataframe=True)

  # Nothing to classify or upload for a period without quotes
  if df.empty:
    print('No quotes for period: ', dat)
    continue

  # The query returns exactly two columns: the quote id and the quotation
  df.columns = ['quoteId', 'quotation']

  # Predict the language of every quotation. predict() handles one line at a
  # time and raises on embedded newlines, so they are replaced by spaces.
  # Each prediction is a (labels, probabilities) pair; only the top entry of
  # each is kept.
  predictions = df['quotation'].map(
      lambda text: fmodel.predict(text.replace('\n', ' ')))
  df['language'] = predictions.map(lambda p: p[0][0].replace(label_prefix, ''))
  df['score'] = predictions.map(lambda p: float(p[1][0]))

  # The first 10 characters of the quote id hold the date (YYYY-MM-DD)
  df['date'] = pd.to_datetime(df['quoteId'].str[:10])

  # Push the period's rows to BigQuery and wait for the load job to complete
  job = bigquery_client.load_table_from_dataframe(
      df, table_id, job_config=job_config
  )  # Make an API request.
  job.result()  # Wait for the job to complete.

  table = bigquery_client.get_table(table_id)  # Make an API request.
  print(
      "Loaded {} rows to {} (table now has {} rows and {} columns)".format(
          len(df), table_id, table.num_rows, len(table.schema)
      )
  )
  print('Done with period: ', dat)