In [None]:
from gensim.models import LdaModel
import gensim
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd
import numpy as np
import spacy
import gc
from gensim.test.utils import datapath

In [None]:
#Ruben's stuff
from google.cloud import bigquery
from google.oauth2 import service_account
import json, os

# Google Cloud services
# The service-account key file is referenced by a relative filename, so it has to be
# present in the notebook's working directory before this cell runs.
gcp_service_account_credentials_json_filename = 'epfl-course-f41b0ed796f9.json' #need to upload the json credential files to the root directory of the google colab files
# Also expose the key file path through the GOOGLE_APPLICATION_CREDENTIALS environment variable.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_service_account_credentials_json_filename
# Explicit credentials carrying both the BigQuery and the Drive scope.
credentials = service_account.Credentials.from_service_account_file(gcp_service_account_credentials_json_filename, scopes=['https://www.googleapis.com/auth/bigquery', 'https://www.googleapis.com/auth/drive'])
project_id = 'epfl-course'
bigquery_client = bigquery.Client(credentials=credentials, project=project_id)
# NOTE(review): the next line rebinds `bigquery_client`, so the client built above
# (explicit credentials, scopes and project) is discarded. The argument-less client
# presumably resolves credentials from the environment variable set above - confirm
# which of the two clients is actually intended and drop the other.
bigquery_client = bigquery.Client()

def bq_execute_query(query, mode="INTERACTIVE", wait=False, to_dataframe=False):
    """Submit a SQL query through the module-level ``bigquery_client``.

    Parameters
    ----------
    query : str
        SQL text to execute.
    mode : str
        Query priority, either ``"INTERACTIVE"`` or ``"BATCH"``.
    wait : bool
        When true, block until the job finishes and print its result object.
    to_dataframe : bool
        When true, return the rows as a DataFrame instead of the job handle.

    Returns
    -------
    The value of ``query_job.to_dataframe()`` when ``to_dataframe`` is true,
    otherwise the query job object returned by ``bigquery_client.query``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the two supported priority names.
    """
    if mode not in ("INTERACTIVE", "BATCH"):
        raise ValueError(
            "mode must be 'INTERACTIVE' or 'BATCH', got {!r}".format(mode)
        )
    # The priority field takes the bare priority name. The previous code sent the
    # literal text "bigquery.QueryPriority.<mode>", which is not a valid value.
    job_config = bigquery.QueryJobConfig(priority=mode)
    query_job = bigquery_client.query(query, job_config=job_config)
    if wait:
        print("Executed BQ query: ", query_job.result())
    if to_dataframe:
        return query_job.to_dataframe()
    return query_job

def upload_df_to_bq(df, bq_destination_table, write_disposition="WRITE_APPEND"):
    """Load a DataFrame into a BigQuery table and wait for the load job.

    ``bq_destination_table`` is the destination table identifier, e.g.
    "epfl-course.dataset.table". The table is created if needed;
    ``write_disposition`` is forwarded to the load job configuration
    (pass "WRITE_TRUNCATE" to replace the existing rows instead of appending).
    The job result is printed; nothing is returned.
    """
    load_options = bigquery.LoadJobConfig(
        write_disposition=write_disposition,
        create_disposition="CREATE_IF_NEEDED",
    )
    load_job = bigquery_client.load_table_from_dataframe(
        df,
        bq_destination_table,
        job_config=load_options,
    )
    # result() blocks until the load job has completed.
    outcome = load_job.result()
    print("Uploaded DF to BQ: ", outcome)

def upload_json_to_bq(json_object, bq_table, schema=None):
    """Append newline-delimited JSON to a BigQuery table (best effort).

    Parameters
    ----------
    json_object
        File-like object handed to ``bigquery_client.load_table_from_file``.
    bq_table
        Destination table passed to the same call.
    schema : optional
        Schema assigned to the load job configuration. When omitted, no schema is
        set; since autodetection is disabled this presumably requires the
        destination table to exist already - confirm against the BigQuery docs.

    Returns
    -------
    None. Success or failure is reported with ``print``; errors are not re-raised,
    matching the original best-effort behaviour.
    """
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = False  # Change to True if the table on BQ does not exist
    job_config.max_bad_records = 0
    job_config.ignore_unknown_values = True
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    job_config.create_disposition = "CREATE_IF_NEEDED"
    job_config.write_disposition = "WRITE_APPEND"
    # The previous code called job_config.schema_to_json(schema_table): LoadJobConfig
    # has no such method and `schema_table` was never defined, so every call failed
    # before the load started. An explicit schema is now an optional argument.
    if schema is not None:
        job_config.schema = schema

    try:
        job = bigquery_client.load_table_from_file(json_object, bq_table, job_config=job_config)
        result = job.result()  # blocks until the job finishes, raises on failure
        assert job.job_type == 'load'
        assert job.state == 'DONE'
    except Exception as error:
        # Report the underlying error. The old handler referenced `job`, which may
        # not be bound yet, and called job.result() again, which raised a second time.
        print("ERROR Could not load JSON to BQ table {}: {!r}".format(bq_table, error))
        return
    print("Loaded JSON to BQ table {} as job {}".format(bq_table, result))

def upload_file_to_gcs(filename, new_filename, folder=''):
    """Upload a local file to the Cloud Storage bucket under ``folder``.

    ``filename`` is the local path to upload and ``new_filename`` the object name
    inside the bucket. A non-empty ``folder`` is used as a "<folder>/" prefix.
    Relies on the globals ``storage_client`` and ``CLOUD_STORAGE_BUCKET``.
    """
    # NOTE(review): neither storage_client nor CLOUD_STORAGE_BUCKET is defined in the
    # visible cells - confirm they are provided elsewhere before calling this.
    prefix = folder + '/' if folder != '' else folder
    object_name = '{folder}{file}'.format(folder=prefix, file=new_filename)
    target_bucket = storage_client.get_bucket(CLOUD_STORAGE_BUCKET)
    target_bucket.blob(object_name).upload_from_filename(filename)

In [None]:
# Bare string literal: nothing is executed against BigQuery here, the cell only
# evaluates (and echoes) the text. Presumably kept as a reference to the table
# variant without the `_stems` suffix that the next cell queries - TODO confirm.
'''
SELECT *  
FROM `epfl-course.ada_project.merged_table_with_annotations_and_languages`
'''

In [None]:
#get the data
# Pull every column of the stems table into a DataFrame via bq_execute_query.
sexy_query = """
SELECT *  
FROM `epfl-course.ada_project.merged_table_with_annotations_and_languages_stems`
"""
df = bq_execute_query(sexy_query, to_dataframe=True)
# Use quoteId as the row index; the quoteId column itself is kept as well.
df.index = df['quoteId']
# Preview of the first rows.
df.head(10)

In [None]:
#paths of the main model and of its dictionary
name_model= '1_layer_model.txt' 
name_dictionary = '1_layer_model.txt.id2word'

#load the main LDA model and the gensim dictionary from the paths above
main_lda = LdaModel.load(name_model)
dictionary = gensim.corpora.Dictionary.load(name_dictionary)

In [None]:
#define the strings
# NOTE(review): name_model / name_dictionary are reused from the previous cell and
# rebound to the second model's files. name_dictionary is assigned but not loaded
# here; the later corpus cells keep using `dictionary` loaded with the main model.
name_model= '1_layer_model10.txt' 
name_dictionary = '1_layer_model10.txt.id2word'

#load the second LDA model (used for the sub-topic predictions)
sub_lda = LdaModel.load(name_model)

In [None]:
#filter out on language
# Plain boolean masks combined with `&`. The previous `1*df.language == 'en'` was
# parsed as `(1*df.language) == 'en'` and only worked because multiplying a string
# by 1 leaves it unchanged.
sel1 = df.score > 0.80
sel2 = df.language == 'en'
sel3 = sel1 & sel2
# NOTE(review): sel3 is only reported here; no visible cell uses it to subset df.
print('Kept', round(100*sel3.sum()/len(df), 2), '% of points')

In [None]:
#generate corpus
corpus = [dictionary.doc2bow(doc) for doc in df.stems.values]
main_topic = main_lda[corpus]

#topic ids of the main model, grouped by the category they are merged into
politics = [8, 5, 4, 0, 3]
sport = [7]
misc = [1, 2]
art = [6]

#(label, topic ids) pairs; the order fixes the key order of the score dict and
#therefore which label wins when two categories tie
main_groups = [
    ('politics&biz&others', politics),
    ('sport', sport),
    ('art', art),
    ('miscellaneous', misc),
]

#reformat topics: one dict per document with the per-category scores, the winning
#category with its score, and the total of all category scores
prediction_reformatted = []
for doc_topics in main_topic:
    category_scores = {label: 0 for label, _ in main_groups}
    for entry in doc_topics:
        #entry[0] is the topic id, entry[1] its weight for this document
        for label, topic_ids in main_groups:
            if entry[0] in topic_ids:
                category_scores[label] += entry[1]
                break

    best_label = max(category_scores, key=category_scores.get)
    total = (
        category_scores['politics&biz&others']
        + category_scores['sport']
        + category_scores['art']
        + category_scores['miscellaneous']
    )
    prediction_reformatted.append({
        'scores': category_scores,
        'main': {'main': best_label, 'score': category_scores[best_label]},
        'tot': total,
    })

In [None]:
#creating df columns out of the reformatted detected topics
#single pass over the predictions: winning score, winning label, full score dict
confidence, topic, spectrum = [], [], []
for prediction in prediction_reformatted:
    winner = prediction['main']
    confidence.append(winner['score'])
    topic.append(winner['main'])
    spectrum.append(prediction['scores'])

df['main_topic_score'] = confidence
df['main_topic'] = topic
df['main_topic_spectrum'] = spectrum

In [None]:
# Upload the frame with the main-topic columns to the BigQuery table below.
table_id = "epfl-course.ada_project.geo_annotated_main_topic"
project_id = 'epfl-course'
# NOTE(review): this credentialed client is rebound here but not used in this cell;
# the load below goes through `client`, which is created without arguments.
bigquery_client = bigquery.Client(credentials=credentials, project=project_id)

# Load job configuration with no options set.
job_config = bigquery.LoadJobConfig(

)

client = bigquery.Client()
job = client.load_table_from_dataframe(
    df, table_id, job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

# Re-read the table metadata to report its row and column counts.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)

In [None]:
#filtering only the politics&biz&others quotes
is_politics = df['main_topic'] == 'politics&biz&others'
#retaining only those with high confidence (main topic score above 0.5)
pol = df[is_politics & (df['main_topic_score'] > 0.5)].copy()

#share of politics quotes kept, as a real percentage: the previous print showed the
#raw fraction next to a '%' sign and divided by zero when no politics quote existed
n_politics = int(is_politics.sum())
retained_pct = round(100 * len(pol) / n_politics, 2) if n_politics else 0.0
print('With 0.5 confidence, retaining only', retained_pct, '%')

In [None]:
#generate corpus (politics subset only) and run the second model on it
corpus = [dictionary.doc2bow(doc) for doc in pol.stems.values]
sub_topic = sub_lda[corpus]

#topic ids of the second model, grouped by the category they are merged into
unknown = [0, 6]
biz_tech = [1]
econ = [2]
war = [3]
politics = [4]
leg = [5]
social = [7, 8]
env = [9]

#(label, topic ids) pairs; the order fixes the key order of the spectrum dict and,
#because sorted() is stable, the ranking of categories with equal scores.
#The label strings are written to the output data and are therefore left untouched.
sub_groups = [
    ('?', unknown),
    ('businness and tech', biz_tech),
    ('economy&market', econ),
    ('politics', politics),
    ('violence&cooperation', war),
    ('legislation&law', leg),
    ('social issues', social),
    ('environments', env),
]

#reformat topics: one dict per document with the per-category scores and the three
#best-scoring categories
prediction_reformatted = []
for doc in sub_topic:
    spectrum = {label: 0 for label, _ in sub_groups}
    for t in doc:
        #t[0] is the topic id, t[1] its weight for this document
        for label, topic_ids in sub_groups:
            if t[0] in topic_ids:
                spectrum[label] += t[1]
                break

    #categories ranked by descending score
    s = sorted(spectrum, key=spectrum.get, reverse=True)
    erf = {
        'second_topic_spectrum': spectrum,
        'second_topic_1': s[0],
        'second_topic_2': s[1],
        #third-ranked category: the previous code read s[3], i.e. the fourth one
        'second_topic_3': s[2],
    }
    prediction_reformatted.append(erf)

In [None]:
#creating df columns out of the reformatted detected topics
#each prediction dict key becomes a column of the same name on `pol`
for column in ('second_topic_1', 'second_topic_2', 'second_topic_3', 'second_topic_spectrum'):
    pol[column] = [prediction[column] for prediction in prediction_reformatted]

In [None]:
from pprint import pprint

#pick one random row of `pol` and show its quotation with the predicted sub-topics
i = np.random.randint(0, len(pol))
sample_row = pol.iloc[i]
separator = '\n *********************'

pprint(i)
pprint(sample_row.quotation)
print(separator)
for prefix, column in (('1:', 'second_topic_1'), ('2:', 'second_topic_2'), ('3:', 'second_topic_3')):
    pprint(prefix + sample_row[column])
print(separator)
pprint(sample_row['second_topic_spectrum'])

In [None]:
#drop the quoteId column from both frames; the ids stay available as the index that
#was set when the data was loaded. errors='ignore' keeps the cell re-runnable: the
#previous in-place drop raised KeyError on a second execution.
df = df.drop(columns='quoteId', errors='ignore')
pol = pol.drop(columns='quoteId', errors='ignore')

In [None]:
# NOTE(review): `asd` is not defined in any visible cell, so evaluating it raises
# NameError and stops a "Run All" before the upload cell below. Presumably a
# deliberate stop - confirm, then replace it with an explicit guard or remove it.
asd

In [None]:
# Upload the politics subset with its sub-topic columns to the BigQuery table below.
table_id = "epfl-course.ada_project.geo_annotated_sub_topics"
project_id = 'epfl-course'
# NOTE(review): this credentialed client is rebound here but not used in this cell;
# the load below goes through `client`, which is created without arguments.
bigquery_client = bigquery.Client(credentials=credentials, project=project_id)

# Load job configuration with no options set.
job_config = bigquery.LoadJobConfig(

)

client = bigquery.Client()
job = client.load_table_from_dataframe(
    pol, table_id, job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

# Re-read the table metadata to report its row and column counts.
table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)
