In [None]:
#!pip install pyLDAvis==2.1.2

In [None]:
from gensim.models import LdaModel
import gensim
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd
import numpy as np
from gensim.test.utils import datapath
import spacy
import gc

In [None]:
#Ruben's stuff
from google.cloud import bigquery
from google.oauth2 import service_account
import json, os

# Google Cloud services
# Service-account key file; it must be uploaded to the root directory of the Google Colab files.
gcp_service_account_credentials_json_filename = 'epfl-course-f41b0ed796f9.json'
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_service_account_credentials_json_filename
credentials = service_account.Credentials.from_service_account_file(
    gcp_service_account_credentials_json_filename,
    scopes=['https://www.googleapis.com/auth/bigquery', 'https://www.googleapis.com/auth/drive'],
)
project_id = 'epfl-course'
# Build the client once, from the explicit credentials and project. Re-creating it with a bare
# bigquery.Client() would replace this client and drop the scopes and project requested above.
bigquery_client = bigquery.Client(credentials=credentials, project=project_id)

def bq_execute_query(query, mode="INTERACTIVE", wait=False, to_dataframe=False):
    """Run a SQL query through the module-level ``bigquery_client``.

    Args:
        query: SQL text to execute.
        mode: Name of a ``bigquery.QueryPriority`` member, i.e. "INTERACTIVE" or
            "BATCH". BATCH does not count toward the concurrent rate limit.
        wait: If true, block until the job finishes and print its result.
        to_dataframe: If true, return the rows as a DataFrame instead of the job.

    Returns:
        The result of ``query_job.to_dataframe()`` when ``to_dataframe`` is set,
        otherwise the query job itself.

    Raises:
        AttributeError: If ``mode`` is not a member of ``bigquery.QueryPriority``.
    """
    # Resolve the enum member; the API expects its value ("INTERACTIVE"/"BATCH"),
    # not the text "bigquery.QueryPriority.<mode>".
    priority = getattr(bigquery.QueryPriority, mode)
    job_config = bigquery.QueryJobConfig(priority=priority)
    query_job = bigquery_client.query(query, job_config=job_config)
    if wait:
        print("Executed BQ query: ", query_job.result())
    if to_dataframe:
        return query_job.to_dataframe()
    return query_job

def upload_df_to_bq(df, bq_destination_table, write_disposition="WRITE_APPEND"):
    """Load a DataFrame into a BigQuery table and print the finished job.

    Args:
        df: DataFrame to upload.
        bq_destination_table: Destination table, e.g. "epfl-course.dataset.table".
        write_disposition: "WRITE_APPEND" (default) adds rows; "WRITE_TRUNCATE"
            deletes the old table data before inserting the new data.
    """
    load_config = bigquery.LoadJobConfig(
        create_disposition="CREATE_IF_NEEDED",
        write_disposition=write_disposition,
    )
    load_job = bigquery_client.load_table_from_dataframe(
        df,
        bq_destination_table,
        job_config=load_config,
    )
    # result() blocks until the load job has completed.
    finished_job = load_job.result()
    print("Uploaded DF to BQ: ", finished_job)

def upload_json_to_bq(json_object, bq_table):
    """Append newline-delimited JSON from a file object to a BigQuery table.

    The upload is best-effort: a failure is reported on stdout and not re-raised.

    Args:
        json_object: File-like object holding newline-delimited JSON.
        bq_table: Destination table passed to ``load_table_from_file``.
    """
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = False  # Change to True if the table on BQ does not exist
    job_config.max_bad_records = 0
    job_config.ignore_unknown_values = True
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    job_config.create_disposition = "CREATE_IF_NEEDED"
    job_config.write_disposition = "WRITE_APPEND"
    # No schema is attached here: LoadJobConfig has no schema_to_json method, and no
    # schema object is defined in this notebook. With autodetect off the load
    # presumably relies on the destination table's schema — TODO confirm.
    try:
        job = bigquery_client.load_table_from_file(json_object, bq_table, job_config=job_config)
        print("Loaded JSON to BQ table {} as job {}".format(bq_table, job.result()))
        assert job.job_type == 'load'
        assert job.state == 'DONE'
    except Exception as error:
        # Report the actual error; `job` may not exist if the request itself failed.
        print("ERROR Could not load JSON to BQ table {}: {!r}".format(bq_table, error))

def upload_file_to_gcs(filename, new_filename, folder=''):
    """Upload a local file to the Cloud Storage bucket under a new name.

    Args:
        filename: Path of the local file to upload.
        new_filename: Object name to give the file inside the bucket.
        folder: Optional folder prefix; '' puts the object at the bucket root.

    NOTE(review): ``storage_client`` and ``CLOUD_STORAGE_BUCKET`` are not defined in
    the cells shown here — they must exist before this function is called.
    """
    # A non-empty folder becomes a "folder/" prefix of the object name.
    prefix = '' if folder == '' else folder + '/'
    target_bucket = storage_client.get_bucket(CLOUD_STORAGE_BUCKET)
    object_name = '{folder}{file}'.format(folder=prefix, file=new_filename)
    target_bucket.blob(object_name).upload_from_filename(filename)

In [None]:
#We train LDA on two months of quotes
# NOTE(review): 2016 was a leap year, so the range below leaves out 2016-02-29 — confirm this is intended.
sexy_query = """
SELECT 
  quoteId,
  lemmas,
  stems,
  geoNames,
FROM
  `epfl-course.ada_project.ste`
WHERE
  DATE(LEFT(quoteid, 10)) between "2016-01-01" and "2016-02-28"
"""
df = bq_execute_query(sexy_query, to_dataframe=True)
# The first 10 characters of quoteId are the quote date, so min/max give the covered range.
print('Dates from', df.quoteId.min()[:10])
print('Dates to', df.quoteId.max()[:10])
df.head()

In [None]:
# Import the locally saved version of the quotes, filtered by language.
# The file must provide a quoteId column: a later cell uses it as the index to select rows of df.
good_quotes = pd.read_csv('filtered_quotes.csv.gz')
good_quotes.head()

In [None]:
# Compare the size of the language-filtered set with the full query result.
n_good = len(good_quotes)
n_all = len(df)
print('Good quotes', n_good)
print('all quotes', n_all)
print('Discarded quotes', n_all - n_good)

In [None]:
#select only the quotes in English
# Index both frames by quoteId. The guards keep this cell re-runnable: once quoteId is the
# index it is no longer a column, so a second set_index('quoteId') would raise KeyError.
if df.index.name != 'quoteId':
    df = df.set_index('quoteId')
if good_quotes.index.name != 'quoteId':
    good_quotes = good_quotes.set_index('quoteId')
# Keep only the rows whose quoteId appears in good_quotes.
df = df.loc[good_quotes.index]
#create the dictionary
dictionary = gensim.corpora.Dictionary(df.stems.values)
#use the dictionary to get the bow
corpus = [dictionary.doc2bow(doc) for doc in df.stems.values]

In [None]:
#train model
# 9-topic LDA on the bag-of-words corpus. random_state pins the random initialisation so that
# a retrain is as repeatable as multicore training allows; the topic ids hard-coded in later
# cells only make sense for one specific trained model.
lda_model = gensim.models.LdaMulticore(corpus,
                                  num_topics = 9,
                                  id2word = dictionary,
                                  passes = 10,
                                  workers = 5,
                                  random_state = 42)

In [None]:
#evaluate the 9-topic model (c_v coherence on the training texts), then save it
coherence_model_lda = CoherenceModel(model=lda_model, texts=df.stems.values, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("number of topics ", 9,"coherence_value :" , coherence_lda)

# Save model to disk (written to the current working directory).
lda_model.save("1_layer_model.txt")

In [None]:
#train model
# 30-topic LDA on the same corpus. random_state pins the random initialisation so that a
# retrain is as repeatable as multicore training allows.
lda_model = gensim.models.LdaMulticore(corpus,
                                  num_topics = 30,
                                  id2word = dictionary,
                                  passes = 10,
                                  workers = 5,
                                  random_state = 42)

In [None]:
#evaluate the 30-topic model (c_v coherence on the training texts), then save it
coherence_model_lda = CoherenceModel(model=lda_model, texts=df.stems.values, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("number of topics ",30,"coherence_value :" , coherence_lda)

# Save model to disk (written to the current working directory).
lda_model.save("1_layer_model30.txt")

# Evaluating the model


In [None]:
# Evaluate on held-out quotes from March 2016.
# NOTE(review): this reads `quote_preprocessed_spacy_with_geo`, while the training query reads
# `ste` — confirm both tables carry the same preprocessing (stems) before comparing scores.
sexy_query = """
SELECT 
  quoteId,
  lemmas,
  stems,
  geoNames,
FROM
  `epfl-course.ada_project.quote_preprocessed_spacy_with_geo`
WHERE
  DATE(LEFT(quoteid, 10)) between "2016-03-01" and "2016-03-31"
"""
test_df = bq_execute_query(sexy_query, to_dataframe=True)

In [None]:
# Number of test quotes, followed by a preview of the first rows.
print(len(test_df))
test_df.head()

In [None]:
#30 topics
from pprint import pprint
# Reload the 30-topic model and the id2word dictionary stored under the same file name prefix.
lda_model_30 = LdaModel.load('1_layer_model30.txt')
dictionary_30 = gensim.corpora.Dictionary.load('1_layer_model30.txt.id2word')

# c_v coherence of the reloaded model on the test quotes
coherence_model_lda = CoherenceModel(model=lda_model_30, texts=test_df.stems.values, dictionary=dictionary_30, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("number of topics ",30,"coherence_value :" , coherence_lda)

# Print the keywords of every topic; print_topics() would otherwise list only its default number of topics.
pprint(lda_model_30.print_topics(num_topics=lda_model_30.num_topics))

In [None]:
!pip install pyLDAvis==2.1.2

In [None]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [None]:
# 9-topic model: '1_layer_model.txt' was saved from the model trained with num_topics = 9.
# The *_10 variable names are kept because later cells refer to them.
from pprint import pprint
lda_model_10 = LdaModel.load('1_layer_model.txt')
dictionary_10 = gensim.corpora.Dictionary.load('1_layer_model.txt.id2word')

# c_v coherence of the reloaded model on the test quotes
coherence_model_lda = CoherenceModel(model=lda_model_10, texts=test_df.stems.values, dictionary=dictionary_10, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print("number of topics ",9,"coherence_value :" , coherence_lda)

# Print the keywords of the 9 topics
#pprint(lda_model_10.print_topics())

In [None]:
# Visualize the topics of the 9-topic model on the test quotes
pyLDAvis.enable_notebook()
# Dedicated name: assigning to `corpus` here would replace the training corpus built from `df`,
# so re-running a training cell afterwards would silently train on the test quotes.
test_corpus_10 = [dictionary_10.doc2bow(doc) for doc in test_df.stems.values]
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_10, test_corpus_10, dictionary_10)
LDAvis_prepared

In [None]:
# Visualize the topics of the 30-topic model on the test quotes
pyLDAvis.enable_notebook()
# Dedicated name: assigning to `corpus` here would replace the training corpus built from `df`,
# so re-running a training cell afterwards would silently train on the test quotes.
test_corpus_30 = [dictionary_30.doc2bow(doc) for doc in test_df.stems.values]
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_30, test_corpus_30, dictionary_30)
LDAvis_prepared

# Select the best model

Group the 9 topics of Lda_9 into broader themes, merging the politics-related topics into a single one.

In [None]:
# Bag-of-words for the quotes in sub_df, scored with the 9-topic model.
# NOTE(review): sub_df is not created in any cell shown in this notebook — define it
# (presumably a sample of the quotes; confirm) before this cell, otherwise it fails on a fresh kernel.
sub_corpus = [dictionary_10.doc2bow(doc) for doc in sub_df.stems.values]
predictions = lda_model_10[sub_corpus]

In [None]:
# Sum of the topic scores the model returned for each quote; the smallest total is displayed.
scores = np.array([
    sum(pair[1] for pair in doc_topics)
    for doc_topics in predictions
])
scores.min()

In [None]:
#collapse the per-topic scores of each quote into four coarse themes and keep the dominant one
politics = [8, 5, 4, 0, 3]
sport = [7]
misc = [1, 2]
art = [6]
#lookup topic id -> theme label; membership is resolved in the order politics, misc, sport, art
topic_to_theme = {}
for theme_label, topic_ids in (('politics&biz', politics), ('misc', misc), ('sport', sport), ('art', art)):
  for topic_id in topic_ids:
    topic_to_theme.setdefault(topic_id, theme_label)

prediction_reformatted = []
for doc_topics in predictions:
  #one accumulator per theme, in the order used to break ties when picking the main theme
  theme_scores = {'politics&biz': 0, 'sport': 0, 'art': 0, 'misc': 0}
  for pair in doc_topics:
    theme_label = topic_to_theme.get(pair[0])
    if theme_label is not None:
      theme_scores[theme_label] += pair[1]

  best_theme = max(theme_scores, key=theme_scores.get)
  prediction_reformatted.append({
      'scores': theme_scores,
      'main': {'main': best_theme, 'score': theme_scores[best_theme]},
      'tot': sum(theme_scores.values()),
  })


In [None]:
# Sanity check: for every quote, the aggregated theme scores must add up to the sum of its raw topic scores.
norms = np.array([p['tot'] for p in prediction_reformatted])
# Tolerance is 10**-5; with 10**5 the comparison would be True for any pair of scores.
(np.abs(norms - scores) < 1e-5).all()

In [None]:
# Attach the dominant theme of each quote, and that theme's score, to sub_df.
confidence, topic = [], []
for entry in prediction_reformatted:
  dominant = entry['main']
  confidence.append(dominant['score'])
  topic.append(dominant['main'])
sub_df['score'] = confidence
sub_df['topic'] = topic

In [None]:
# Preview the first 20 quotes with their assigned theme and score.
sub_df[['stems','score', 'topic']].head(20)

In [None]:
# Per-theme averages over the quotes whose main-theme score exceeds 0.6.
# numeric_only=True restricts the mean to numeric columns so non-numeric ones (e.g. stems) do not raise.
sub_df[sub_df['score'] > .6].groupby(['topic']).mean(numeric_only=True)

In [None]:
#divide the dataset: score every quote in df with the 9-topic model
corpus = [dictionary_10.doc2bow(tokens) for tokens in df.stems.values]
print(len(corpus))
predictions = lda_model_10[corpus]

#transform the predictions and aggregate the similar topics
politics = [8, 5, 4, 0, 3]
sport = [7]
misc = [1, 2]
art = [6]
#lookup topic id -> theme label; membership is resolved in the order politics, misc, sport, art
topic_to_theme = {}
for theme_label, topic_ids in (('politics&biz&tech', politics), ('misc', misc), ('sport', sport), ('art', art)):
  for topic_id in topic_ids:
    topic_to_theme.setdefault(topic_id, theme_label)

prediction_reformatted = []
for doc_topics in predictions:
  #one accumulator per theme, in the order used to break ties when picking the main theme
  theme_scores = {'politics&biz&tech': 0, 'sport': 0, 'art': 0, 'misc': 0}
  for pair in doc_topics:
    theme_label = topic_to_theme.get(pair[0])
    if theme_label is not None:
      theme_scores[theme_label] += pair[1]

  best_theme = max(theme_scores, key=theme_scores.get)
  prediction_reformatted.append({
      'scores': theme_scores,
      'main': {'main': best_theme, 'score': theme_scores[best_theme]},
      'tot': sum(theme_scores.values()),
  })


In [None]:
#add the detected theme, its score and the full per-theme score dict as columns of df
confidence, topic, spectrum = [], [], []
for entry in prediction_reformatted:
  dominant = entry['main']
  confidence.append(dominant['score'])
  topic.append(dominant['main'])
  spectrum.append(entry['scores'])
df['score'] = confidence
df['topic'] = topic
df['spectrum'] = spectrum
df.head(10)

In [None]:
#selecting the subset we want to divide further
pol = df[df['topic'] == 'politics&biz&tech']
#share of that subset whose main-theme score exceeds the confidence threshold
#(reported only: `pol` itself is not filtered by it)
# NOTE(review): the message used to say 0.7 while the filter used 0.5; the filter value is kept — confirm.
min_confidence = 0.5
high_confidence_share = (pol.score > min_confidence).mean() if len(pol) else 0.0
print('With {} confidence, retaining only {:.1%}'.format(min_confidence, high_confidence_share))
#using the same dictionary as model10
corpus = [dictionary_10.doc2bow(doc) for doc in pol.stems.values]

In [None]:
# Save the labelled quotes (not a model) as a gzip-compressed CSV, keeping the index.
# NOTE(review): this writes the full `df` (every theme), although the file name suggests only
# the politics&biz&tech subset (`pol`) — confirm which one is intended.
df.to_csv("politics_biz_tech_quotes.csv.gz", 
           index=True, 
           compression="gzip")