# 1. Setup

In [2]:
import contextualized_topic_models
from contextualized_topic_models.models.ctm import ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
import torch
import random
import numpy as np
import pandas as pd    
import os
import pickle
cwd = os.getcwd()
absolute_path = [INSERT PATH HERE]
os.chdir(absolute_path)
language = "Spanish" 
languagelowercase = "spanish"
typeofsource = "reviews"
scriptfile = f"02_{language}"
topicnum = '19'
topicnumber = 19
folder = "03_data/02_output/"

In [3]:
filename = '03_data/02_output/0100_reliable_langdetection.csv' # load dataset of reviews 
fullsample = pd.read_csv(filename, usecols = ["app", "review", "language", "date", "rating"], header='infer', encoding="utf-8") 
fullsample.columns = ['id', 'text', 'language', 'date', 'rating'] # rename columns 


In [4]:
sample = fullsample[fullsample['language'] == language] 
nonsample = fullsample[fullsample['language'] != language] 
test_docs = nonsample['text'].tolist() 

In [5]:
def fix_seeds():
  torch.manual_seed(10)
  torch.cuda.manual_seed(10)
  np.random.seed(10)
  random.seed(10)
  torch.backends.cudnn.enabled = False
  torch.backends.cudnn.deterministic = True

In [6]:
## Preprocessing
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')
stopwords = list(set(stop_words.words(languagelowercase)))

new_stopwords = ["im", "x", "xx", "xxx", "xxxx", "xo", "xoxo", "xox", "day",
                 "ovia", "ladytimer", "clue", "pinkbird", "clue", "Ovia",
                 "clover", "womanlog", "fertility friend", "woom",
                 "tempdrop", "femm", "glow", "maya", "natural cycles", "ava",
                 "kindara", "flo", 
                 "app", "apps", "application", "applications", "nurx"]

stopwords = list(set(stopwords+new_stopwords)) # combine the two lists, the base and the custom stop words

documents = sample.text.tolist()
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords, min_words=3, remove_numbers=True,
                             max_df=0.4) 
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

df_unpreprocessed_corpus = pd.DataFrame(unpreprocessed_corpus)
df_sample = pd.DataFrame(sample)
df_unpreprocessed_corpus.columns = ['text']

keys = list(df_unpreprocessed_corpus.columns.values)
i1 = df_sample.set_index(keys).index
i2 = df_unpreprocessed_corpus.set_index(keys).index
toremove = df_sample[~i1.isin(i2)]
toremove = list(toremove['text'])

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [7]:
sample = sample[-sample["text"].isin(toremove)]

In [None]:
zero_tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2") # we use a multilingual for good measure (50+ languages)
zero_training_dataset = zero_tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents) 

In [None]:
# Run the model
zero_ctm = ZeroShotTM(bow_size=len(zero_tp.vocab), contextual_size=768,
                      n_components=topicnumber, num_epochs=50, batch_size = topicnumber)
zero_ctm.fit(zero_training_dataset)
mytopic_lists = zero_ctm.get_topic_lists(10) # get the top 10 words per topic


In [10]:
pickle.dump(zero_ctm, open(f"{folder}/{scriptfile}_{typeofsource}_{language}_{topicnum}_zero_ctm.pkl", "wb"))


## Training language topic prediction

In [11]:
traininglanguage_topics_predictions = zero_ctm.training_doc_topic_distributions

## Non-training language topic prediction

In [None]:
zero_tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2")
testing_dataset = zero_tp.transform(test_docs)
testinglanguage_topics_predictions = zero_ctm.get_thetas(testing_dataset, n_samples=1) 

# Import/export

In [14]:
### Exporing the matrices
## to DF
testinglanguage_topics_predictions = pd.DataFrame(testinglanguage_topics_predictions)
traininglanguage_topics_predictions = pd.DataFrame(traininglanguage_topics_predictions)
mytopic_lists = pd.DataFrame(mytopic_lists)

# export
testinglanguage_topics_predictions.to_csv(folder + scriptfile + topicnum + typeofsource + "testinglanguage_topics_predictions.csv",index=False)
traininglanguage_topics_predictions.to_csv(folder + scriptfile + topicnum + typeofsource +language + "traininglanguage_topics_predictions.csv",index=False)


In [None]:
### adding a uniique identifier
sample["comparableid"] = sample.index # getting my row index
sample = sample.applymap(str)
sample['newcomparableid'] = sample.comparableid + sample.id + sample.language

nonsample["comparableid"] = nonsample.index # getting my row index
nonsample = nonsample.applymap(str)
nonsample['newcomparableid'] = nonsample.comparableid + nonsample.id + nonsample.language

In [17]:
sample = pd.DataFrame(sample)
sample['row_num'] = sample.reset_index().index
traininglanguage_topics_predictions = pd.DataFrame(traininglanguage_topics_predictions)
traininglanguage_topics_predictions['row_num'] = traininglanguage_topics_predictions.reset_index().index
traininglanguage = sample.merge(traininglanguage_topics_predictions, on='row_num')

nonsample = pd.DataFrame(nonsample)
nonsample['row_num'] = nonsample.reset_index(drop=True).index
testinglanguage_topics_predictions = pd.DataFrame(testinglanguage_topics_predictions)
testinglanguage_topics_predictions['row_num'] = testinglanguage_topics_predictions.reset_index().index
testinglanguage = nonsample.merge(testinglanguage_topics_predictions, on='row_num')

In [18]:
# Export the 3 files that we will use later 
traininglanguage.to_csv(f"{folder}/{scriptfile}_{topicnum}_{typeofsource}_{language}.csv",index=False)
testinglanguage.to_csv(f"{folder}/{scriptfile}_{topicnum}_{typeofsource}_non_{language}.csv",index=False)
mytopic_lists.to_csv(f"{folder}/{scriptfile}_{topicnum}_{typeofsource}_{language}_mytopic_lists.csv",index=False)