# 1. Setup (run on 10 python)

In [1]:
import contextualized_topic_models
from contextualized_topic_models.models.ctm import ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
import torch
import random
import numpy as np
import pandas as pd    
import os
import pickle
cwd = os.getcwd()
absolute_path = [INSERT PATH HERE]

os.chdir(absolute_path)
language = "English" # here set the langauge we are focussing on
languagelowercase = "english"
typeofsource = "reviews"
scriptfile = f"02_{language}"
topicnum = '29'
topicnumber = 29
folder = "03_data/02_output/"

  _torch_pytree._register_pytree_node(


In [2]:
### import my file of cleaned, multilingual data from R script
filename = '03_data/02_output/0100_reliable_langdetection.csv' # load dataset of reviews 
fullsample = pd.read_csv(filename, usecols = ["app", "review", "language", "date", "rating"], header='infer', encoding="utf-8") 
fullsample.columns = ['id', 'text', 'language', 'date', 'rating'] # rename columns 


In [8]:
# Convert the 'date' column to datetime if it's not already
fullsample['date'] = pd.to_datetime(fullsample['date'])

# Sort the DataFrame by the 'date' column
df_sorted = fullsample.sort_values(by='date')

# Display the earliest and latest dates
earliest_date = df_sorted.iloc[0]
latest_date = df_sorted.iloc[-1]

# Display the sorted DataFrame
print("Earliest date entry:")
print(earliest_date)

print("\nLatest date entry:")
print(latest_date)



Earliest date entry:
id                                                   WomanLog
text        Its a good app. It would be nice to have an en...
language                                              English
date                                      1970-01-01 00:00:00
rating                                             2011-01-01
Name: 162124, dtype: object

Latest date entry:
id                                           Fertility Friend
text        This app is FANTASTIC! Along with being able t...
language                                              English
date                            1970-01-01 00:00:00.000000005
rating                                             2013-03-01
Name: 239435, dtype: object


In [3]:
sample = fullsample[fullsample['language'] == language] # selecting all traininglanguage (or whicher sample we are using to train the model) 
nonsample = fullsample[fullsample['language'] != language] 
test_docs = nonsample['text'].tolist() 

In [4]:
def fix_seeds():
  torch.manual_seed(10)
  torch.cuda.manual_seed(10)
  np.random.seed(10)
  random.seed(10)
  torch.backends.cudnn.enabled = False
  torch.backends.cudnn.deterministic = True

In [None]:
## Preprocessing
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')
stopwords = list(set(stop_words.words(languagelowercase)))

new_stopwords = ["im", "x", "xx", "xxx", "xxxx", "xo", "xoxo", "xox", "day",
                 "ovia", "ladytimer", "clue", "pinkbird", "clue", "Ovia",
                 "clover", "womanlog", "fertility friend", "woom",
                 "tempdrop", "femm", "glow", "maya", "natural cycles", "ava",
                 "kindara", "flo",
                 "app", "apps", "application", "applications", "nurx"]

stopwords = list(set(stopwords+new_stopwords)) # combine the two lists, the base and the custom stop words

documents = sample.text.tolist()
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords, min_words=3, remove_numbers=True,
                             max_df=0.4) 
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

df_unpreprocessed_corpus = pd.DataFrame(unpreprocessed_corpus)
df_sample = pd.DataFrame(sample)
df_unpreprocessed_corpus.columns = ['text']

keys = list(df_unpreprocessed_corpus.columns.values)
i1 = df_sample.set_index(keys).index
i2 = df_unpreprocessed_corpus.set_index(keys).index
toremove = df_sample[~i1.isin(i2)]
toremove = list(toremove['text'])

In [6]:
sample = sample[-sample["text"].isin(toremove)]

In [None]:
zero_tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2") # we use a multilingual for good measure (50+ languages)
zero_training_dataset = zero_tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents) 

# Decide on the number of topics 

In [None]:
from contextualized_topic_models.evaluation.measures import CoherenceNPMI, InvertedRBO
corpus = [d.split() for d in preprocessed_documents]
num_topics = [4,5,6,7,8,9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 54, 55, 56, 57, 58,
59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] 


num_runs = 1 


best_topic_coherence = -999
best_num_topics = 0
for n_components in num_topics:
  for i in range(num_runs):
    print("num topics:", n_components, "/ num run:", i)
    zero_ctm = ZeroShotTM(bow_size=len(zero_tp.vocab), contextual_size=768, 
                     n_components=n_components, num_epochs=4, batch_size = n_components) 
    zero_ctm.fit(zero_training_dataset) # run the model
    coh = CoherenceNPMI(zero_ctm.get_topic_lists(10), corpus)
    coh_score = coh.score()
    print("coherence score:", coh_score)
    if best_topic_coherence < coh_score:
      best_topic_coherence = coh_score
      best_num_topics = n_components
    print("current best coherence", best_topic_coherence, "/ best num topics", best_num_topics)

In [None]:
topicnum = '29'
topicnumber = 29

zero_ctm = ZeroShotTM(bow_size=len(zero_tp.vocab), contextual_size=768,
                      n_components=topicnumber, num_epochs=50, batch_size = topicnumber) 
zero_ctm.fit(zero_training_dataset)
mytopic_lists = zero_ctm.get_topic_lists(10) # get the top 10 words per topic


In [None]:
pickle.dump(zero_ctm, open(f"{folder}/{scriptfile}_{typeofsource}_{topicnum}_zero_ctm.pkl", "wb"))


## Training language topic prediction

In [None]:
traininglanguage_topics_predictions = zero_ctm.training_doc_topic_distributions
#pickle.dump(traininglanguage_topics_predictions, open(f"{folder}/{scriptfile}_{typeofsource}_{language}_{topicnum}_topics_predictions.pkl", "wb")) 

## Non-traininglanguage topic prediction

In [None]:
zero_tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2")
testing_dataset = zero_tp.transform(test_docs)
testinglanguage_topics_predictions = zero_ctm.get_thetas(testing_dataset, n_samples=1) 

#pickle.dump(testinglanguage_topics_predictions, open(f"{folder}/{scriptfile}_{typeofsource}_{topicnum}_non_topics_predictions.pkl", "wb"))

# Import/export

In [None]:
### Exporing the matrices
## to DF
testinglanguage_topics_predictions = pd.DataFrame(testinglanguage_topics_predictions)
traininglanguage_topics_predictions = pd.DataFrame(traininglanguage_topics_predictions)

mytopic_lists = pd.DataFrame(mytopic_lists)

# export
testinglanguage_topics_predictions.to_csv(folder + scriptfile + topicnum + typeofsource + "testinglanguage_topics_predictions.csv",index=False)
traininglanguage_topics_predictions.to_csv(folder + scriptfile + topicnum + typeofsource + "traininglanguage_topics_predictions.csv",index=False)


In [None]:
### adding a uniique identifier
sample["comparableid"] = sample.index # getting the row index
sample = sample.applymap(str)
sample['newcomparableid'] = sample.comparableid + sample.id + sample.language

nonsample["comparableid"] = nonsample.index # getting the row index
nonsample = nonsample.applymap(str)
nonsample['newcomparableid'] = nonsample.comparableid + nonsample.id + nonsample.language

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonsample["comparableid"] = nonsample.index # getting my row index


In [None]:
sample = pd.DataFrame(sample)
sample['row_num'] = sample.reset_index().index
traininglanguage_topics_predictions = pd.DataFrame(traininglanguage_topics_predictions)
traininglanguage_topics_predictions['row_num'] = traininglanguage_topics_predictions.reset_index().index
traininglanguage = sample.merge(traininglanguage_topics_predictions, on='row_num')

nonsample = pd.DataFrame(nonsample)
nonsample['row_num'] = nonsample.reset_index(drop=True).index
testinglanguage_topics_predictions = pd.DataFrame(testinglanguage_topics_predictions)
testinglanguage_topics_predictions['row_num'] = testinglanguage_topics_predictions.reset_index().index
testinglanguage = nonsample.merge(testinglanguage_topics_predictions, on='row_num')

In [None]:
# Export the 3 files that we will use later, namely in the R script
traininglanguage.to_csv(f"{folder}/{scriptfile}_{topicnum}_{typeofsource}_{language}.csv",index=False)
testinglanguage.to_csv(f"{folder}/{scriptfile}_{topicnum}_{typeofsource}_non_{language}.csv",index=False)
mytopic_lists.to_csv(f"{folder}/{scriptfile}_{topicnum}_{typeofsource}_mytopic_lists.csv",index=False)