# 0301

In [1]:
import contextualized_topic_models
from contextualized_topic_models.models.ctm import ZeroShotTM, CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
import nltk
import torch
import random
import numpy as np
import pandas as pd    
import os
import pickle
cwd = os.getcwd()
print('Current Standard Directory is: ', cwd)
absolute_path = 'C:/Users/rbarker/OneDrive/02_Fertapp' # for work PC
os.chdir(absolute_path)
print('New working directory is: ', os.getcwd())

Current Standard Directory is:  C:\Users\rbarker\OneDrive\02_Fertapp\01_Python_script
New working directory is:  C:\Users\rbarker\OneDrive\02_Fertapp


In [2]:
### import my file of cleaned, multilingual data from R script
filename = '03_data/02_output/0100_reliable_langdetection.csv' # load dataset of reviews 
fullsample = pd.read_csv(filename, usecols = ["app", "review", "language", "date", "rating"], header='infer', encoding="utf-8") 
fullsample.columns = ['id', 'text', 'language', 'date', 'rating'] # rename columns 


In [3]:
spanishsample = fullsample[fullsample['language'] == "Spanish"]
nonspanishsample = fullsample[fullsample['language'] != "Spanish"] # selecting all non-spanish texts from the earlier sample
test_docs = nonspanishsample['text'].tolist() 

In [4]:
def fix_seeds():
  torch.manual_seed(10)
  torch.cuda.manual_seed(10)
  np.random.seed(10)
  random.seed(10)
  torch.backends.cudnn.enabled = False
  torch.backends.cudnn.deterministic = True

In [5]:
### Preprocessing
from nltk.corpus import stopwords as stop_words
nltk.download('stopwords')
stopwords = list(set(stop_words.words('spanish')))

# from 10_fertapp_cotis_python
new_stopwords = ["im", "x", "xx", "xxx", "xxxx", "xo", "xoxo", "xox", "day",
                 "ovia", "ladytimer", "clue", "pinkbird", "clue", "Ovia",
                 "clover", "womanlog", "fertility friend", "woom",
                 "tempdrop", "femm", "glow", "maya", "natural cycles", "ava",
                 "kindara", "flo", # these from fertility apps
                 "app", "apps", "application", "applications", "nurx"]

stopwords = list(set(stopwords+new_stopwords)) # combine the two lists, the base and the custom stop words

documents = spanishsample.text.tolist()
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list=stopwords, min_words=3, remove_numbers=True,
                             max_df=0.4) 
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()


import pandas as pd
import pickle
df_unpreprocessed_corpus = pd.DataFrame(unpreprocessed_corpus)
df_spanishsample = pd.DataFrame(spanishsample)
df_unpreprocessed_corpus.columns = ['text']

keys = list(df_unpreprocessed_corpus.columns.values)
i1 = df_spanishsample.set_index(keys).index
i2 = df_unpreprocessed_corpus.set_index(keys).index
toremove = df_spanishsample[~i1.isin(i2)]
toremove = list(toremove['text'])




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rbarker\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
scriptfile = '0300_'
folder = '03_data/02_output/01_spanish/'
language = "spanish_"
spanishsample = spanishsample[-spanishsample["text"].isin(toremove)]
spanishsample.to_csv(folder  + scriptfile + language + "df_spanishsample.csv",index=False)
pickle.dump(spanishsample, open(folder + scriptfile + "spanishsample.pkl", "wb"))
pickle.dump(nonspanishsample, open(folder + scriptfile + "nonspanishsample.pkl", "wb"))

In [7]:
zero_tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2")
zero_training_dataset = zero_tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents) 
zero_tp.vocab 

Batches:   0%|          | 0/263 [00:00<?, ?it/s]

['aborto',
 'abre',
 'abri',
 'abrir',
 'abrirla',
 'abro',
 'absolutamente',
 'aca',
 'acaba',
 'acabo',
 'acceder',
 'accesible',
 'acceso',
 'accidente',
 'accidentes',
 'acerca',
 'acertada',
 'acertadas',
 'acertado',
 'acertados',
 'acertar',
 'acertiva',
 'acerto',
 'acierta',
 'acne',
 'acompana',
 'aconseja',
 'aconsejo',
 'activadas',
 'activar',
 'actividad',
 'actividades',
 'actual',
 'actualice',
 'actualiza',
 'actualizacion',
 'actualizaciones',
 'actualizada',
 'actualizado',
 'actualizando',
 'actualizar',
 'actualizarla',
 'actualizaron',
 'actualizo',
 'actualmente',
 'acuerdo',
 'adapta',
 'adecuado',
 'adelanta',
 'adelante',
 'ademas',
 'adicional',
 'adicionales',
 'adolescentes',
 'adoro',
 'afecta',
 'agenda',
 'agrada',
 'agradable',
 'agradeceria',
 'agradecida',
 'agradezco',
 'agrado',
 'agregar',
 'agregaran',
 'agregaria',
 'agregarle',
 'agreguen',
 'agua',
 'ah',
 'ahi',
 'ahora',
 'ahorita',
 'ajusta',
 'ajustar',
 'ajustes',
 'ala',
 'alarma',
 'alar

In [9]:
## Run the model - here is where the problem is
scriptfile = '0301_'
folder = '03_data/02_output/01_spanish/'
topicnum = '33'
topicnumber = 33
zero_ctm = ZeroShotTM(bow_size=len(zero_tp.vocab), contextual_size=768,
                      n_components=topicnumber, num_epochs=50, batch_size = 33) # we get an error I think maybe related to batch size (DataLoader worker (pid(s)) - but we get this problem even when batch_size = 1
zero_ctm.fit(zero_training_dataset) #, n_samples=1) 

pickle.dump(zero_ctm, open(folder + scriptfile + topicnum +language + "k_zero_ctm.pkl", "wb"))


0it [00:00, ?it/s]

RuntimeError: DataLoader worker (pid(s) 312, 12764, 9644, 14228, 10064, 10800, 10236, 12092) exited unexpectedly


## Spanish topic prediction

In [None]:
spanish_topics_predictions = zero_ctm.training_doc_topic_distributions
scriptfile = '0301_'
folder = '03_data/02_output/01_spanish/'
pickle.dump(spanish_topics_predictions, open(folder + scriptfile + topicnum + language +"spanish_topics_predictions.pkl", "wb")) 

## Non-Spanish topic prediction

In [None]:
zero_tp = TopicModelDataPreparation("paraphrase-multilingual-mpnet-base-v2")
testing_dataset = zero_tp.transform(test_docs)

nonspanish_topics_predictions = zero_ctm.get_thetas(testing_dataset, n_samples=1) 

pickle.dump(nonspanish_topics_predictions, open(folder + scriptfile  + topicnum + "nonspanish_topics_predictions.pkl", "wb"))



# 0202 - import/export

In [None]:
import pickle
import os
cwd = os.getcwd()
print('Current Standard Directory is: ', cwd)
absolute_path = 'C:/Users/rbarker/OneDrive/02_Fertapp' 
os.chdir(absolute_path)
print('New working directory is: ', os.getcwd())

sourcescriptfile = "0301_"
folder = '03_data/02_output/01_spanish/'
topicnum = '33'
language = 'spanish_'
zero_ctm = pickle.load(open(folder + sourcescriptfile + topicnum +language + "k_zero_ctm.pkl", "rb"))
nonspanish_topics_predictions = pickle.load(open(folder + sourcescriptfile + topicnum + "nonspanish_topics_predictions.pkl", "rb"))
spanish_topics_predictions = pickle.load(open(folder + sourcescriptfile +  topicnum +language + "spanish_topics_predictions.pkl", "rb"))
mytopic_lists = zero_ctm.get_topic_lists(10)

In [None]:
### Exporing the matrices
import pandas as pd
scriptfile = "0301_"
topicnum = '33'
typeofsource = 'reviews'

## to DF
nonspanish_topics_predictions = pd.DataFrame(nonspanish_topics_predictions)
spanish_topics_predictions = pd.DataFrame(spanish_topics_predictions)
mytopic_lists = pd.DataFrame(mytopic_lists)

# export
nonspanish_topics_predictions.to_csv(folder + scriptfile + topicnum + typeofsource + "nonspanish_topics_predictions.csv",index=False)
spanish_topics_predictions.to_csv(folder + scriptfile + topicnum + typeofsource +language + "spanish_topics_predictions.csv",index=False)
mytopic_lists.to_csv(folder + scriptfile + topicnum + typeofsource +language +  "mytopic_lists.csv",index=False)


In [None]:
sourcescriptfile = '0300_'
spanishsample = pickle.load(open(folder + sourcescriptfile + "spanishsample.pkl", "rb"))
nonspanishsample = pickle.load(open(folder + sourcescriptfile + "nonspanishsample.pkl", "rb"))

In [None]:
### adding a uniique identifier
spanishsample["comparableid"] = spanishsample.index # getting my row index
spanishsample = spanishsample.applymap(str)
spanishsample['newcomparableid'] = spanishsample.comparableid + spanishsample.id + spanishsample.language

nonspanishsample["comparableid"] = nonspanishsample.index # getting my row index
nonspanishsample = nonspanishsample.applymap(str)
nonspanishsample['newcomparableid'] = nonspanishsample.comparableid + nonspanishsample.id + nonspanishsample.language

In [None]:
spanishsample = pd.DataFrame(spanishsample)
spanishsample['row_num'] = spanishsample.reset_index().index
spanish_topics_predictions = pd.DataFrame(spanish_topics_predictions)
spanish_topics_predictions['row_num'] = spanish_topics_predictions.reset_index().index
spanish = spanishsample.merge(spanish_topics_predictions, on='row_num')


nonspanishsample = pd.DataFrame(nonspanishsample)
nonspanishsample['row_num'] = nonspanishsample.reset_index(drop=True).index
nonspanish_topics_predictions = pd.DataFrame(nonspanish_topics_predictions)
nonspanish_topics_predictions['row_num'] = nonspanish_topics_predictions.reset_index().index
nonspanish = nonspanishsample.merge(nonspanish_topics_predictions, on='row_num')



In [None]:
### here, we export 
scriptfile = "0302_"
folder = '03_data/02_output/01_spanish/'
topicnum = '33'
typeofsource = 'reviews'
spanish.to_csv(folder + scriptfile + topicnum + typeofsource + "spanish.csv",index=False)
nonspanish.to_csv(folder + scriptfile + topicnum + typeofsource + "nonspanish.csv",index=False)