# Turbo Topics Train
## Dependencies

In [1]:
# !git clone https://github.com/blei-lab/turbotopics.git
# !git clone https://github.com/blei-lab/lda-c.git

In [2]:
# !pip install -r requirements.txt

In [30]:
import pandas as pd
import numpy as np
import joblib
import os
import re
from tqdm import tqdm

### text preprocessing dependencies
import nltk
from nltk.tokenize.casual import TweetTokenizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

### sklearn dependencies
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV

### import gensim dependencies
from gensim.models import LdaModel, TfidfModel
from gensim.corpora import Dictionary

[nltk_data] Downloading package wordnet to /home/datallah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# Root directory of the project data. load_df_dict reads from filepath + 'samples/'
# and the turbotopics inputs are written under filepath + 'turbotopics/'.
filepath = '/home/datallah/datallah-jaymefis-gibsonce/'
# Seed passed to the gensim LdaModel and to X_train.sample().
random_state = 42
# Lower-case stop words; LemmaTokenizer drops every token that is a member of this set.
stop = {'a', 'about', 'above', 'after', 'again', 'against', 'ain',
        'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
        'as', 'at', 'be', 'because', 'been', 'before', 'being',
        'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
        "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does',
        'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during',
        'each', 'few', 'for', 'from', 'further', 'had', 'hadn',
        "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't",
        'having', 'here', 'how', 'i', 'if', 'in', 'into', 'is', 'isn',
        "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm',
        'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
        "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor',
        'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
        'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
        're', 's', 'same', 'shan', "shan't", 'should', "should've",
        'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than',
        'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves',
        'then', 'there', 'these', 'they', 'this', 'those', 'through',
        'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn',
        "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
        "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll",
        "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'}
# Module-level lemmatizer, used here only to lemmatize the stop words.
wnl = WordNetLemmatizer()
# WordNet-lemmatized form of every stop word, in set-iteration order.
# NOTE(review): lemma_stop_words is not referenced anywhere else in the visible
# notebook (LemmaTokenizer filters on `stop`) - confirm whether it is still needed.
lemma_stop_words = [wnl.lemmatize(wrd) for wrd in stop]

## Load & Preprocess

In [22]:
# load df's in dict iterable
def load_df_dict(size = 'one', typ = None):
    """
    Load sample CSV files from the ``samples`` directory under ``filepath``.

    Parameters
    ----------
    size : str
        Substring that must appear in the file name. Accepts 'one',
        'three', or 'five' as input sizes.
    typ : str or None
        If 'train', 'validate', or 'test' is given, only files whose name
        contains that substring are loaded. ``None`` loads every match.

    Returns
    -------
    dict
        Maps each file name (without the '.csv' extension) to a DataFrame
        with the columns 'source', 'response_text' and 'op_gender'; rows
        with a missing value in any of those columns are dropped.
    """
    sample_dir = os.path.join(filepath, 'samples')
    df_dict = {}
    # sorted(): os.listdir order is arbitrary, so sort for a reproducible dict order
    for sample in sorted(os.listdir(sample_dir)):
        # only CSV files are samples; anything else (notes, backups, folders)
        # would otherwise be handed to read_csv and fail on column selection
        if not sample.endswith('.csv'):
            continue
        if size in sample and (typ is None or typ in sample):
            name = sample[:-len('.csv')]
            # the raw files carry a leading space in two of the header names
            temp_df = pd.read_csv(os.path.join(sample_dir, sample))[['source', ' response_text', ' op_gender']]
            temp_df = temp_df.rename(columns = {' response_text': 'response_text', ' op_gender': 'op_gender'}).dropna()
            df_dict[name] = temp_df
    return df_dict

In [23]:
# Load only the samples whose file name contains both 'one' and 'train'.
df_dict = load_df_dict(size = 'one', typ = 'train')
# Show which sample names were picked up (these are the keys used to index df_dict below).
print(df_dict.keys())

dict_keys(['train_one_million'])


In [24]:
# remove links as these will only apply to specific responses
def rm_link(text):
    """Return `text` with every http:// or https:// URL (up to the next whitespace) deleted."""
    url_re = re.compile(r'https?://\S+')
    return url_re.sub('', text)

# remove punctuation
def rm_punct(text):
    """Return `text` keeping only word characters (letters, digits, underscore) and whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    cleaned = not_word_or_space.sub('', text)
    return cleaned

# create class that lemmatizes tweet tokens
# this will be used when creating the term matrix
class LemmaTokenizer(object):
    """
    Callable tokenizer for a single document.

    Calling an instance with a string removes links and punctuation,
    tokenizes the result with NLTK's TweetTokenizer (case is not preserved),
    drops every token found in the module-level `stop` set and returns the
    WordNet lemma of each remaining token as a list of strings.
    """
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tt = TweetTokenizer(preserve_case=False, reduce_len=True,
                                 strip_handles=True, match_phone_numbers=False)
    def __call__(self, docs):
        # Links must be removed BEFORE punctuation: rm_punct deletes ':', '/'
        # and '.', after which rm_link's 'https?://' pattern can never match
        # and URL fragments leak into the vocabulary as tokens.
        # NOTE(review): rm_punct also deletes '@' and apostrophes before the
        # tokenizer runs, so strip_handles and the contracted stop words
        # (e.g. "don't") presumably never match - left unchanged here, confirm intent.
        cleaned = rm_punct(rm_link(docs))
        return [self.wnl.lemmatize(t) for t in self.tt.tokenize(cleaned) if t not in stop]

In [25]:
# Model input: the response text of the one-million-row training sample.
X_train = df_dict['train_one_million']['response_text']
# Label column of the same sample.
y_train = df_dict['train_one_million']['op_gender']

In [9]:
# Tokenize every training response with a single LemmaTokenizer instance; the result holds
# one token list per response.
# NOTE: the recorded run of this cell ended in KeyboardInterrupt, i.e. it was stopped by hand.
train_tokenized = X_train.apply(LemmaTokenizer())

KeyboardInterrupt: 

In [9]:
# token <-> integer-id mapping built from the tokenized training documents
dictionary = Dictionary(documents=train_tokenized)
# bag-of-words form of every document
train_bow = list(map(dictionary.doc2bow, train_tokenized))
# fit TF-IDF on the bag-of-words corpus, then apply it to that same corpus
tfidf_model = TfidfModel(corpus=train_bow)
corpus_tfidf = tfidf_model[train_bow]

## Train LDA

In [15]:
# Fit a two-topic gensim LDA model on the TF-IDF-weighted corpus, seeded for reproducibility.
lda_model = LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=2,
    passes=10,
    decay=0.7,
    offset=30,
    random_state=random_state,
)

In [16]:
# Build a frame from lda_model.get_topics() with one column per id2word value and one row
# per topic, then transpose it so each word is a row with columns topic_0 and topic_1.
top_df = pd.DataFrame(lda_model.get_topics(), columns = lda_model.id2word.values(),
                      index = [f'topic_{i}' for i in range(lda_model.num_topics)]).transpose()
# Hard-assign each word to the topic with the larger value; ties go to topic 1.
top_df['topic_ind'] = np.where(top_df.topic_0 > top_df.topic_1, 0, 1)
# ' <row position>:<topic index>' string per word (note the leading space).
top_df['assign'] = ' ' + top_df.reset_index().index.astype(str) + ':' + top_df.topic_ind.astype(str)
# words (the frame's index), later written one per line as the vocab file
features = top_df.index
# all 'position:topic' strings joined by newlines, later written as the assign file
assigns = '\n'.join(top_df['assign'])
# Display 10 random rows; no random_state is passed to sample().
top_df.sample(10)

Unnamed: 0,topic_0,topic_1,topic_ind,assign
vermeulen,1.058348e-06,4.290971e-07,0,88413:0
cassandra,1.543211e-05,4.579894e-07,0,37572:0
done,1.319327e-06,0.001339289,1,669:1
عجیب,3.985494e-06,4.649203e-07,0,188522:0
colme,1.003921e-06,4.245716e-07,0,33687:0
nonperfect,9.913872e-07,4.27987e-07,0,5213:0
نتیجه,2.742741e-06,4.261452e-07,0,39139:0
overdiagnosed,1.130861e-06,1.407297e-06,1,157106:1
2azjktau,1.000821e-06,4.294682e-07,0,62550:0
twicei,1.121672e-06,8.63756e-07,0,179968:0


In [13]:
# Write the four input files turbotopics is invoked with (corpus, ngrams output,
# vocab, assign) under filepath + 'turbotopics/'.
# Requires X_train, features and assigns from the cells above to be defined.

# create storage path if not exists (also creates missing parents, no exists/mkdir race)
tt_path = filepath + 'turbotopics/'
os.makedirs(tt_path, exist_ok=True)
# save corpus to text file, documents separated by newlines
# explicit utf-8: the responses and vocabulary contain non-ASCII text, and the
# platform default encoding is not guaranteed to handle it
# NOTE(review): a response that itself contains a newline will span several lines - confirm
corpus_path = tt_path + 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(X_train))
# create (or truncate) the empty ngrams file for saved results
out_path = tt_path + "ngrams.txt"
with open(out_path, 'w', encoding='utf-8') as out_file:
    pass
# save vocab separated by newlines
vocab_path = tt_path + 'vocab.dat'
with open(vocab_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(features))
# save index:topic document
assign_path = tt_path + 'assign.dat'
with open(assign_path, 'w', encoding='utf-8') as f:
    f.write(assigns)
# assign output location
tt_out_path = tt_path + 'tt_result'

NameError: name 'features' is not defined

In [18]:
%%bash -s "$corpus_path" "$vocab_path" "$assign_path" "$tt_out_path"
# Run the turbotopics lda_topics.py script under python2.7.
# $1..$4 are the four values passed on the %%bash -s line above, in order:
# corpus path, vocab path, assign path and output path.
# The cd is relative, so this assumes the notebook's working directory contains
# the turbotopics checkout - TODO confirm.
cd turbotopics
# NOTE: the recorded run failed inside lda_topics.py with "IndexError: list index out of range".
python2.7 lda_topics.py --corpus="$1" --vocab="$2" --assign="$3" --out="$4" --ntopics=2 --pval=0.001

reading vocabulary from /home/datallah/datallah-jaymefis-gibsonce/turbotopics/vocab.dat
writing topic 0
computing initial counts


Traceback (most recent call last):
  File "lda_topics.py", line 106, in <module>
    pvalue=opt.pvalue)
  File "lda_topics.py", line 75, in turbo_topic
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, test, min)
  File "/home/datallah/implicit-gender-bias/unsupervised_notebooks/turbotopics/turbotopics.py", line 425, in nested_sig_bigrams
    for doc in iter_generator(): update_fun(counts, doc)
  File "lda_topics.py", line 72, in update_fun
    update_counts_from_topic(doc[0], doc[1], topic, counts)
  File "lda_topics.py", line 65, in update_counts_from_topic
    counts_obj.update_counts(doc, root_filter=root_filter)
  File "/home/datallah/implicit-gender-bias/unsupervised_notebooks/turbotopics/turbotopics.py", line 67, in update_counts
    if (not root_filter(w)): continue
  File "lda_topics.py", line 64, in <lambda>
    root_filter = lambda w: topicmap.get(w.split()[0], -1)==topic
IndexError: list index out of range


CalledProcessError: Command 'b'cd turbotopics\npython2.7 lda_topics.py --corpus="$1" --vocab="$2" --assign="$3" --out="$4" --ntopics=2 --pval=0.001\n'' returned non-zero exit status 1.

## Train LDA-C

In [28]:
# Work on a seeded 20% random sample of the training responses.
X_train_sample = X_train.sample(frac = 0.2, random_state = random_state)
# Tokenize the sample; this rebinds train_tokenized, replacing any value set by an earlier cell.
train_tokenized = X_train_sample.apply(LemmaTokenizer())

In [31]:
# Preview the first tokenized documents.
# NOTE(review): the recorded output is "KeyError: '_oh'"; presumably IPython's `_oh`
# output-history global was missing from the namespace when this ran - TODO confirm.
train_tokenized.head()

KeyError: '_oh'

In [32]:
# Collapse each token list into one space-separated string (one document per entry).
# The isinstance guard keeps this cell idempotent: re-running it on the already-joined
# strings would otherwise join their individual characters with spaces.
train_tokenized = [doc if isinstance(doc, str) else ' '.join(doc) for doc in train_tokenized]

In [33]:
from collections import Counter

new_path = '/home/datallah/implicit-gender-bias/unsupervised_notebooks/lda-c/'
# create storage path if not exists (also creates missing parents, no exists/mkdir race)
tt_path = new_path + 'output/'
os.makedirs(tt_path, exist_ok=True)

# LDA-C does not read raw text. Each document must be one line of the form
#   M term_id:count term_id:count ...
# where M is the number of unique terms in the document and term ids are integers.
# Writing plain tokens makes the `lda` binary misparse its input.
term_ids = {}       # token -> integer id, assigned in order of first appearance
corpus_lines = []
for doc in train_tokenized:
    # accept both space-joined strings and raw token lists
    tokens = doc.split() if isinstance(doc, str) else list(doc)
    if not tokens:
        # NOTE(review): empty documents are skipped, so LDA-C document numbers no longer
        # line up one-to-one with rows of the sampled series - confirm this is acceptable
        continue
    counts = Counter(term_ids.setdefault(tok, len(term_ids)) for tok in tokens)
    pairs = ' '.join(f'{term_id}:{count}' for term_id, count in counts.items())
    corpus_lines.append(f'{len(counts)} {pairs}')

# save corpus to text file in LDA-C format
corpus_path = new_path + 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(corpus_lines) + '\n')
# save the vocabulary, one token per line; line number (0-based) == term id
ldac_vocab_path = new_path + 'vocab.txt'
with open(ldac_vocab_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(term_ids) + '\n')
# settings LDA-C location
# NOTE(review): nothing in the visible notebook writes this file - confirm it exists
settings = tt_path + 'settings.txt'

In [34]:
# import sys

# # These are the usual ipython objects, including this one you are creating
# ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# # Get a sorted list of the objects and their sizes
# sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [37]:
# Free the large in-memory objects before launching the external LDA-C process.
# Only the named data objects are dropped: clearing the whole globals() dict also
# removes IPython's own bookkeeping names (In, Out, _oh, ...) and the imports,
# which breaks output display and every later cell in the same kernel.
import gc

for big_name in ('df_dict', 'X_train', 'y_train', 'X_train_sample', 'train_tokenized',
                 'dictionary', 'train_bow', 'tfidf_model', 'corpus_tfidf',
                 'lda_model', 'top_df', 'features', 'assigns'):
    globals().pop(big_name, None)   # tolerate names that were never defined
del big_name
gc.collect();

In [36]:
%%bash 
# Build the LDA-C binary (make) and run it in estimation mode.
# The cd is relative, so this assumes the notebook's working directory contains
# the lda-c checkout - TODO confirm.
cd lda-c
make
# Arguments as passed: est 0.1 2 settings.txt corpus.txt random output
# (presumably alpha=0.1, 2 topics, settings file, data file, random initialisation and
# output directory - confirm against the lda-c usage text).
# NOTE: in the recorded run this process was "Killed" and the cell exited with status 137.
/home/datallah/implicit-gender-bias/unsupervised_notebooks/lda-c/lda est 0.1 2 settings.txt corpus.txt random output

make: 'lda' is up to date.


bash: line 3: 1472570 Killed                  /home/datallah/implicit-gender-bias/unsupervised_notebooks/lda-c/lda est 0.1 2 settings.txt corpus.txt random output


CalledProcessError: Command 'b'cd lda-c\nmake\n/home/datallah/implicit-gender-bias/unsupervised_notebooks/lda-c/lda est 0.1 2 settings.txt corpus.txt random output\n'' returned non-zero exit status 137.