In [1]:
import datetime as dt
import enum
import json
import os
import sys
from pathlib import Path
from typing import Tuple

import hyperopt
import hyperopt.pyll
import numpy as np
from datasets import list_metrics
from transformers import logging as hf_logging

sys.path.append(os.path.join(os.getcwd(), '..'))
from utils import get_project_path, get_transformers_layers_num, loggers
from models.Model import ModelConstruction
from models.transformersModel import TransformersModel
from preprocessing.cleaningText import cleaningMap
from preprocessing.pretrainedTransformersPipeline import PretrainedTransformersPipeLine

PROJECT_DIRECTORY = get_project_path()

# hf_logging.set_verbosity_error()
hf_logging.enable_explicit_format()
logger = loggers.getLogger("Notebook", debug=True)

data folder is set to `/home/sniper/projects_local/CIL/Computational-Intelligence-Lab/venv/lib64/python3.9/site-packages/neuspell/../data` script


Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [2]:
d = {}
with open("/home/sniper/projects_local/CIL/Computational-Intelligence-Lab/src/configs/dev/squeezebert.json") as fr:
    d = json.load(fr)

In [3]:
model_name_or_path = d['model_name_or_path']
tokenizer_name_or_path = d.get('tokenizer_name_or_path', model_name_or_path)
model = TransformersModel(modelName_or_pipeLine=model_name_or_path,
                            tokenizer_name_or_path=tokenizer_name_or_path,
                            fast_tokenizer=d.get('fast_tokenizer'),
                            text_pre_cleaning=d.get('text_pre_cleaning', 'default'))

[2021-07-26 04:24:09] - PretrainedTransformersPipeLine - {line:76} INFO - PretrainedTransformersPipeLine created


In [4]:
if type(d['metric']) is str:
        d['metric'] = [d['metric']]
assert (d['metric'][0] in list_metrics()), \
        f"The metric for evaluation is not supported.\n" \
        f"It should be in https://huggingface.co/metrics"

In [5]:
model.registerMetric(*d['metric'])

In [6]:
model.loadData(ratio=d['data_load_ratio'])

[2021-07-26 04:24:14] - PretrainedTransformersPipeLine - {line:113} INFO - loading data for PretrainedTransformersPipeLine squeezebert/squeezebert-uncased
[2021-07-26 04:24:15] - InputPipeline - {line:66} INFO - Dataset loaded!
[2021-07-26 04:24:15] - InputPipeline - {line:67} DEBUG - Positive: 338, Negative: 342, Test: 10000


In [7]:
text_pre_cleaning_function = cleaningMap("masks")
logger.info('Cleaning the dataset ...')
allData = text_pre_cleaning_function(model.pipeLine.allData)

[2021-07-26 04:24:18] - Notebook - {line:2} INFO - Cleaning the dataset ...


In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from dateutil import parser
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def lemmatize(input):
    """
    Lemmatizes input using NLTK's WordNetLemmatizer
    """
    lemmatizer=WordNetLemmatizer()
    input_str=word_tokenize(input)
    new_words = []
    for word in input_str:
        new_words.append(lemmatizer.lemmatize(word))
    return ' '.join(new_words)

[nltk_data] Downloading package stopwords to /home/sniper/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sniper/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sniper/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
tt = TweetTokenizer()

In [10]:
dataTokenized = [tt.tokenize(dat) for dat in allData]

In [11]:
dataLematized = lemmatize(' '.join([' '.join(m) for m in dataTokenized]))

In [11]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(dataTokenized)
# Create Corpus
corpus = [id2word.doc2bow(text) for text in dataTokenized]

In [12]:
import gensim

In [13]:
num_topics = 10 # positive/negative
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

In [21]:
with open("../../data/test_data.txt") as fr:
    testTweets = fr.readlines()
testTweetsTokenized = [tt.tokenize(testTweet) for testTweet in testTweets]
# 
# lda_model[testTweetsTokenized[0]]

In [24]:
from gensim.test.utils import common_corpus, common_dictionary
testTweetBow = [common_dictionary.doc2bow(tweet) for tweet in testTweetsTokenized]

In [33]:
testTweetsVectors = np.array([lda_model[it] for it in testTweetBow])

In [41]:
vectorProbabilities = testTweetsVectors[:,:,1]
test_topics = np.argmax(vectorProbabilities, axis=-1)
print(test_topics)
# print()

[0 0 0 ... 0 0 0]


In [16]:
import pyLDAvis
# from pyLDAvis import gensim_models 
import pickle 


# Visualize the topics
pyLDAvis.enable_notebook()



In [21]:
from pyLDAvis import gensim as gensimvis
# from pyLDAvis import gensim_models 
LDAvis_data_filepath = os.path.join(f'./ldavis_prepared2_{num_topics}.ld')

## This is a bit time consuming - make the if statement True
## if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)



In [1]:
LDAvis_prepared

NameError: name 'LDAvis_prepared' is not defined

In [None]:
# encodedDatasetArgs = {'splitter': splitter,
#                               'tokenizerConfig': tokenizer_config,
#                               'cleaning_function': text_pre_cleaning_function}

# logger.info('Cleaned!')
# for train_dataset, val_dataset in model.pipeLine.getEncodedDataset(**encodedDatasetArgs):
#             pass