In [1]:
import numpy as np
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

import functions as f
import preprocessing as pp
import difflib

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jasro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
PATH = {}
PATH["dataset_classification"] = "dataset/classification/"
PATH["dataset_labeling"] = "dataset/seq_labeling/"
PATH["music_reviews_train"] = PATH["dataset_classification"] + "music_reviews_train.json.gz"
PATH["music_reviews_dev"] = PATH["dataset_classification"] + "music_reviews_dev.json.gz"
PATH["music_reviews_test"] = PATH["dataset_classification"] + "music_reviews_test_masked.json.gz"
train = f.readJson(PATH["music_reviews_train"])
test = f.readJson(PATH["music_reviews_dev"])
X_test, y_test, test_idx, test_missing_idx = f.json_divide(test)
X_train, y_train, train_idx, train_missing_idx = f.json_divide(test)

Number of data:  100000
Number of data:  10000


In [3]:
bt = BertTokenizer.from_pretrained("bert-base-uncased")
print(" ".join(bt.tokenize("mistypped woords here aosijd !!")))

misty ##pped woo ##rds here ao ##si ##j ##d ! !


In [7]:
sentences = X_test[:1]

In [8]:
def compare_b4_after_basicpreprocessing(sentences):
    # compare b4 and after basic-preprocessing
    print("Sentences before basic-preprocessing")
    for i, s in enumerate(sentences): 
        print(f"***{i}***")
        print("--Before")
        print(s)
        print("--After")
        tokens = []
        for token in pp.basic_preprocess(s.split()):
            tokens.extend([t for t in token])
        s_preprocessed = " ".join(tokens)
        print(s_preprocessed)
        
#         print("---- DIFFERENCE")
        
#         d = difflib.Differ()
#         diff = d.compare([s], [s_preprocessed])
#         print('\n'.join(diff))

#         output_list = [li for li in difflib.ndiff(s, s_preprocessed) if li[0] != ' ']
#         print(output_list)
compare_b4_after_basicpreprocessing(sentences)

Sentences before basic-preprocessing
***0***
--Before
My dentist recommended this as a relaxation technique for dental visits. They give me an ipod with headphones, play this on it and it relieves some of the stress of dental treatment, which I dislike intensely.
It worked so well that I bought my own copy to try at home. I fall asleep after a couple of minutes and stay asleep. Instead of tossing and turning, I hardly move at all. Highly recommend.
--After
My dentist recommended this as a relaxation technique for dental visits . They give me an ipod with headphones play this on it and it relieves some of stress of dental treatment which I dislike intensely . It worked so well that I bought my own copy to try at home . I fall asleep after a couple of minutes and stay asleep . Instead of tossing and turning I hardly move at all . Highly recommend .


In [9]:
def compare_bert_and_keras_tokenizer(sentences):
    # compare berttokenizer and keras tokenizer 
    start = 6666
    no = 5000
    kt = pp.tokenizer_init(X_train[start:(start+no)], X_test) 
    bt = BertTokenizer.from_pretrained("bert-base-uncased")
    for i, s in enumerate(sentences):
        print(f"***{i}***")
        print("--Before")
        print(s)
        print("--After bert tokenization")
        print(" ".join(bt.tokenize(s)))
        print("--After keras tokenization")
        sequence = kt.texts_to_sequences(s)
        tokens = kt.sequences_to_texts(sequence)
        sentence = ""
        for t in tokens:
            if t != "": sentence += t
            else: sentence += " "
        print(sentence)
    
    # compare berttokenizer and keras tokenizer with inputs that basic-preprocessed
compare_bert_and_keras_tokenizer(sentences)

***0***
--Before
My dentist recommended this as a relaxation technique for dental visits. They give me an ipod with headphones, play this on it and it relieves some of the stress of dental treatment, which I dislike intensely.
It worked so well that I bought my own copy to try at home. I fall asleep after a couple of minutes and stay asleep. Instead of tossing and turning, I hardly move at all. Highly recommend.
--After bert tokenization
my dentist recommended this as a relaxation technique for dental visits . they give me an ipod with head ##phones , play this on it and it relieve ##s some of the stress of dental treatment , which i dislike intensely . it worked so well that i bought my own copy to try at home . i fall asleep after a couple of minutes and stay asleep . instead of tossing and turning , i hardly move at all . highly recommend .
--After keras tokenization
my dentist recommended this as a relaxation technique for dental visits  they give me an ipod with headphones  play t

In [10]:
def compare_bert_and_keras_tokenizer_w_basic_preprocessing(sentences):
    # compare berttokenizer and keras tokenizer 
    start = 6666
    no = 5000
    kt = pp.tokenizer_init(X_train[start:(start+no)], X_test) # what should it be trained on? Fuck
    bt = BertTokenizer.from_pretrained("bert-base-uncased")
    for i, s in enumerate(sentences):
        print(f"***{i}***")
        print("--Before")
        print(s)
        
        print("--After basic_preprocessing")
        print(s)
        tokens = []
        for token in pp.basic_preprocess(s.split()):
            tokens.extend([t for t in token])
        s_basic_preprocessed = " ".join(tokens)
        
        print("--After bert tokenization with basic preprocessing")
        print(" ".join(bt.tokenize(s_basic_preprocessed)))
        print("--After keras tokenization with basic preprocessing")
        sequence = kt.texts_to_sequences(s_basic_preprocessed)
        tokens = kt.sequences_to_texts(sequence)
        sentence = ""
        for t in tokens:
            if t != "": sentence += t
            else: sentence += " "
        print(sentence)
        
        print("**********")
        print("**********")
        print("**********")
    
    # compare berttokenizer and keras tokenizer with inputs that basic-preprocessed
compare_bert_and_keras_tokenizer_w_basic_preprocessing(X_test[:100])

***0***
--Before
My dentist recommended this as a relaxation technique for dental visits. They give me an ipod with headphones, play this on it and it relieves some of the stress of dental treatment, which I dislike intensely.
It worked so well that I bought my own copy to try at home. I fall asleep after a couple of minutes and stay asleep. Instead of tossing and turning, I hardly move at all. Highly recommend.
--After basic_preprocessing
My dentist recommended this as a relaxation technique for dental visits. They give me an ipod with headphones, play this on it and it relieves some of the stress of dental treatment, which I dislike intensely.
It worked so well that I bought my own copy to try at home. I fall asleep after a couple of minutes and stay asleep. Instead of tossing and turning, I hardly move at all. Highly recommend.
--After bert tokenization with basic preprocessing
my dentist recommended this as a relaxation technique for dental visits . they give me an ipod with head #

--After bert tokenization with basic preprocessing
this works great in a sp ##oof of a horse throwing it s long mane around . the song is so awful it s hysterical .
--After keras tokenization with basic preprocessing
this works great in a spoof of a horse throwing it s long mane around   the song is so awful it s hysterical  
**********
**********
**********
***59***
--Before
It is one of the most beautiful pieces of music I have encountered in a long time. The music is well put togather if you need rest and relation this is a cd to do just that .
--After basic_preprocessing
It is one of the most beautiful pieces of music I have encountered in a long time. The music is well put togather if you need rest and relation this is a cd to do just that .
--After bert tokenization with basic preprocessing
it is one of most beautiful pieces of music i have encountered in a long time . the music is well put to ##gat ##her if you need rest and relation this is a cd to do just that .
--After keras 