In [37]:
# !pip install --upgrade pip
# !pip install numpy requests nlpaug
# !pip install comet_ml
# !pip install -q pyyaml h5py
# !pip install scikit-plot
# !pip install tensorflow
# !pip install websocket-client==0.47.0
# !pip3 install patool
# !pip3 install transformers
# !pip3 install dask
# !pip3 install 'fsspec>=0.3.3'


In [2]:
# import patoolib
# patoolib.extract_archive("/content/Archive.zip", outdir="/content/")

In [3]:

# Install libraries
from comet_ml import Experiment
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from sklearn.model_selection import train_test_split
import tqdm as tqdm
import os
import random
import seaborn as sns
import scikitplot as skplt
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig
import gc
import dask
import dask.dataframe as dd

# Import utility functions
# from train_utils import batch_encode

# Import matplotlib
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# Import utility functions
from data_utils import analyze_dist,augment_sentence,augment_text,combine_toxic_classes,get_relevant_words,undersample_majority

# Load the data
dask_df_train_valid = dd.read_csv('train.csv',header=None)
dask_df_train_valid.npartitions

test = pd.read_csv('test.csv',engine='python', encoding='utf-8', error_bad_lines=False,header=None)
y_test=test[0]
# Check data
# 1= Negative , 2 = Positive  

df_pos = dask_df_train_valid[(dask_df_train_valid[0] == 2)].compute()
print("Total positives : ",df_pos.shape[0])

df_neg = dask_df_train_valid[(dask_df_train_valid[0] == 1)].compute()
print("Total Negatives : ",df_neg.shape[0])


# Allow us to see full text (not truncated)
pd.set_option('display.max_colwidth', None)
dask_df_train_valid.head()

Total positives :  280000
Total Negatives :  280000


Unnamed: 0,0,1
0,1,"Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars."
1,2,"Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life."
2,1,"I don't know what Dr. Goldberg was like before moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really need them. I have never felt compelled to write a bad review about anyone until I met this pathetic excuse for a doctor who is all about the money."
3,1,"I'm writing this review to give you a heads up before you see this Doctor. The office staff and administration are very unprofessional. I left a message with multiple people regarding my bill, and no one ever called me back. I had to hound them to get an answer about my bill. \n\nSecond, and most important, make sure your insurance is going to cover Dr. Goldberg's visits and blood work. He recommended to me that I get a physical, and he knew I was a student because I told him. I got the physical done. Later, I found out my health insurance doesn't pay for preventative visits. I received an $800.00 bill for the blood work. I can't pay for my bill because I'm a student and don't have any cash flow at this current time. I can't believe the Doctor wouldn't give me a heads up to make sure my insurance would cover work that wasn't necessary and was strictly preventative. The office can't do anything to help me cover the bill. In addition, the office staff said the onus is on me to make sure my insurance covers visits. Frustrating situation!"
4,2,"All the food is great here. But the best thing they have is their wings. Their wings are simply fantastic!! The \""Wet Cajun\"" are by the best & most popular. I also like the seasoned salt wings. Wing Night is Monday & Wednesday night, $0.75 whole wings!\n\nThe dining area is nice. Very family friendly! The bar is very nice is well. This place is truly a Yinzer's dream!! \""Pittsburgh Dad\"" would love this place n'at!!"


# We have well balanced database
### Now we gonna down sample our training dataset as data is huge for our computing resource.


In [4]:
#Test with multiple values 
downsamplePercentage=(0.5/100)

print(downsamplePercentage)

df_pos = df_pos.sample(n=(int)((df_pos.shape[0])*downsamplePercentage))

df_neg = df_neg.sample(n=(int)((df_neg.shape[0])*downsamplePercentage)) 

# join downsampled data and shuffle them
df = pd.concat([df_pos,df_neg]).sample(frac=1)
print(df.shape)

0.005
(2800, 2)


In [5]:
# Generate 80-20 train-validation splits
X_train, X_valid, y_train, y_valid = train_test_split(df[1],
                                                      df[0],
                                                      train_size=0.8,
                                                      stratify=df[0],
                                                      shuffle=True,
                                                      random_state=42)

In [6]:

def batch_encode(tokenizer=[], texts=pd.Series([]), batch_size=256, max_length=512):
    """""""""
    A function that encodes a batch of texts and returns the texts'
    corresponding encodings and attention masks that are ready to be fed 
    into a pre-trained transformer model.
    
    Input:
        - tokenizer:   Tokenizer object from the PreTrainedTokenizer Class
        - texts:       List of strings where each string represents a text
        - batch_size:  Integer controlling number of texts in a batch
        - max_length:  Integer controlling max number of words to tokenize in a given text
    Output:
        - input_ids:       sequence of texts encoded as a tf.Tensor object
        - attention_mask:  the texts' attention mask encoded as a tf.Tensor object
    """""""""
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch,
                                             max_length=max_length,
                                             padding='longest', #implements dynamic padding
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False
                                             )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)



# Instantiate DistilBERT tokenizer...we use the Fast version to optimize runtime
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode X_train
X_train_ids, X_train_attention = batch_encode(tokenizer, X_train.tolist())

# Encode X_valid
X_valid_ids, X_valid_attention = batch_encode(tokenizer, X_valid.tolist())



# Encode X_test
X_test_ids, X_test_attention = batch_encode(tokenizer, test[1].tolist())

########## Ensure reproducibility ##########
# Set parameters:
params = {'MAX_LENGTH': 128,
          'EPOCHS': 6,
          'LEARNING_RATE': 5e-5,
          'FT_EPOCHS': 6,
          'OPTIMIZER': 'adam',
          'FT_LEARNING_RATE': 2e-5,
          'BATCH_SIZE': 64,
          'NUM_STEPS': len(X_train.index) // 64,
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          'POS_PROBA_THRESHOLD': 0.5,          
          'ADDED_LAYERS': 'Dense 256, Dense 32, Dropout 0.2',
          'LR_SCHEDULE': '5e-5 for 6 epochs, Fine-tune w/ adam for 6 epochs @2e-5',
          'FREEZING': 'All DistilBERT layers frozen for 6 epochs, then unfrozen for 6',
          'CALLBACKS': '[early_stopping monitoring val_loss w/ patience=0]',
          'RANDOM_STATE':42
          }


# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
os.environ['PYTHONHASHSEED']=str(params['RANDOM_STATE'])

# 2. Set `python` built-in pseudo-random generator at a fixed value
random.seed(params['RANDOM_STATE'])

# 3. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(params['RANDOM_STATE'])

# 4. Set `tensorflow` pseudo-random generator at a fixed value
tf.random.set_seed(seed=params['RANDOM_STATE'])

## Build Model 
def build_model(transformer, max_length=512):
    """""""""
    Template for building a model off of the BERT or DistilBERT architecture
    for a binary classification task.

    Input:
      - transformer:  a base Hugging Face transformer model object (BERT or DistilBERT)
                      with no added classification head attached.
      - max_length:   integer controlling the maximum number of encoded tokens 
                      in a given sequence.
    
    Output:
      - model:        a compiled tf.keras.Model with added classification layers 
                      on top of the base pre-trained model architecture.
"""""""""
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=params['RANDOM_STATE']) 
    
    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, which is located
    # at index 0.  Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]
    
    D1 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(cls_token)
    
    X = tf.keras.layers.Dense(256,
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer='zeros'
                              )(D1)
    
    D2 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)
    
    X = tf.keras.layers.Dense(32,
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer='zeros'
                              )(D2)
    
    D3 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  # CONSIDER USING CONSTRAINT
                                   bias_initializer='zeros'
                                   )(D3)
    
    # Define the model
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # Compile the model
    model.compile(tf.keras.optimizers.Adam(lr=params['LEARNING_RATE']), 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [7]:
gc.collect()

20

In [8]:
config = DistilBertConfig(dropout=params['DISTILBERT_DROPOUT'], 
                          attention_dropout=params['DISTILBERT_ATT_DROPOUT'], 
                          output_hidden_states=True)
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Freeze DistilBERT layers to preserve pre-trained weights 
for layer in distilBERT.layers:
    layer.trainable = False

# Build model
model = build_model(distilBERT)

# Train Weights of Added Layers and Classification Head 

# Train the model
train_history1 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = params['EPOCHS'],
    batch_size = params['BATCH_SIZE'],
    steps_per_epoch = params['NUM_STEPS'],
    validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),
    verbose=1
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [9]:
params['BATCH_SIZE']=16
# Unfreeze DistilBERT weights to enable fine-tuning
for layer in distilBERT.layers:
    layer.trainable = True

# Lower the learning rate to prevent destruction of pre-trained weights
optimizer = tf.keras.optimizers.Adam(lr=params['FT_LEARNING_RATE'])

# Recompile model after unfreezing
model.compile(optimizer=optimizer, 
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Define callbacks
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  mode='min',
                                                  min_delta=0,
                                                  patience=0,
                                                  restore_best_weights=True)

# Train the model
train_history2 = model.fit(
    x = [X_train_ids, X_train_attention],
    y = y_train.to_numpy(),
    epochs = params['FT_EPOCHS'],
    batch_size = params['BATCH_SIZE'],
    steps_per_epoch = params['NUM_STEPS'],
    validation_data = ([X_valid_ids, X_valid_attention], y_valid.to_numpy()),
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [33]:
X_test = test[1][:500]
y_test = test[0][:500]
X_test_ids, X_test_attention = batch_encode(tokenizer, X_test.tolist())

y_pred = model.predict([X_test_ids, X_test_attention])
y_pred_thresh = np.where(y_pred >= params['POS_PROBA_THRESHOLD'], 2, 1)

# Get evaluation results
accuracy = accuracy_score(y_test, y_pred_thresh)
auc_roc = roc_auc_score(y_test, y_pred)

# Log the ROC curve
# fpr, tpr, thresholds = roc_curve(y_test.to_numpy(), y_pred)
print('Accuracy:  ', accuracy)    # 0.8801
print('ROC-AUC:   ', auc_roc)  

Accuracy:   0.5
ROC-AUC:    0.5


In [36]:
test

Unnamed: 0,0,1
0,2,"Contrary to other reviews, I have zero complaints about the service or the prices. I have been getting tire service here for the past 5 years now, and compared to my experience with places like Pep Boys, these guys are experienced and know what they're doing. \nAlso, this is one place that I do not feel like I am being taken advantage of, just because of my gender. Other auto mechanics have been notorious for capitalizing on my ignorance of cars, and have sucked my bank account dry. But here, my service and road coverage has all been well explained - and let up to me to decide. \nAnd they just renovated the waiting room. It looks a lot better than it did in previous years."
1,1,"Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back. They seem overpriced, too."
2,2,"Friendly staff, same starbucks fair you get anywhere else. Sometimes the lines can get long."
3,1,"The food is good. Unfortunately the service is very hit or miss. The main issue seems to be with the kitchen, the waiters and waitresses are often very apologetic for the long waits and it's pretty obvious that some of them avoid the tables after taking the initial order to avoid hearing complaints."
4,2,"Even when we didn't have a car Filene's Basement was worth the bus trip to the Waterfront. I always find something (usually I find 3-4 things and spend about $60) and better still, I am always still wearing the clothes and shoes 3 months later. \n\nI kind of suspect this is the best shopping in Pittsburgh; it's much better than the usual department stores, better than Marshall's and TJ Maxx and better than the Saks downtown, even when it has a sale. Selection, bargains AND quality.\n\nI like this Filene's better than Gabriel Brothers, which are harder to get to. Gabriel Brothers are a real discount shopper's challenge and I'm afraid I didn't live in Pittsburgh long enough to develop the necessary skills . . . Filene's was still up and running in June 2007 when I left town."
...,...,...
37995,1,If I could give 0...I would. Don't do it.
37996,2,"Items Selected:\nChocolate Cinnamon Horn\nSmall Cinnamon Crunch Cronut\nBlueberry Fritter\nBlueberry Frosted Cake\nApple Cinnamon Bear Claw\nCinnamon Crunch Hole, Glazed Hole, Powdered Sugar Hole\n\nA new year and a new favorite, the second of back-to-back weeks at Ace Donuts again showed the high quality of the new bakery's goods and this time opting for items not available on my first visit it was a truly excellent quintet that kicked of 2015 - the soft ring beneath blueberry frosting eating more like butter-cake than a doughnut while the fritter and cronut again shined despite selecting smaller versions and different constituents. More than enough to share, but so good that one may not want to, it was largely due to my early hour of arrival that the Jumbo Bear Claw stuffed with ample amounts of cinnamon apples outshined any similar pastry in the city and although the combination of rich chocolate and substantial notes of cinnamon may not appeal to every palate the crispy exterior and soft insides of the gnarly horn was a veritable cornucopia of flavor, the warm delivery making the aromatics even more impactful and the chocolate just messy enough to justify the use of a fork and knife.\n\nUndoubtedly the best all-around doughnuttery in Las Vegas - Artisan, Old School, or Otherwise."
37997,1,"Expensive lunch meals. Fried pickles were good. Waitress messed up 2 orders out of 4. Don't think I'll return. Asked for no cheese waitress joked extra cheese, then brought my meal with cheese. Better places to eat in area."
37998,1,Highly overpriced and food was cold. Our waitress seemed confused and didn't know the menu. She had no idea about gluten free. The gluten free bun was awful.


In [None]:
In [ ]:
# Build train_history
history_df1 = pd.DataFrame(train_history1.history)
history_df2 = pd.DataFrame(train_history2.history)
history_df = history_df1.append(history_df2, ignore_index=True)

# Plot training and validation loss over each epoch
history_df.loc[:, ['loss', 'val_loss']].plot()
plt.title(label='Training + Validation Loss Over Time', fontsize=17, pad=19)
plt.xlabel('Epoch', labelpad=14, fontsize=14)
plt.ylabel('Binary Crossentropy Loss', labelpad=16, fontsize=14)
print("Minimum Validation Loss: {:0.4f}".format(history_df['val_loss'].min()))

# Save figure
plt.savefig('figures/balanced_trainvalloss.png', dpi=300.0, transparent=True)

In [None]:
# Plot confusion matrix
skplt.metrics.plot_confusion_matrix(y_test.to_list(),
                                    y_pred_thresh.tolist(),
                                    figsize=(6,6),
                                    text_fontsize=14)
plt.title(label='Test Confusion Matrix', fontsize=20, pad=17)
plt.xlabel('Predicted Label', labelpad=14)
plt.ylabel('True Label', labelpad=14)

# Save the figure
plt.savefig('figures/balanced_confusionmatrix.png', dpi=300.0, transparent=True)

In [None]:

# Save model
tf.saved_model.save(model, 'models/balanced_model')

In [None]:

def augment_sentence(sentence, aug, num_threads):
    """""""""
    Constructs a new sentence via text augmentation.
    
    Input:
        - sentence:     A string of text
        - aug:          An augmentation object defined by the nlpaug library
        - num_threads:  Integer controlling the number of threads to use if
                        augmenting text via CPU
    Output:
        - A string of text that been augmented
    """""""""
    return aug.augment(sentence, num_thread=num_threads)
    


def augment_text(df, aug, num_threads, num_times, review_type):
    """""""""
    Takes a pandas DataFrame and augments its text data.
    
    Input:
        - df:            A pandas DataFrame containing the columns:
                                - 'comment_text' containing strings of text to augment.
                                - 'isToxic' binary target variable containing 0's and 1's.
        - aug:           Augmentation object defined by the nlpaug library.
        - num_threads:   Integer controlling number of threads to use if augmenting
                         text via CPU
        - num_times:     Integer representing the number of times to augment text.
        - review_type:   Type of review to augment (postive or negative)
    Output:
        - df:            Copy of the same pandas DataFrame with augmented data 
                         appended to it and with rows randomly shuffled.
    """""""""
    
    # Get rows of data to augment
    to_augment = df[df[0]==review_type]
    to_augmentX = to_augment[1]
    to_augmentY = np.ones(len(to_augmentX.index) * num_times, dtype=np.int8)
    
    # Build up dictionary containing augmented data
    aug_dict = {1:[], 0:to_augmentY}
    for i in tqdm(range(num_times)):
        augX = [augment_sentence(x, aug, num_threads) for x in to_augmentX]
        aug_dict[1].extend(augX)
    
    # Build DataFrame containing augmented data
    aug_df = pd.DataFrame.from_dict(aug_dict)
    
    return df.append(aug_df, ignore_index=True).sample(frac=1, random_state=42)
    

    
# Define nlpaug augmentation object 
aug10p = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_min=1, aug_p=0.1, action="substitute")

# Augment Negative class ([0] == 1)
df_train_valid = augment_text(dask_df_train_valid, aug10p, num_threads=8, num_times=3,review_type=1)
# Augment Positive class ([0] == 1)
df_train_valid = augment_text(dask_df_train_valid, aug10p, num_threads=8, num_times=3,review_type=2)