In [0]:
# %pip install torch
# %pip install transformers
# %pip install flair
# %pip install vaderSentiment
# %pip install textblob

In [0]:
import flair
# Load flair's pretrained 'en-sentiment' text classifier once; it is used further
# down to tag each comment as POSITIVE or NEGATIVE.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [0]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split
import numpy as np
from pyspark.sql.functions import col
import pandas as pd
import calendar

In [0]:
def set_seed(seed: int):
    """
    Seed every random number generator this notebook may touch so that runs are repeatable.

    ``random`` and ``numpy`` are always seeded; ``torch`` and ``tf`` are seeded only when
    transformers reports them as installed.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    if is_torch_available():
        # manual_seed_all is harmless on machines without CUDA
        for seeder in (torch.manual_seed, torch.cuda.manual_seed_all):
            seeder(seed)

    if is_tf_available():
        # imported lazily so tensorflow stays an optional dependency
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

In [0]:
# Pretrained checkpoint that gets fine-tuned: base uncased BERT (https://huggingface.co/models?filter=text-classification)
model_name = "bert-base-uncased"
max_length = 200 # maximum sequence length, in tokens, passed to the tokenizer for each document/sentence sample

In [0]:
# Load the fast BERT tokenizer that belongs to `model_name`; do_lower_case=True is
# passed explicitly because the checkpoint is the uncased variant.
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

### Read data

In [0]:
# Read the training-data location from a Databricks text widget.
# The second argument to dbutils.widgets.text is the widget's default value.

BASE_PATH = "a) Training Data Path"
dbutils.widgets.text(BASE_PATH, "s3://ipsy-databricks-mlp/research/ethan/all_detractors_gb_gbplus_gbx_feb_to_jun.csv")
base_path = dbutils.widgets.get(BASE_PATH)

In [0]:
# Previously used input files, kept for reference:
#df = read('s3://ipsy-databricks-mlp/research/ethan/first1500detractorsmonolabeled.csv', ',')
#df = read('s3://ipsy-databricks-mlp/research/ethan/alldetractorsmonolabeled.csv', ',')
#df = spark.read.csv('s3://ipsy-databricks-mlp/research/ethan/all_detractors_gb_gbplus_gbx_feb_to_jun.csv', header=True)

# Load the CSV chosen in the widget as a Spark DataFrame; header=True takes the
# column names from the first row.
df = spark.read.csv(base_path, header=True)
print('num rows: ', df.count())

df.display()

### Preprocessing

In [0]:
# Rename the long survey-question headers to short column names.
# The three "How likely..." variants (one per product) all collapse to 'nps'.
column_renames = [
    ('How likely are you to recommend the Glam Bag to a friend?', 'nps'),
    ('How likely are you to recommend Glam Bag Plus to a friend?', 'nps'),
    ('How likely are you to recommend Glam Bag X to a friend?', 'nps'),
    ('What is the most important reason for your recommendation answer?', 'comments'),
    ('Start Date', 'start'),
    ('End Date', 'end'),
    ('Subscription', 'subscription'),
]
for old_name, new_name in column_renames:
    df = df.withColumnRenamed(old_name, new_name)
df.display()

In [0]:
# Keep the columns of interest, drop rows without a comment or topic, and turn
# the NPS answer into an integer (rows whose answer cannot be cast are dropped).
df = (
    df.select('start', 'end', 'topic_type', 'nps', 'comments', 'subscription', 'userId')
    .filter(col("comments").isNotNull())
    .filter(col("topic_type").isNotNull())
    # the two end-of-scale answers arrive as text, so map them to bare digits first
    .replace("10 = Extremely likely", '10', ['nps'])
    .replace("0 = Not at all likely", '0', ['nps'])
    .withColumn('nps', col('nps').cast('int'))
    .filter(col("nps").isNotNull())
)

df.display()

In [0]:
# Collect the filtered Spark DataFrame into a pandas DataFrame; the remaining
# preprocessing and the model training below work on pandas objects.
df_pd = df.toPandas()
df_pd

In [0]:
#only keep mono-labeled data
# A comma in topic_type marks a comment tagged with several topics. Rows with a
# missing topic_type are dropped too (na=True), which matches the previous
# `== False` comparison. The .copy() makes the result an independent frame so the
# column assignments in later cells do not write into a slice of df_pd.
is_multi_labeled = df_pd["topic_type"].str.contains(",", regex=False, na=True)
df_mono_labeled = df_pd[~is_multi_labeled].copy()
df_mono_labeled

In [0]:
#Replace weird syntax
'''
ex: ‚Äô --> '
ex: ‚Äú --> " (opening quote)
ex: ‚Äù --> " (ending quote)
'''
# Maps garbled character sequences found in the comments (presumably mis-encoded
# punctuation and emoji) to a plain replacement: quote-like sequences become an
# apostrophe, everything else becomes a single space.
# NOTE(review): the examples above show the two double-quote sequences becoming ",
# but the mapping below turns them into ' as well - confirm which is intended.
weird_syntax = {"‚Äô": '\'', 
                "‚Äú": '\'', 
                "‚Äù": '\'',
                "‚Äò": '\'',
                "‚Äö√Ñ√¥": '\'',
                "‚Äö√Ñ√∫": '\'',
                "‚Äö√Ñ√π": '\'',
                "‚Äî": ' ',
                "‚Ä¶": ' ',
                "Ô£ø√º¬ß‚àëÔ£ø√º√®√¶": ' ',
                "‚Å∞": ' ',
                "‚ù§": ' ',
                "‚óç‚Ä¢·¥ó‚Ä¢‚óç": ' ',
                "ü§∑‚Äç‚ôÄÔ∏è": ' '
               }

# Apply every replacement to the whole 'comments' column at once.
# The previous per-cell write `df['comments'][index] = ...` was a chained
# assignment: pandas warns about it and, with copy-on-write enabled, it does not
# modify the frame at all. Working on an explicit copy and assigning the column
# back avoids both problems.
df_mono_labeled = df_mono_labeled.copy()
cleaned_comments = df_mono_labeled['comments']
for garbled, replacement in weird_syntax.items():
    # regex=False: the keys are literal character sequences, not patterns
    cleaned_comments = cleaned_comments.str.replace(garbled, replacement, regex=False)
df_mono_labeled['comments'] = cleaned_comments

df_mono_labeled.display()

In [0]:
#Filter out random comments
# The list is deliberately NOT called `random`: that name belongs to the stdlib
# module imported at the top of the notebook and used by set_seed(); rebinding it
# to a list makes any later set_seed() call fail.
junk_comments = ["na", "na ", "no", "no ", "Na", "Na ", "No", "No ", "NA", "NA ", "NO", "NO ", "n/a", "n/a ", "N/A", "N/A ", ".", ". ", "?", "? ", ",", ", ", "!", "! ", "..", ".. ", "...", "... ", "....", ".... ", ".....", "..... ", " ", "  ", "   ", "    ", "     "]

# .copy() so the column assignments in the following cells operate on an
# independent frame instead of a slice.
df_mono_labeled = df_mono_labeled[~df_mono_labeled['comments'].isin(junk_comments)].copy()
df_mono_labeled

In [0]:
#Shift date and add month column

# Parse the survey end date, move it back 6 days, and store the abbreviated name
# of the resulting month (e.g. 'Feb') in a new 'month' column.
# A scalar pd.Timedelta replaces the former pd.TimedeltaIndex([...], unit='d')
# construction, which is deprecated in recent pandas; assign() returns a new
# frame, so no temporary columns have to be added and dropped again.
end_dates = pd.to_datetime(df_mono_labeled['end'], format='%Y-%m-%d')
shifted_dates = end_dates - pd.Timedelta(days=6) #shift back 6 days
df_mono_labeled = df_mono_labeled.assign(
    month=shifted_dates.dt.month.apply(lambda month_number: calendar.month_abbr[month_number])
)
df_mono_labeled

In [0]:
#Add NPS type column (detractors, passives, and promoters)
def _nps_bucket(score):
  """Map an integer NPS answer to its category: <=6 detractor, 7-8 passive, otherwise promoter."""
  if score <= 6:
    return 'detractor'
  if score <= 8:
    return 'passive'
  # NOTE(review): spelled 'promotor' in the data; kept as-is so existing values do not change.
  return 'promotor'

# The score is compared as an int and stored back as a string afterwards.
df_mono_labeled['nps'] = df_mono_labeled['nps'].astype(int)
df_mono_labeled["nps_type"] = df_mono_labeled["nps"].apply(_nps_bucket)
df_mono_labeled['nps'] = df_mono_labeled['nps'].astype(str)

df_mono_labeled

### Flair

In [0]:
#Get comment score and type (POSITIVE or NEGATIVE) of each comment

def comment_score(row):
  """Run the flair sentiment classifier on one row's comment.

  Args:
    row: a mapping-like row with a 'comments' entry (converted to str before use).

  Returns:
    A (value, confidence) tuple taken from the first label flair attaches to the sentence.
  """
  flair_sentence = flair.data.Sentence(str(row['comments']))
  # predict() annotates the sentence in place
  flair_sentiment.predict(flair_sentence)
  top_label = flair_sentence.labels[0].to_dict()
  return top_label['value'], top_label['confidence']

# Score every row, then split the resulting (type, score) pairs into two columns.
sentiment_pairs = df_mono_labeled.apply(comment_score, axis=1)
comment_types, comment_scores = zip(*sentiment_pairs)
df_mono_labeled['comment_type'] = comment_types
df_mono_labeled['comment_score'] = comment_scores

In [0]:
# Rescale the confidence by 1000 and truncate it to an integer.
scaled_scores = df_mono_labeled['comment_score'] * 1000
df_mono_labeled["comment_score"] = scaled_scores.astype(int)

In [0]:
#Filter out positive comments

# Keep the rows flair labelled NEGATIVE, ordered by ascending confidence score.
is_negative = df_mono_labeled["comment_type"] == 'NEGATIVE'
df_neg = df_mono_labeled.loc[is_negative].sort_values(by='comment_score')
df_neg.display()

In [0]:
# len() gives the row count. DataFrame.count() on a pandas frame returns one
# non-null count per column, so the previous prints showed a Series per line
# instead of a single number.
n_comments = len(df_mono_labeled)
n_negative = len(df_neg)
print('num of non-null comments: ', n_comments)
print('num of non-null negative comments: ', n_negative)
# guard against an empty frame so the cell does not raise ZeroDivisionError
print('percent of negative comments: ', n_negative / n_comments if n_comments else float('nan'))

### Training Model

In [0]:
#Read data from widget

#dbutils.widgets.removeAll()
#dbutils.widgets.remove("Base Path")
# Dropdown widget for the hold-out fraction (default "0.2"); the selected value
# is converted with float() before use.
TEST_SIZE = "b) Test Size"
dbutils.widgets.dropdown(TEST_SIZE, "0.2", ["0.1", "0.15", "0.2", "0.25", "0.3"])
test_size = float(dbutils.widgets.get(TEST_SIZE))

In [0]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def read_comments(df, test_size=0.2):
    """Encode the topic labels and split comments/labels into train and validation parts.

    Side effect: adds an integer 'topic_type_label' column to `df`.

    Args:
        df: frame with 'comments' and 'topic_type' columns.
        test_size: share of the rows put into the validation split.

    Returns:
        ((train_texts, valid_texts, train_labels, valid_labels), classes) where
        `classes` is the encoder's array of original topic names, indexed by label id.
    """
    encoder = LabelEncoder()
    # topic names (str) -> integer ids
    df['topic_type_label'] = encoder.fit_transform(df['topic_type'])
    texts = df['comments'].tolist()
    targets = df['topic_type_label'].tolist()
    # stratify so both splits follow the overall class distribution
    splits = train_test_split(texts, targets, test_size=test_size, stratify=targets)
    return splits, encoder.classes_
      
# Split the negative comments; target_names[i] is the topic name for label id i.
(train_texts, valid_texts, train_labels, valid_labels), target_names = read_comments(df_neg, test_size) 

In [0]:
# Show the topic names in label-id order (this also fixes the classifier's number of output classes).
print("target names: ", target_names)

In [0]:
# tokenize the dataset, truncate when passed `max_length`,
# and pad with 0's when less than `max_length`
# always tokenize after splitting
# Uses the shared `max_length` constant (instead of a repeated literal 200) so
# training and inference (get_prediction_clean) cannot drift apart.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [0]:
class comment_dataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with their class labels.

    Args:
        encodings: mapping from field name to a per-sample sequence of values
            (anything supporting ``.items()`` and integer indexing of the values).
        labels: sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # one sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # the label is wrapped in a list, so it becomes a tensor of shape (1,)
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

# Wrap the tokenized texts and their label ids in torch Datasets, one for
# training and one for validation, as expected by the Trainer below.
train_dataset = comment_dataset(train_encodings, train_labels)
valid_dataset = comment_dataset(valid_encodings, valid_labels)

In [0]:
# Load pretrained BERT with a classification head sized to the number of topics.
# The explicit move to CUDA is commented out, so the model is not moved to a
# device here.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))#.to("cuda")
# Alternative (cased checkpoint via the Auto class), kept for reference:
#from transformers import AutoModelForSequenceClassification
#model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(set(target_names)))

In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Metric callback for the Trainer.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the highest-scoring class is the prediction).

    Returns:
        dict with 'accuracy' and the support-weighted 'f1_score'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1_score': f1_score(y_true, y_hat, average='weighted'),
    }

In [0]:
#Read data from widget

#dbutils.widgets.removeAll()
#dbutils.widgets.remove("Base Path")
# Dropdown widget for the number of training epochs (default "15"); the selected
# value is converted with int() before use.
NUM_EPOCHS = "c) Num of Train Epochs"
dbutils.widgets.dropdown(NUM_EPOCHS, "15", ["5", "10", "15", "20"])
num_epochs = int(dbutils.widgets.get(NUM_EPOCHS))

In [0]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=num_epochs,     # total number of training epochs (from the widget)
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=10,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=100,               # log every 100 steps. NOTE(review): checkpoint frequency is controlled by `save_steps`, which is not set here
    evaluation_strategy="steps",     # evaluate on a step schedule; presumably every `logging_steps`, since `eval_steps` is not set - confirm against the installed transformers version
)

In [0]:
# Bundle model, arguments, datasets and the metric callback into a Trainer.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [0]:
# Fine-tune the model; with load_best_model_at_end=True the best checkpoint is
# loaded once training finishes.
trainer.train()

In [0]:
# Optional shell commands to free driver disk space taken by old checkpoints:
#clear prev model storage
#! rm -rf results/checkpoint-*
#! df -h /databricks/driver

# Evaluate the trained model on the eval_dataset given to the Trainer; the
# returned metrics are shown as the cell output.
trainer.evaluate() 

### Save Model

In [0]:
# saving the fine tuned model & tokenizer - https://huggingface.co/transformers/main_classes/model.html
# Both are written to the same local directory (relative to the current working
# directory) so they can be reloaded together with from_pretrained(model_path).

model_path = "sentiment-topic-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [0]:
# Destination for the saved model, read from a Databricks text widget (the
# second argument is the widget's default value).
SAVE_PATH = "d) Save Path"
dbutils.widgets.text(SAVE_PATH, "dbfs:/mnt/ipsy-databricks-mlp/research/ethan/monolabeled-model-full-w-others-copy")
save_path = dbutils.widgets.get(SAVE_PATH)

In [0]:
#save to amazon s3
# Moves (not copies) the local model directory to `save_path`, so the local copy
# is gone afterwards.
# NOTE(review): the source path repeats the directory name held in `model_path`
# and presumably relies on /databricks/driver being the working directory - keep
# the two in sync.
dbutils.fs.mv("file:/databricks/driver/sentiment-topic-bert-base-uncased", save_path, recurse=True)

In [0]:
#load saved model
# Copy the saved files from `save_path` back to the local driver directory, then
# reload tokenizer and model from there as independent objects (`*_2`) used for
# the evaluation below.
dbutils.fs.cp(save_path, "file:/databricks/driver/sentiment-topic-bert-base-uncased", recurse=True)
tokenizer_2 = BertTokenizerFast.from_pretrained(model_path)
model_2 = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(target_names))#.to("cuda")

### Evaluating Model

In [0]:
# Work on an independent copy: a later cell adds a 'predicted_label' column to
# df_used, and doing that on a `[:]` slice of df_neg triggers pandas'
# SettingWithCopyWarning.
df_used = df_neg.copy()
df_used

In [0]:
def get_prediction_clean(text):
    """Predict the topic name for one comment with the reloaded model.

    Args:
        text: the comment to classify.

    Returns:
        The entry of `target_names` whose class has the highest probability.
    """
    # prepare our text into tokenized sequence
    inputs = tokenizer_2(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    # perform inference to our model; no_grad() skips building the autograd graph,
    # which is not needed for prediction and costs memory and time on every call
    with torch.no_grad():
        outputs = model_2(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label; int() turns the
    # 0-d tensor into a plain index for the numpy array of names
    return target_names[int(probs.argmax())]

In [0]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def accuracy(y: list, pred: list):
    """Return the share of entries in `pred` that match the true labels in `y`."""
    from sklearn.metrics import accuracy_score

    # accuracy: (tp + tn) / (p + n)
    return accuracy_score(y, pred)

def cm(y: list, pred: list):
    """Return sklearn's confusion matrix for true labels `y` and predictions `pred`."""
    from sklearn.metrics import confusion_matrix as build_confusion_matrix

    return build_confusion_matrix(y, pred)
  
def confusion_matrix_plot(y, pred, normalize=False):
    """Draw the confusion matrix of `pred` vs `y`, with `target_names` as the axis classes.

    Args:
        y: true labels.
        pred: predicted labels.
        normalize: forwarded to plot_confusion_matrix.
    """
    plot_confusion_matrix(
        y,
        pred,
        classes=target_names,
        normalize=normalize,
        title='Confusion matrix',
    )

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of `y_pred` against `y_true` and return the axes.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        classes: the label values, in the order they should appear on both axes.
            They fix the row/column order of the matrix and are used as tick labels.
        normalize: if True, each row is divided by its total, so cells show the
            share of that true class; rows without any true samples stay at 0.
        title: optional plot title.
        cmap: matplotlib colormap for the cells.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    from sklearn.metrics import confusion_matrix

    # Pin the matrix layout to `classes`. Without `labels=`, sklearn builds the
    # matrix only from labels that occur in the data, so a class missing from both
    # y_true and y_pred would shrink the matrix and misalign the tick labels.
    class_labels = list(classes)
    matrix = confusion_matrix(y_true, y_pred, labels=class_labels)
    if normalize:
        row_totals = matrix.sum(axis=1, keepdims=True)
        # avoid 0/0 -> NaN for classes that never occur in y_true
        matrix = np.divide(
            matrix.astype('float'),
            row_totals,
            out=np.zeros(matrix.shape, dtype=float),
            where=row_totals != 0,
        )

    fig, ax = plt.subplots()
    im = ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(matrix.shape[1]),
        yticks=np.arange(matrix.shape[0]),
        # ... and label them with the respective list entries
        xticklabels=class_labels, yticklabels=class_labels,
        title=title,
        ylabel='True label',
        xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # cells darker than half the maximum get white text so the number stays readable
    thresh = matrix.max() / 2.
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            ax.text(j, i, format(matrix[i, j], fmt),
                    ha="center", va="center",
                    color="white" if matrix[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

#### All Data

In [0]:
# Predict a topic for every negative comment (training and validation rows alike)
# and keep the true and predicted labels handy for the reports below.
all_predictions = df_used['comments'].apply(get_prediction_clean)
df_used['predicted_label'] = all_predictions
actual_label, pred_label = df_used['topic_type'], df_used['predicted_label']
df_used

In [0]:
# You can download results here!
# (comments with their true topic and the predicted label)
df_used.display()

In [0]:
# Confusion matrix over all negative comments, raw counts.
confusion_matrix_plot(actual_label, pred_label)

In [0]:
# Same matrix with each row normalised by its true-class total.
confusion_matrix_plot(actual_label, pred_label, normalize=True)

In [0]:
from sklearn.metrics import classification_report
# accuracy = (tp + tn) / (tp + tn + fp + fn)
# precision = tp / (tp + fp)   tp out of all predicted positive
# recall = tp / (tp + fn)      tp out of all actually positive
# f1 = harmonic mean of precision and recall
print('accuracy: ', accuracy(actual_label, pred_label))
print()
print(classification_report(actual_label, pred_label))

#### Testing Data

In [0]:
# Rebuild the validation split as a frame: label ids are mapped back to topic
# names by indexing target_names, and paired with the matching comment texts.
testing_data_df = pd.DataFrame({
    'topic_type': target_names[valid_labels],
    'comments': valid_texts,
})
testing_data_df

In [0]:
# Predict a topic for every held-out comment and keep true/predicted labels for the reports below.
held_out_predictions = testing_data_df['comments'].apply(get_prediction_clean)
testing_data_df['predicted_label'] = held_out_predictions
testing_data_actual_label, testing_data_predicted_label = (
    testing_data_df['topic_type'],
    testing_data_df['predicted_label'],
)
testing_data_df

In [0]:
# Confusion matrix over the held-out validation comments, raw counts.
confusion_matrix_plot(testing_data_actual_label, testing_data_predicted_label)

In [0]:
# Same matrix with each row normalised by its true-class total.
confusion_matrix_plot(testing_data_actual_label, testing_data_predicted_label, normalize=True)

In [0]:
# accuracy = (tp + tn) / (tp + tn + fp + fn)
# precision = tp / (tp + fp)   tp out of all predicted positive
# recall = tp / (tp + fn)      tp out of all actually positive
# f1 = harmonic mean of precision and recall
print('accuracy: ', accuracy(testing_data_actual_label, testing_data_predicted_label))
print()
print(classification_report(testing_data_actual_label, testing_data_predicted_label))