In [0]:
# %pip install torch
# %pip install transformers
# %pip install flair
# %pip install vaderSentiment
# %pip install textblob

In [0]:
import flair
# Load the Flair text classifier published under the name 'en-sentiment'.
# The resulting object is what comment_score() calls .predict() on.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [0]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import calendar
from pyspark.sql.functions import col

### Read Data

In [0]:
#Read data from widget

#dbutils.widgets.removeAll()
#dbutils.widgets.remove("a) Base Path")
# BASE_PATH holds the widget's *name*, not a path; the path is the widget's value.
BASE_PATH = "a) Testing Data Path"
# Create the text widget; the second argument is its default value (a CSV location).
dbutils.widgets.text(BASE_PATH, "s3://ipsy-databricks-mlp/research/ethan/all_passives_gb_gbplus_gbx_feb_to_jun.csv")
# Current widget value: the CSV location that the next cell reads.
base_path = dbutils.widgets.get(BASE_PATH)

In [0]:
#reading data
# The first CSV line supplies the column names.
df = spark.read.option("header", True).csv(base_path)
row_count = df.count()
print('num rows: ', row_count)
df.display()

### Preprocessing

In [0]:
#rename columns
# (survey header, short name) pairs, applied in this order.
# The three "recommend ..." questions all map to 'nps'; presumably only one of
# them is present in a given export -- TODO confirm.
column_renames = [
    ('How likely are you to recommend the Glam Bag to a friend?', 'nps'),
    ('How likely are you to recommend Glam Bag Plus to a friend?', 'nps'),
    ('How likely are you to recommend Glam Bag X to a friend?', 'nps'),
    ('What is the most important reason for your recommendation answer?', 'comments'),
    ('Start Date', 'start'),
    ('End Date', 'end'),
    ('Subscription', 'subscription'),
]
for survey_header, short_name in column_renames:
    df = df.withColumnRenamed(survey_header, short_name)
df.display()

In [0]:
#Filter out null comments, make nps all ints
kept_columns = ['start', 'end', 'nps', 'comments', 'subscription', 'userId']
# The two scale endpoints arrive as labelled text; reduce them to bare digits
# so the integer cast below succeeds for them.
endpoint_labels = {"10 = Extremely likely": '10', "0 = Not at all likely": '0'}
df = (
    df.select(*kept_columns)
      .filter(col("comments").isNotNull())
      .replace(endpoint_labels, subset=['nps'])
      .withColumn('nps', col('nps').cast('int'))
      # the cast yields null for anything that is not an integer; drop those rows
      .filter(col("nps").isNotNull())
)

df.display()

In [0]:
# Collect the filtered Spark DataFrame into a pandas DataFrame; the cells that
# follow operate on df_pd with pandas.
df_pd = df.toPandas()
# Bare expression: the notebook renders the frame as this cell's output.
df_pd

In [0]:
#Replace weird syntax
# NOTE(review): the examples in the string below show the double-quote sequences
# becoming ", but the mapping that follows replaces them with an apostrophe (')
# -- confirm which one is intended.
'''
ex: ‚Äô --> '
ex: ‚Äú --> " (opening quote)
ex: ‚Äù --> " (ending quote)
'''
# Garbled character sequence -> replacement text.
# The first seven entries map to an apostrophe; the remaining ones map to a single space.
weird_syntax = {"‚Äô": '\'', 
                "‚Äú": '\'', 
                "‚Äù": '\'',
                "‚Äò": '\'',
                "‚Äö√Ñ√¥": '\'',
                "‚Äö√Ñ√∫": '\'',
                "‚Äö√Ñ√π": '\'',
                "‚Äî": ' ',
                "‚Ä¶": ' ',
                "Ô£ø√º¬ß‚àëÔ£ø√º√®√¶": ' ',
                "‚Å∞": ' ',
                "‚ù§": ' ',
                "‚óç‚Ä¢·¥ó‚Ä¢‚óç": ' ',
                "ü§∑‚Äç‚ôÄÔ∏è": ' '
               }

# Apply each replacement to the whole column at once.
# The previous per-row form, df_pd['comments'][index] = ..., was chained
# assignment: pandas may write to a temporary copy (SettingWithCopyWarning) and
# the write is lost entirely under copy-on-write.
# regex=False keeps the keys literal; several of them contain characters that
# are special in regular expressions.
for garbled, replacement in weird_syntax.items():
  df_pd['comments'] = df_pd['comments'].str.replace(garbled, replacement, regex=False)

df_pd.display()

In [0]:
#Filter out random comments
# Named junk_comments rather than `random`, so the standard-library `random`
# module imported in the imports cell is no longer shadowed by a list.
# Matching is exact (case- and whitespace-sensitive), hence the explicit variants.
junk_comments = ["na", "na ", "no", "no ", "Na", "Na ", "No", "No ", "NA", "NA ", "NO", "NO ", "n/a", "n/a ", "N/A", "N/A ", ".", ". ", "?", "? ", ",", ", ", "!", "! ", "..", ".. ", "...", "... ", "....", ".... ", ".....", "..... ", " ", "  ", "   ", "    ", "     "]

# .copy() makes the filtered frame independent, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
df_pd = df_pd[~df_pd['comments'].isin(junk_comments)].copy()
df_pd

In [0]:
#Shift date and add month column
# `calendar` is already imported in the notebook's imports cell.
SHIFT_DAYS = 6  # the survey end date is moved back this many days before the month is taken

end_dates = pd.to_datetime(df_pd['end'], format='%Y-%m-%d')
# A scalar Timedelta broadcasts over the column; this replaces the deprecated
# pd.TimedeltaIndex([...], unit='d') construction.
shifted_dates = end_dates - pd.Timedelta(days=SHIFT_DAYS)
# assign() returns a new frame, so no temporary date columns need to be added and dropped.
df_pd = df_pd.assign(
    month=shifted_dates.dt.month.map(lambda month_number: calendar.month_abbr[month_number])
)
df_pd

In [0]:
#Add NPS type column (detractors, passives, and promoters)
# Buckets: <=6 detractor, 7-8 passive, everything else 'promotor'.
# The label spelling is a runtime value and is reproduced unchanged.
nps_scores = df_pd['nps'].astype(int)
df_pd['nps'] = nps_scores
df_pd["nps_type"] = [
    'detractor' if score <= 6 else ('passive' if score <= 8 else 'promotor')
    for score in nps_scores
]
# nps is stored as text from here on.
df_pd['nps'] = df_pd['nps'].astype(str)

df_pd

### Flair

In [0]:
#Get comment score and type (POSITIVE or NEGATIVE) of each comment

def comment_score(row): 
  """Run the Flair sentiment classifier on one row's comment.

  Args:
    row: a row exposing a 'comments' entry; the value is coerced to str.

  Returns:
    A (value, confidence) tuple read from the dict form of the first label
    that the classifier attached to the sentence.
  """
  flair_sentence = flair.data.Sentence(str(row['comments']))
  flair_sentiment.predict(flair_sentence)
  # Only the first predicted label is used.
  first_label = flair_sentence.labels[0].to_dict()
  return first_label['value'], first_label['confidence']

# Score every row (one classifier call per row), then split the resulting
# (type, score) pairs into two columns.
scored_rows = df_pd.apply(comment_score, axis=1)
comment_types, comment_scores = zip(*scored_rows)
df_pd['comment_type'] = comment_types
df_pd['comment_score'] = comment_scores


In [0]:
# Convert the float confidence into an integer score (confidence * 1000, truncated).
SCORE_SCALE = 1000
# Guard on dtype so that re-running this cell does not scale the scores a
# second time: after the first run the column is already integer.
if pd.api.types.is_float_dtype(df_pd['comment_score']):
    df_pd['comment_score'] = (df_pd['comment_score'] * SCORE_SCALE).astype(int)


In [0]:
#Filter out positive comments
# i.e. keep only rows classified NEGATIVE, ordered by ascending comment_score.
is_negative = df_pd["comment_type"].eq('NEGATIVE')
df_neg = df_pd.loc[is_negative].sort_values('comment_score')
df_neg.display()

In [0]:
# len() is the row count. DataFrame.count() returns a per-column Series of
# non-null counts, so the previous version printed three Series instead of
# three numbers.
total_comments = len(df_pd)
negative_comments = len(df_neg)
print('num of non-null comments: ', total_comments)
print('num of non-null negative comments: ', negative_comments)
# Avoid ZeroDivisionError when no comments survived the filtering.
negative_share = negative_comments / total_comments if total_comments else float('nan')
# Formatted as a percentage to match the label.
print('percent of negative comments: ', f'{negative_share:.1%}')

### Prediction Using Model

In [0]:
#Read data from widget
#TODO: make labels NOT widget

#dbutils.widgets.removeAll()
#dbutils.widgets.remove("b) Labels")
#LABELS = "b) Labels"
#dbutils.widgets.text(LABELS, 'excitement/quality, other, personalization, repetitive, shipping/packaging, value/price')
#labels = dbutils.widgets.get(LABELS)
#labels_arr = labels.split(", ")

In [0]:
# Name/path handed to from_pretrained() for both the tokenizer and the model.
# It is relative, so it is resolved against the notebook's working directory.
model_path = "sentiment-topic-bert-base-uncased"
# Topic labels. Their length sets num_labels for the model, and their order maps
# the model's argmax index back to a label name in get_prediction_clean().
target_names = ['excitement/quality', 'other', 'personalization', 'repetitive', 'shipping/packaging', 'value/price']
#target_names = #labels_arr
# max_length passed to the tokenizer, which is called with truncation=True.
max_length = 200

In [0]:
#Read data from widget

#model path rn: dbfs:/mnt/ipsy-databricks-mlp/research/ethan/monolabeled-model-business-w-flair-and-other
# MODEL_PATH holds the widget's *name*, not a path; the path is the widget's value.
MODEL_PATH = "b) Model Path"
# Create the text widget; the second argument is its default value.
dbutils.widgets.text(MODEL_PATH, "dbfs:/mnt/ipsy-databricks-mlp/research/ethan/monolabeled-model-business-w-flair-and-other")
# Current widget value: the location the saved model is copied from in the next cell.
pretrained_model_path = dbutils.widgets.get(MODEL_PATH)
print(pretrained_model_path)

In [0]:
#load saved model
# Recursively copy the saved model from the widget-provided location to the
# driver's local filesystem. The destination directory name equals `model_path`.
# NOTE(review): this presumably relies on the notebook's working directory being
# /databricks/driver so that the relative model_path resolves here -- confirm.
dbutils.fs.cp(pretrained_model_path, "file:/databricks/driver/sentiment-topic-bert-base-uncased", recurse=True)

In [0]:
# Load the tokenizer and the sequence-classification model from `model_path`.
# num_labels is derived from target_names so the classifier head has one output per topic.
tokenizer_2 = BertTokenizerFast.from_pretrained(model_path)
model_2 = BertForSequenceClassification.from_pretrained(model_path, num_labels=len(target_names))

### Visualizing Findings

In [0]:
# Work on an independent copy of the negative comments. df_neg[:] is only a
# slice of df_neg, so adding the 'predicted_label' column to it later raises
# SettingWithCopyWarning.
df_used = df_neg.copy()
df_used

In [0]:
def get_prediction_clean(text):
    """Predict the topic label for a single comment.

    Args:
        text: one comment string.

    Returns:
        The entry of `target_names` whose class has the highest softmax
        probability for `text`.
    """
    # prepare our text into tokenized sequence
    inputs = tokenizer_2(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")#.to("cuda")
    # Inference only: no_grad() skips building the autograd graph, which the
    # previous version did on every call for no benefit.
    with torch.no_grad():
        outputs = model_2(**inputs)
    # get output probabilities by doing softmax over the class dimension
    probs = outputs[0].softmax(1)
    # One text in -> one row out. .item() converts the 0-d tensor to a plain int
    # before it is used as a list index.
    return target_names[probs[0].argmax().item()]

In [0]:
# Label every comment in df_used with its predicted topic (one model call per row).
predicted_topics = df_used['comments'].map(get_prediction_clean)
df_used['predicted_label'] = predicted_topics
pred_label = df_used['predicted_label']
df_used

In [0]:
# You can download results here!
# df_used holds the negative comments together with their 'predicted_label' column.
df_used.display()

In [0]:
#Convert pandas into spark dataframe

# Enable Arrow-based columnar data transfers.
# "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0; the
# PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Create a Spark DataFrame from a pandas DataFrame using Arrow
df_labeled = spark.createDataFrame(df_used)
df_labeled.display()

In [0]:
# Prefix each month abbreviation with its month number so that sorting the
# string column `month` (as the grouped cells below do) is chronological.
# The number is zero-padded: with unpadded prefixes '10 - Oct', '11 - Nov' and
# '12 - Dec' sort before '2 - Feb'.
month_abbrs = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_labels = [f'{number:02d} - {abbr}' for number, abbr in enumerate(month_abbrs, start=1)]
df_labeled = df_labeled.replace(month_abbrs, month_labels, 'month')
df_labeled.display()

#### Grouped by Topic Type

In [0]:
# Number of rows per predicted topic, most frequent topic first.
topic_counts = df_labeled.groupBy('predicted_label').count()
df_grouped_by_label = topic_counts.orderBy(col('count').desc())
df_grouped_by_label.display()

In [0]:
# Renders the same DataFrame as the previous cell.
# NOTE(review): presumably kept so a different visualization can be configured
# on this output -- confirm, otherwise remove.
df_grouped_by_label.display()

#### Grouped by Topic Type, Month

In [0]:
# Rows per (topic, month): months ascending, largest count first within a month.
df_grouped_by_label_month = (
    df_labeled
    .groupBy('predicted_label', 'month')
    .count()
    .orderBy(col('month').asc(), col('count').desc())
)
df_grouped_by_label_month.display()

In [0]:
# Renders the same DataFrame as the previous cell.
# NOTE(review): presumably kept so a different visualization can be configured
# on this output -- confirm, otherwise remove.
df_grouped_by_label_month.display()

In [0]:
# Second repeat of the same display call.
# NOTE(review): presumably another visualization of the same data -- confirm, otherwise remove.
df_grouped_by_label_month.display()

#### Grouped by Topic Type, Month, Subscription

In [0]:
# Rows per (topic, month, subscription): ordered by subscription, then month,
# then largest count first.
label_month_subscription_order = [
    col('subscription').asc(),
    col('month').asc(),
    col('count').desc(),
]
df_grouped_by_label_month_subscription = (
    df_labeled
    .groupBy('predicted_label', 'month', 'subscription')
    .count()
    .orderBy(*label_month_subscription_order)
)
df_grouped_by_label_month_subscription.display()

In [0]:
# Renders the same DataFrame as the previous cell.
# NOTE(review): presumably kept so a different visualization can be configured
# on this output -- confirm, otherwise remove.
df_grouped_by_label_month_subscription.display()

In [0]:
# Second repeat of the same display call.
# NOTE(review): presumably another visualization of the same data -- confirm, otherwise remove.
df_grouped_by_label_month_subscription.display()

#### Grouped by NPS

In [0]:
# Rows per NPS answer, most frequent answer first.
nps_counts = df_labeled.groupBy('nps').count()
df_grouped_by_nps = nps_counts.orderBy('count', ascending=False)
df_grouped_by_nps.display()

#### Grouped by NPS, Topic Type

In [0]:
# Rows per (topic, NPS answer): NPS ascending, largest count first within an answer.
# Note that `nps` is a string column at this point, so its ordering is textual.
df_grouped_by_label_nps = (
    df_labeled
    .groupBy('predicted_label', 'nps')
    .count()
    .orderBy(col('nps').asc(), col('count').desc())
)
df_grouped_by_label_nps.display()

In [0]:
# Renders the same DataFrame as the previous cell.
# NOTE(review): presumably kept so a different visualization can be configured
# on this output -- confirm, otherwise remove.
df_grouped_by_label_nps.display()

In [0]:
# Second repeat of the same display call.
# NOTE(review): presumably another visualization of the same data -- confirm, otherwise remove.
df_grouped_by_label_nps.display()

#### Grouped by NPS, Topic Type, Month

In [0]:
# Rows per (topic, NPS answer, month): ordered by NPS, then month, then
# largest count first.
label_month_nps_order = [col('nps').asc(), col('month').asc(), col('count').desc()]
df_grouped_by_label_month_nps = (
    df_labeled
    .groupBy('predicted_label', 'nps', 'month')
    .count()
    .orderBy(*label_month_nps_order)
)
df_grouped_by_label_month_nps.display()

In [0]:
# Renders the same DataFrame as the previous cell.
# NOTE(review): presumably kept so a different visualization can be configured
# on this output -- confirm, otherwise remove.
df_grouped_by_label_month_nps.display()

In [0]:
# Second repeat of the same display call.
# NOTE(review): presumably another visualization of the same data -- confirm, otherwise remove.
df_grouped_by_label_month_nps.display()

#### Grouped by NPS, Topic Type, Subscription

In [0]:
# Rows per (topic, NPS answer, subscription): ordered by subscription, then
# NPS, then largest count first.
label_subscription_nps_order = [
    col('subscription').asc(),
    col('nps').asc(),
    col('count').desc(),
]
df_grouped_by_label_subscription_nps = (
    df_labeled
    .groupBy('predicted_label', 'nps', 'subscription')
    .count()
    .orderBy(*label_subscription_nps_order)
)
df_grouped_by_label_subscription_nps.display()

In [0]:
# Renders the same DataFrame as the previous cell.
# NOTE(review): presumably kept so a different visualization can be configured
# on this output -- confirm, otherwise remove.
df_grouped_by_label_subscription_nps.display()

In [0]:
# Second repeat of the same display call.
# NOTE(review): presumably another visualization of the same data -- confirm, otherwise remove.
df_grouped_by_label_subscription_nps.display()

#### Grouped by NPS, Topic Type, Subscription, Month

In [0]:
# Rows per (topic, NPS answer, month, subscription): ordered by subscription,
# then month, then NPS, then largest count first.
full_breakdown_order = [
    col('subscription').asc(),
    col('month').asc(),
    col('nps').asc(),
    col('count').desc(),
]
df_grouped_by_label_subscription_month_nps = (
    df_labeled
    .groupBy('predicted_label', 'nps', 'month', 'subscription')
    .count()
    .orderBy(*full_breakdown_order)
)
df_grouped_by_label_subscription_month_nps.display()