In [0]:
# %pip install torch
# %pip install transformers
# %pip install flair
# %pip install vaderSentiment
# %pip install textblob

In [0]:
import flair
# Load flair's pre-trained 'en-sentiment' text classifier once; comment_score()
# below reuses this single instance for every comment.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [0]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import calendar

### Read Data

In [0]:
#Read data from widget

# Widget clean-up helpers, left commented out for manual use:
#dbutils.widgets.removeAll()
#dbutils.widgets.remove("a) Base Path")
# BASE_PATH is the widget's name; the second argument to text() is the value
# the widget is created with.
BASE_PATH = "a) Testing Data Path"
dbutils.widgets.text(BASE_PATH, "s3://ipsy-databricks-mlp/research/ethan/all_passives_gb_gbplus_gbx_feb_to_jun.csv")
# Read back whatever path is currently entered in the widget.
base_path = dbutils.widgets.get(BASE_PATH)

In [0]:
# Load the survey export from the widget-supplied path; the first CSV row
# holds the column names.
df = spark.read.option("header", True).csv(base_path)
print('num rows: ', df.count())
df.display()

Start Date,End Date,userId,Subscription,How likely are you to recommend the Glam Bag to a friend?,What is the most important reason for your recommendation answer?
2021-03-04 15:19:00,2021-03-04 15:27:00,u-kcdw0g28oh1w1svx,GB,8,this is a great service with amazing producst but in Canada it's most expensive than USA ....
2021-03-04 13:26:00,2021-03-04 13:34:00,u-j4emp8j5ausf2m6,GB,7,Not expensive
2021-03-04 12:34:00,2021-03-04 12:36:00,u-jj1wf1b1u8ck8hv,GB,8,Fun products
2021-03-04 09:21:00,2021-03-04 09:33:00,u-k8nbxxactsoi1zck,GB,8,I've discovered products I love and never would have tried because of your bags.
2021-03-04 04:08:00,2021-03-04 04:26:00,u-jku687tje6kloi6,GB,8,I love trying all of the different items and then buying the full size if i love them
2021-03-03 20:48:00,2021-03-03 20:53:00,u-hx5ksb9lwvmb1ab0,GB,8,Amount of different products to try.
2021-03-03 16:40:00,2021-03-03 16:47:00,u-kicgjp5z1urrhw3,GB,8,Quality products
2021-03-03 13:08:00,2021-03-03 13:14:00,u-kh86eczzgnpxwni,GB,7,It will save money
2021-03-03 11:58:00,2021-03-03 12:11:00,u-ji2cw2m2ss9177d,GB,7,Its fun and easy.
2021-03-03 06:58:00,2021-03-03 07:08:00,u-ik4kwj2qh6gggh7,GB,8,Get to try new products and decide if you enjoy it enough to buy the full size.


### Preprocessing

In [0]:
# Replace the verbose survey headers with short names used by the rest of
# the notebook.
column_renames = {
  'How likely are you to recommend the Glam Bag to a friend?': 'nps',
  'What is the most important reason for your recommendation answer?': 'comments',
  'Start Date': 'start',
  'End Date': 'end',
  'Subscription': 'subscription',
}
for old_name, new_name in column_renames.items():
  df = df.withColumnRenamed(old_name, new_name)
df.display()

start,end,userId,subscription,nps,comments
2021-03-04 15:19:00,2021-03-04 15:27:00,u-kcdw0g28oh1w1svx,GB,8,this is a great service with amazing producst but in Canada it's most expensive than USA ....
2021-03-04 13:26:00,2021-03-04 13:34:00,u-j4emp8j5ausf2m6,GB,7,Not expensive
2021-03-04 12:34:00,2021-03-04 12:36:00,u-jj1wf1b1u8ck8hv,GB,8,Fun products
2021-03-04 09:21:00,2021-03-04 09:33:00,u-k8nbxxactsoi1zck,GB,8,I've discovered products I love and never would have tried because of your bags.
2021-03-04 04:08:00,2021-03-04 04:26:00,u-jku687tje6kloi6,GB,8,I love trying all of the different items and then buying the full size if i love them
2021-03-03 20:48:00,2021-03-03 20:53:00,u-hx5ksb9lwvmb1ab0,GB,8,Amount of different products to try.
2021-03-03 16:40:00,2021-03-03 16:47:00,u-kicgjp5z1urrhw3,GB,8,Quality products
2021-03-03 13:08:00,2021-03-03 13:14:00,u-kh86eczzgnpxwni,GB,7,It will save money
2021-03-03 11:58:00,2021-03-03 12:11:00,u-ji2cw2m2ss9177d,GB,7,Its fun and easy.
2021-03-03 06:58:00,2021-03-03 07:08:00,u-ik4kwj2qh6gggh7,GB,8,Get to try new products and decide if you enjoy it enough to buy the full size.


In [0]:
# Keep only rows that have a comment, then normalise the two labelled
# endpoints of the NPS scale to bare digits so the column can be cast to int.
has_comment = df["comments"].isNotNull()
df = df.where(has_comment)
df = df.replace(
  {"10 = Extremely likely": '10', "0 = Not at all likely": '0'},
  subset=['nps'],
)
df = df.withColumn('nps', df['nps'].cast('int'))
df.display()

start,end,userId,subscription,nps,comments
2021-03-04 15:19:00,2021-03-04 15:27:00,u-kcdw0g28oh1w1svx,GB,8,this is a great service with amazing producst but in Canada it's most expensive than USA ....
2021-03-04 13:26:00,2021-03-04 13:34:00,u-j4emp8j5ausf2m6,GB,7,Not expensive
2021-03-04 12:34:00,2021-03-04 12:36:00,u-jj1wf1b1u8ck8hv,GB,8,Fun products
2021-03-04 09:21:00,2021-03-04 09:33:00,u-k8nbxxactsoi1zck,GB,8,I've discovered products I love and never would have tried because of your bags.
2021-03-04 04:08:00,2021-03-04 04:26:00,u-jku687tje6kloi6,GB,8,I love trying all of the different items and then buying the full size if i love them
2021-03-03 20:48:00,2021-03-03 20:53:00,u-hx5ksb9lwvmb1ab0,GB,8,Amount of different products to try.
2021-03-03 16:40:00,2021-03-03 16:47:00,u-kicgjp5z1urrhw3,GB,8,Quality products
2021-03-03 13:08:00,2021-03-03 13:14:00,u-kh86eczzgnpxwni,GB,7,It will save money
2021-03-03 11:58:00,2021-03-03 12:11:00,u-ji2cw2m2ss9177d,GB,7,Its fun and easy.
2021-03-03 06:58:00,2021-03-03 07:08:00,u-ik4kwj2qh6gggh7,GB,8,Get to try new products and decide if you enjoy it enough to buy the full size.


In [0]:
# Collect the Spark frame to the driver as pandas and fix the column order.
column_order = ['start', 'end', 'nps', 'comments', 'subscription', 'userId']
df_pd = df.toPandas()[column_order]
df_pd

Unnamed: 0,start,end,nps,comments,subscription,userId
0,2021-03-04 15:19:00,2021-03-04 15:27:00,8,this is a great service with amazing producst ...,GB,u-kcdw0g28oh1w1svx
1,2021-03-04 13:26:00,2021-03-04 13:34:00,7,Not expensive,GB,u-j4emp8j5ausf2m6
2,2021-03-04 12:34:00,2021-03-04 12:36:00,8,Fun products,GB,u-jj1wf1b1u8ck8hv
3,2021-03-04 09:21:00,2021-03-04 09:33:00,8,I've discovered products I love and never woul...,GB,u-k8nbxxactsoi1zck
4,2021-03-04 04:08:00,2021-03-04 04:26:00,8,I love trying all of the different items and t...,GB,u-jku687tje6kloi6
...,...,...,...,...,...,...
12557,2021-05-27 15:04:00,2021-05-27 15:18:00,8,A bit pricey with the cost being on top of the...,GBX,u-keoe8sjlwh8g9eu
12558,2021-05-27 15:04:00,2021-05-27 15:17:00,8,It was really nice and I would recommend to my...,GBX,u-jtm05merrw51s25
12559,2021-05-27 15:04:00,2021-05-27 15:09:00,8,FINDING NEW FAVORITE PRODUCTS,GBX,u-kgkay8tifufy4xb
12560,2021-05-27 15:03:00,2021-05-27 15:11:00,7,Not being able to pause my other bag at all.,GBX,u-k474e272cuc31qqs


In [0]:
#Replace weird syntax
# The export contains mis-decoded (mojibake) byte sequences. Every garbled
# quote/apostrophe sequence below is mapped to a plain single quote, and
# garbled dashes, ellipses and emoji are mapped to a single space, e.g.
#   ‚Äô --> '   (apostrophe)
#   ‚Äú --> '   (opening quote)
#   ‚Äù --> '   (closing quote)
weird_syntax = {"‚Äô": '\'', 
                "‚Äú": '\'', 
                "‚Äù": '\'',
                "‚Äò": '\'',
                "‚Äö√Ñ√¥": '\'',
                "‚Äö√Ñ√∫": '\'',
                "‚Äö√Ñ√π": '\'',
                "‚Äî": ' ',
                "‚Ä¶": ' ',
                "Ô£ø√º¬ß‚àëÔ£ø√º√®√¶": ' ',
                "‚Å∞": ' ',
                "‚ù§": ' ',
                "‚óç‚Ä¢·¥ó‚Ä¢‚óç": ' ',
                "ü§∑‚Äç‚ôÄÔ∏è": ' '
               }

# Apply each substitution to the whole column at once, in dictionary order.
# regex=False treats every key as literal text. Assigning the column back in
# one step avoids the chained `df_pd['comments'][index] = ...` write, which
# pandas warns about and which does nothing under copy-on-write.
cleaned_comments = df_pd['comments']
for garbled, replacement in weird_syntax.items():
  cleaned_comments = cleaned_comments.str.replace(garbled, replacement, regex=False)
df_pd['comments'] = cleaned_comments

df_pd.display()

start,end,nps,comments,subscription,userId
2021-03-04 15:19:00,2021-03-04 15:27:00,8,this is a great service with amazing producst but in Canada it's most expensive than USA ....,GB,u-kcdw0g28oh1w1svx
2021-03-04 13:26:00,2021-03-04 13:34:00,7,Not expensive,GB,u-j4emp8j5ausf2m6
2021-03-04 12:34:00,2021-03-04 12:36:00,8,Fun products,GB,u-jj1wf1b1u8ck8hv
2021-03-04 09:21:00,2021-03-04 09:33:00,8,I've discovered products I love and never would have tried because of your bags.,GB,u-k8nbxxactsoi1zck
2021-03-04 04:08:00,2021-03-04 04:26:00,8,I love trying all of the different items and then buying the full size if i love them,GB,u-jku687tje6kloi6
2021-03-03 20:48:00,2021-03-03 20:53:00,8,Amount of different products to try.,GB,u-hx5ksb9lwvmb1ab0
2021-03-03 16:40:00,2021-03-03 16:47:00,8,Quality products,GB,u-kicgjp5z1urrhw3
2021-03-03 13:08:00,2021-03-03 13:14:00,7,It will save money,GB,u-kh86eczzgnpxwni
2021-03-03 11:58:00,2021-03-03 12:11:00,7,Its fun and easy.,GB,u-ji2cw2m2ss9177d
2021-03-03 06:58:00,2021-03-03 07:08:00,8,Get to try new products and decide if you enjoy it enough to buy the full size.,GB,u-ik4kwj2qh6gggh7


In [0]:
#Filter out random comments
# Exact-match placeholder answers ("na", "n/a", lone punctuation, blanks) that
# carry no content. The list is deliberately not called `random`: that name
# would shadow the `random` module imported at the top of the notebook.
placeholder_comments = ["na", "na ", "no", "no ", "Na", "Na ", "No", "No ", "NA", "NA ", "NO", "NO ", "n/a", "n/a ", "N/A", "N/A ", ".", ". ", "?", "? ", ",", ", ", "!", "! ", "..", ".. ", "...", "... ", "....", ".... ", ".....", "..... ", " ", "  ", "   ", "    ", "     "]

# .copy() makes the filtered frame independent of the original, so the cells
# that add columns to df_pd afterwards do not write into a slice.
df_pd = df_pd[~df_pd['comments'].isin(placeholder_comments)].copy()
df_pd

Unnamed: 0,start,end,nps,comments,subscription,userId
0,2021-03-04 15:19:00,2021-03-04 15:27:00,8,this is a great service with amazing producst ...,GB,u-kcdw0g28oh1w1svx
1,2021-03-04 13:26:00,2021-03-04 13:34:00,7,Not expensive,GB,u-j4emp8j5ausf2m6
2,2021-03-04 12:34:00,2021-03-04 12:36:00,8,Fun products,GB,u-jj1wf1b1u8ck8hv
3,2021-03-04 09:21:00,2021-03-04 09:33:00,8,I've discovered products I love and never woul...,GB,u-k8nbxxactsoi1zck
4,2021-03-04 04:08:00,2021-03-04 04:26:00,8,I love trying all of the different items and t...,GB,u-jku687tje6kloi6
...,...,...,...,...,...,...
12557,2021-05-27 15:04:00,2021-05-27 15:18:00,8,A bit pricey with the cost being on top of the...,GBX,u-keoe8sjlwh8g9eu
12558,2021-05-27 15:04:00,2021-05-27 15:17:00,8,It was really nice and I would recommend to my...,GBX,u-jtm05merrw51s25
12559,2021-05-27 15:04:00,2021-05-27 15:09:00,8,FINDING NEW FAVORITE PRODUCTS,GBX,u-kgkay8tifufy4xb
12560,2021-05-27 15:03:00,2021-05-27 15:11:00,7,Not being able to pause my other bag at all.,GBX,u-k474e272cuc31qqs


In [0]:
#Shift date and add month column
# Each response is attributed to the month of (end timestamp - SHIFT_DAYS),
# so a survey finished in the first days of a month counts toward the
# previous month (e.g. 2021-03-04 -> 'Feb').
SHIFT_DAYS = 6

# 'end' holds full 'YYYY-MM-DD HH:MM:SS' strings, so no date-only format is
# forced here; a strict '%Y-%m-%d' format rejects the trailing time part on
# recent pandas versions.
end_timestamps = pd.to_datetime(df_pd['end'])
shifted_timestamps = end_timestamps - pd.Timedelta(days=SHIFT_DAYS)
# Month number -> abbreviated month name via the stdlib calendar table.
df_pd['month'] = shifted_timestamps.dt.month.map(lambda month_number: calendar.month_abbr[month_number])
df_pd

Unnamed: 0,start,end,nps,comments,subscription,userId,month
0,2021-03-04 15:19:00,2021-03-04 15:27:00,8,this is a great service with amazing producst ...,GB,u-kcdw0g28oh1w1svx,Feb
1,2021-03-04 13:26:00,2021-03-04 13:34:00,7,Not expensive,GB,u-j4emp8j5ausf2m6,Feb
2,2021-03-04 12:34:00,2021-03-04 12:36:00,8,Fun products,GB,u-jj1wf1b1u8ck8hv,Feb
3,2021-03-04 09:21:00,2021-03-04 09:33:00,8,I've discovered products I love and never woul...,GB,u-k8nbxxactsoi1zck,Feb
4,2021-03-04 04:08:00,2021-03-04 04:26:00,8,I love trying all of the different items and t...,GB,u-jku687tje6kloi6,Feb
...,...,...,...,...,...,...,...
12557,2021-05-27 15:04:00,2021-05-27 15:18:00,8,A bit pricey with the cost being on top of the...,GBX,u-keoe8sjlwh8g9eu,May
12558,2021-05-27 15:04:00,2021-05-27 15:17:00,8,It was really nice and I would recommend to my...,GBX,u-jtm05merrw51s25,May
12559,2021-05-27 15:04:00,2021-05-27 15:09:00,8,FINDING NEW FAVORITE PRODUCTS,GBX,u-kgkay8tifufy4xb,May
12560,2021-05-27 15:03:00,2021-05-27 15:11:00,7,Not being able to pause my other bag at all.,GBX,u-k474e272cuc31qqs,May


In [0]:
#Add NPS type column (detractors, passives, and promoters)
def _nps_bucket(score):
  """Return the NPS bucket label for an integer score (<=6, 7-8, otherwise)."""
  if score <= 6:
    return 'detractor'
  if score <= 8:
    return 'passive'
  # Spelling is a data value that ends up in the output column; kept as-is.
  return 'promotor'

nps_scores = df_pd['nps'].astype(int)
df_pd["nps_type"] = nps_scores.map(_nps_bucket)
# Store the score as text again once the bucket has been derived.
df_pd['nps'] = nps_scores.astype(str)

df_pd

Unnamed: 0,start,end,nps,comments,subscription,userId,month,nps_type
0,2021-03-04 15:19:00,2021-03-04 15:27:00,8,this is a great service with amazing producst ...,GB,u-kcdw0g28oh1w1svx,Feb,passive
1,2021-03-04 13:26:00,2021-03-04 13:34:00,7,Not expensive,GB,u-j4emp8j5ausf2m6,Feb,passive
2,2021-03-04 12:34:00,2021-03-04 12:36:00,8,Fun products,GB,u-jj1wf1b1u8ck8hv,Feb,passive
3,2021-03-04 09:21:00,2021-03-04 09:33:00,8,I've discovered products I love and never woul...,GB,u-k8nbxxactsoi1zck,Feb,passive
4,2021-03-04 04:08:00,2021-03-04 04:26:00,8,I love trying all of the different items and t...,GB,u-jku687tje6kloi6,Feb,passive
...,...,...,...,...,...,...,...,...
12557,2021-05-27 15:04:00,2021-05-27 15:18:00,8,A bit pricey with the cost being on top of the...,GBX,u-keoe8sjlwh8g9eu,May,passive
12558,2021-05-27 15:04:00,2021-05-27 15:17:00,8,It was really nice and I would recommend to my...,GBX,u-jtm05merrw51s25,May,passive
12559,2021-05-27 15:04:00,2021-05-27 15:09:00,8,FINDING NEW FAVORITE PRODUCTS,GBX,u-kgkay8tifufy4xb,May,passive
12560,2021-05-27 15:03:00,2021-05-27 15:11:00,7,Not being able to pause my other bag at all.,GBX,u-k474e272cuc31qqs,May,passive


### Flair

In [0]:
#Get comment score and type (POSITIVE or NEGATIVE) of each comment

def comment_score(row):
  """Classify one row's comment with the flair sentiment model.

  Args:
    row: a DataFrame row (or mapping) with a 'comments' entry; the value is
      converted to str before classification.

  Returns:
    (value, confidence) taken from the first label flair attaches to the
    sentence.
  """
  sentence = flair.data.Sentence(str(row['comments']))
  # predict() annotates the Sentence in place; the result is read from .labels.
  flair_sentiment.predict(sentence)
  top_label = sentence.labels[0].to_dict()
  return top_label['value'], top_label['confidence']

# Run comment_score on every row; zip(*...) transposes the per-row
# (value, confidence) pairs into one tuple per output column.
df_pd['comment_type'], df_pd['comment_score'] = zip(*df_pd.apply(comment_score, axis=1))


In [0]:
# Scale the confidence by 1000 and truncate to an integer.
# Not idempotent: re-running this cell multiplies the column again.
df_pd['comment_score'] = (df_pd['comment_score'] * 1000).astype(int)


In [0]:
#Filter out positive comments

# Keep only comments flair labelled NEGATIVE, least confident first.
is_negative = df_pd["comment_type"] == 'NEGATIVE'
df_neg = df_pd.loc[is_negative].sort_values(by='comment_score')
df_neg.display()

start,end,nps,comments,subscription,userId,month,nps_type,comment_type,comment_score
2021-04-27 13:56:02,2021-04-27 13:57:47,8,"I have only great things to say about ipsy, but would only recommend to friends that have the same skincare/make up interests as I do",GB+,u-j5lbmo3vtzwzxjr,Apr,passive,NEGATIVE,500
2021-04-28 03:13:00,2021-04-28 03:20:00,7,P,GB,u-jm8r2ldx3eia13b3,Apr,passive,NEGATIVE,500
2021-03-26 12:55:00,2021-03-26 12:56:00,7,Some friends don't wear much makeup.,GB,u-kgn6qlwa24zxw79,Mar,passive,NEGATIVE,501
2021-02-26 18:35:00,2021-02-26 18:43:00,8,It's like getting a present in the mail every month,GB,u-k324zvsaaa9j1n6x,Feb,passive,NEGATIVE,501
2021-04-28 06:32:00,2021-04-28 06:34:00,7,It is nice getting to try different products. It would be nice to have a bigger selection.,GB,u-kl8bdh43askvtrj,Apr,passive,NEGATIVE,501
2021-05-27 11:52:07,2021-05-27 11:52:58,8,I get to pick over half of my bag.,GB+,u-jsvsyucmhn5a170g,May,passive,NEGATIVE,501
2021-03-26 13:47:00,2021-03-26 13:52:00,7,Most of my friends are members already or not interested,GB,u-kl5lo4iya6u57mc,Mar,passive,NEGATIVE,501
2021-06-28 11:11:00,2021-06-28 11:16:00,8,It's a fun thing to get each month but not a necessity.,GB,u-klpdln4g8ihhxnl,Jun,passive,NEGATIVE,503
2021-04-27 13:27:00,2021-04-27 13:31:00,8,Try new products,GB,u-ji8z51d4wuuncwc,Apr,passive,NEGATIVE,505
2021-03-26 13:16:00,2021-03-26 13:22:00,8,Try new products,GB,u-kkhmx19jtjwi1hi,Mar,passive,NEGATIVE,505


### Prediction Using Model

In [0]:
#Read data from widget
#TODO: make labels NOT widget

#dbutils.widgets.removeAll()
# Drop the legacy labels widget if a previous session left it behind. Nothing
# in this notebook creates it any more, so on a fresh run the removal is
# expected to fail (NOTE(review): confirm dbutils raises for a missing widget);
# that failure is ignored on purpose so Run All can proceed.
try:
  dbutils.widgets.remove("b) Labels")
except Exception:
  pass
#LABELS = "b) Labels"
#dbutils.widgets.text(LABELS, 'excitement/quality, other, personalization, repetitive, shipping/packaging, value/price')
#labels = dbutils.widgets.get(LABELS)
#labels_arr = labels.split(", ")

In [0]:
# Local directory name the fine-tuned model is copied to and loaded from.
model_path = "sentiment-topic-bert-base-uncased"
# One entry per topic class, in the order previously produced by splitting
# the comma-separated labels widget. A single comma-joined string here would
# give len(target_names) == 1 and therefore a one-class model head.
target_names = [
  'excitement/quality',
  'other',
  'personalization',
  'repetitive',
  'shipping/packaging',
  'value/price',
]
# Maximum number of tokens per comment passed to the tokenizer.
max_length = 200

In [0]:
#Read data from widget

#model path rn: dbfs:/mnt/ipsy-databricks-mlp/research/ethan/monolabeled-model-business-w-flair-and-other
# MODEL_PATH is the widget's name; the second argument to text() is the value
# the widget is created with.
MODEL_PATH = "b) Model Path"
dbutils.widgets.text(MODEL_PATH, "dbfs:/mnt/ipsy-databricks-mlp/research/ethan/monolabeled-model-business-w-flair-and-other")
# Storage location of the saved model, as currently entered in the widget.
pretrained_model_path = dbutils.widgets.get(MODEL_PATH)

In [0]:
#load saved model
# Copy the saved model to the driver's local disk and load it from that same
# absolute directory. The directory is derived from model_path so the copy
# destination and the load location cannot drift apart, and loading does not
# depend on the current working directory.
local_model_dir = f"/databricks/driver/{model_path}"
dbutils.fs.cp(pretrained_model_path, f"file:{local_model_dir}", recurse=True)
tokenizer_2 = BertTokenizerFast.from_pretrained(local_model_dir)
# The size of the classification head must match the number of topic labels.
model_2 = BertForSequenceClassification.from_pretrained(local_model_dir, num_labels=len(target_names))

### Visualizing Findings

In [0]:
# Work on an explicit copy: a later cell adds a 'predicted_label' column to
# df_used, and writing into a `[:]` slice of df_neg triggers pandas'
# SettingWithCopyWarning.
df_used = df_neg.copy()
df_used

Unnamed: 0,start,end,nps,comments,subscription,userId,month,nps_type,comment_type,comment_score
10748,2021-04-27 13:56:02,2021-04-27 13:57:47,8,"I have only great things to say about ipsy, bu...",GB+,u-j5lbmo3vtzwzxjr,Apr,passive,NEGATIVE,500
4104,2021-04-28 03:13:00,2021-04-28 03:20:00,7,P,GB,u-jm8r2ldx3eia13b3,Apr,passive,NEGATIVE,500
2915,2021-03-26 12:55:00,2021-03-26 12:56:00,7,Some friends don't wear much makeup.,GB,u-kgn6qlwa24zxw79,Mar,passive,NEGATIVE,501
561,2021-02-26 18:35:00,2021-02-26 18:43:00,8,It's like getting a present in the mail every ...,GB,u-k324zvsaaa9j1n6x,Feb,passive,NEGATIVE,501
4029,2021-04-28 06:32:00,2021-04-28 06:34:00,7,It is nice getting to try different products. ...,GB,u-kl8bdh43askvtrj,Apr,passive,NEGATIVE,501
...,...,...,...,...,...,...,...,...,...,...
6135,2021-05-27 18:08:00,2021-05-27 18:10:00,7,sometimes the stuff is repetitive or not somet...,GB,u-kbadyufmtw3e3ki,May,passive,NEGATIVE,999
6133,2021-05-27 18:08:00,2021-05-27 18:11:00,8,"I love the chance to try new makeup monthly, b...",GB,u-kbq1q2l86wwl1krb,May,passive,NEGATIVE,999
6127,2021-05-27 18:12:00,2021-05-27 18:18:00,8,Not really good product,GB,u-k4mhbgg2t3re1vho,May,passive,NEGATIVE,999
6205,2021-05-27 17:20:00,2021-05-27 17:29:00,7,The bottles have gotten smaller while the pric...,GB,u-ibtpg9nqz8jo11js,May,passive,NEGATIVE,999


In [0]:
def get_prediction_clean(text):
    """Predict the topic label for a single comment.

    Args:
        text: the comment text (one string per call).

    Returns:
        The entry of `target_names` whose class has the highest probability.
    """
    # prepare our text into tokenized sequence
    inputs = tokenizer_2(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")#.to("cuda")
    # perform inference only: no_grad() skips building the autograd graph,
    # which this function never uses
    with torch.no_grad():
        outputs = model_2(**inputs)
    # get output probabilities by doing softmax over the class dimension
    probs = outputs[0].softmax(dim=1)
    # argmax over the single-row batch gives the class index; convert the
    # 0-d tensor to a plain int before indexing the Python list
    return target_names[int(probs.argmax())]

In [0]:
# Classify every comment individually and store the topic next to it.
predicted_labels = df_used['comments'].map(get_prediction_clean)
df_used['predicted_label'] = predicted_labels
pred_label = df_used['predicted_label']
df_used

Unnamed: 0,start,end,nps,comments,subscription,userId,month,nps_type,comment_type,comment_score,predicted_label
10748,2021-04-27 13:56:02,2021-04-27 13:57:47,8,"I have only great things to say about ipsy, bu...",GB+,u-j5lbmo3vtzwzxjr,Apr,passive,NEGATIVE,500,other
4104,2021-04-28 03:13:00,2021-04-28 03:20:00,7,P,GB,u-jm8r2ldx3eia13b3,Apr,passive,NEGATIVE,500,other
2915,2021-03-26 12:55:00,2021-03-26 12:56:00,7,Some friends don't wear much makeup.,GB,u-kgn6qlwa24zxw79,Mar,passive,NEGATIVE,501,other
561,2021-02-26 18:35:00,2021-02-26 18:43:00,8,It's like getting a present in the mail every ...,GB,u-k324zvsaaa9j1n6x,Feb,passive,NEGATIVE,501,other
4029,2021-04-28 06:32:00,2021-04-28 06:34:00,7,It is nice getting to try different products. ...,GB,u-kl8bdh43askvtrj,Apr,passive,NEGATIVE,501,repetitive
...,...,...,...,...,...,...,...,...,...,...,...
6135,2021-05-27 18:08:00,2021-05-27 18:10:00,7,sometimes the stuff is repetitive or not somet...,GB,u-kbadyufmtw3e3ki,May,passive,NEGATIVE,999,repetitive
6133,2021-05-27 18:08:00,2021-05-27 18:11:00,8,"I love the chance to try new makeup monthly, b...",GB,u-kbq1q2l86wwl1krb,May,passive,NEGATIVE,999,repetitive
6127,2021-05-27 18:12:00,2021-05-27 18:18:00,8,Not really good product,GB,u-k4mhbgg2t3re1vho,May,passive,NEGATIVE,999,excitement/quality
6205,2021-05-27 17:20:00,2021-05-27 17:29:00,7,The bottles have gotten smaller while the pric...,GB,u-ibtpg9nqz8jo11js,May,passive,NEGATIVE,999,value/price


In [0]:
#Convert pandas into spark dataframe

# Enable Arrow-based columnar data transfers
# NOTE(review): Spark 3.x documents this key as deprecated in favour of
# "spark.sql.execution.arrow.pyspark.enabled" — confirm against the cluster's
# Spark version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Create a Spark DataFrame from a pandas DataFrame using Arrow
df_labeled = spark.createDataFrame(df_used)
df_labeled.display()

start,end,nps,comments,subscription,userId,month,nps_type,comment_type,comment_score,predicted_label
2021-04-27 13:56:02,2021-04-27 13:57:47,8,"I have only great things to say about ipsy, but would only recommend to friends that have the same skincare/make up interests as I do",GB+,u-j5lbmo3vtzwzxjr,Apr,passive,NEGATIVE,500,other
2021-04-28 03:13:00,2021-04-28 03:20:00,7,P,GB,u-jm8r2ldx3eia13b3,Apr,passive,NEGATIVE,500,other
2021-03-26 12:55:00,2021-03-26 12:56:00,7,Some friends don't wear much makeup.,GB,u-kgn6qlwa24zxw79,Mar,passive,NEGATIVE,501,other
2021-02-26 18:35:00,2021-02-26 18:43:00,8,It's like getting a present in the mail every month,GB,u-k324zvsaaa9j1n6x,Feb,passive,NEGATIVE,501,other
2021-04-28 06:32:00,2021-04-28 06:34:00,7,It is nice getting to try different products. It would be nice to have a bigger selection.,GB,u-kl8bdh43askvtrj,Apr,passive,NEGATIVE,501,repetitive
2021-05-27 11:52:07,2021-05-27 11:52:58,8,I get to pick over half of my bag.,GB+,u-jsvsyucmhn5a170g,May,passive,NEGATIVE,501,personalization
2021-03-26 13:47:00,2021-03-26 13:52:00,7,Most of my friends are members already or not interested,GB,u-kl5lo4iya6u57mc,Mar,passive,NEGATIVE,501,other
2021-06-28 11:11:00,2021-06-28 11:16:00,8,It's a fun thing to get each month but not a necessity.,GB,u-klpdln4g8ihhxnl,Jun,passive,NEGATIVE,503,other
2021-04-27 13:27:00,2021-04-27 13:31:00,8,Try new products,GB,u-ji8z51d4wuuncwc,Apr,passive,NEGATIVE,505,other
2021-03-26 13:16:00,2021-03-26 13:22:00,8,Try new products,GB,u-kkhmx19jtjwi1hi,Mar,passive,NEGATIVE,505,other


In [0]:
# Prefix each month abbreviation with its month number.
# NOTE(review): later cells sort this column as text, where '10 - Oct' sorts
# before '2 - Feb'; the data shown here only covers Feb-Jun.
month_labels = {
  'Jan': '1 - Jan',
  'Feb': '2 - Feb',
  'Mar': '3 - Mar',
  'Apr': '4 - Apr',
  'May': '5 - May',
  'Jun': '6 - Jun',
  'Jul': '7 - Jul',
  'Aug': '8 - Aug',
  'Sep': '9 - Sep',
  'Oct': '10 - Oct',
  'Nov': '11 - Nov',
  'Dec': '12 - Dec',
}
df_labeled = df_labeled.replace(month_labels, subset='month')
df_labeled.display()

start,end,nps,comments,subscription,userId,month,nps_type,comment_type,comment_score,predicted_label
2021-04-27 13:56:02,2021-04-27 13:57:47,8,"I have only great things to say about ipsy, but would only recommend to friends that have the same skincare/make up interests as I do",GB+,u-j5lbmo3vtzwzxjr,4 - Apr,passive,NEGATIVE,500,other
2021-04-28 03:13:00,2021-04-28 03:20:00,7,P,GB,u-jm8r2ldx3eia13b3,4 - Apr,passive,NEGATIVE,500,other
2021-03-26 12:55:00,2021-03-26 12:56:00,7,Some friends don't wear much makeup.,GB,u-kgn6qlwa24zxw79,3 - Mar,passive,NEGATIVE,501,other
2021-02-26 18:35:00,2021-02-26 18:43:00,8,It's like getting a present in the mail every month,GB,u-k324zvsaaa9j1n6x,2 - Feb,passive,NEGATIVE,501,other
2021-04-28 06:32:00,2021-04-28 06:34:00,7,It is nice getting to try different products. It would be nice to have a bigger selection.,GB,u-kl8bdh43askvtrj,4 - Apr,passive,NEGATIVE,501,repetitive
2021-05-27 11:52:07,2021-05-27 11:52:58,8,I get to pick over half of my bag.,GB+,u-jsvsyucmhn5a170g,5 - May,passive,NEGATIVE,501,personalization
2021-03-26 13:47:00,2021-03-26 13:52:00,7,Most of my friends are members already or not interested,GB,u-kl5lo4iya6u57mc,3 - Mar,passive,NEGATIVE,501,other
2021-06-28 11:11:00,2021-06-28 11:16:00,8,It's a fun thing to get each month but not a necessity.,GB,u-klpdln4g8ihhxnl,6 - Jun,passive,NEGATIVE,503,other
2021-04-27 13:27:00,2021-04-27 13:31:00,8,Try new products,GB,u-ji8z51d4wuuncwc,4 - Apr,passive,NEGATIVE,505,other
2021-03-26 13:16:00,2021-03-26 13:22:00,8,Try new products,GB,u-kkhmx19jtjwi1hi,3 - Mar,passive,NEGATIVE,505,other


#### Grouped by Topic Type

In [0]:
# Number of negative comments per predicted topic, most frequent first.
label_counts = df_labeled.groupBy('predicted_label').count()
df_grouped_by_label = label_counts.orderBy('count', ascending=False)
df_grouped_by_label.display()

predicted_label,count
other,988
personalization,896
repetitive,834
shipping/packaging,433
excitement/quality,390
value/price,204


In [0]:
# Same aggregate displayed again (presumably so a different chart can be
# attached to this cell in the notebook UI — confirm).
df_grouped_by_label.display()

predicted_label,count
other,988
personalization,896
repetitive,834
shipping/packaging,433
excitement/quality,390
value/price,204


#### Grouped by Topic Type, Month

In [0]:
# Comment counts per (topic, month): month ascending, then largest count first.
# Columns are named by string with an `ascending` list, so the cell does not
# depend on `col`, which this notebook never imports.
df_grouped_by_label_month = (
  df_labeled
  .groupBy(['predicted_label', 'month'])
  .count()
  .sort('month', 'count', ascending=[True, False])
)
df_grouped_by_label_month.display()

predicted_label,month,count
repetitive,2 - Feb,222
personalization,2 - Feb,212
other,2 - Feb,193
excitement/quality,2 - Feb,81
shipping/packaging,2 - Feb,55
value/price,2 - Feb,43
other,3 - Mar,210
personalization,3 - Mar,210
repetitive,3 - Mar,177
shipping/packaging,3 - Mar,100


In [0]:
# Same aggregate displayed again (presumably so a different chart can be
# attached to this cell in the notebook UI — confirm).
df_grouped_by_label_month.display()

predicted_label,month,count
repetitive,2 - Feb,222
personalization,2 - Feb,212
other,2 - Feb,193
excitement/quality,2 - Feb,81
shipping/packaging,2 - Feb,55
value/price,2 - Feb,43
other,3 - Mar,210
personalization,3 - Mar,210
repetitive,3 - Mar,177
shipping/packaging,3 - Mar,100


In [0]:
# Same aggregate displayed a third time (presumably for another chart
# configuration — confirm).
df_grouped_by_label_month.display()

predicted_label,month,count
repetitive,2 - Feb,222
personalization,2 - Feb,212
other,2 - Feb,193
excitement/quality,2 - Feb,81
shipping/packaging,2 - Feb,55
value/price,2 - Feb,43
other,3 - Mar,210
personalization,3 - Mar,210
repetitive,3 - Mar,177
shipping/packaging,3 - Mar,100


#### Grouped by Topic Type, Month, Subscription

In [0]:
# Comment counts per (topic, month, subscription): subscription and month
# ascending, then largest count first. Columns are named by string with an
# `ascending` list, so the cell does not depend on `col`, which this notebook
# never imports.
df_grouped_by_label_month_subscription = (
  df_labeled
  .groupBy(['predicted_label', 'month', 'subscription'])
  .count()
  .sort('subscription', 'month', 'count', ascending=[True, True, False])
)
df_grouped_by_label_month_subscription.display()

predicted_label,month,subscription,count
personalization,2 - Feb,GB,151
repetitive,2 - Feb,GB,151
other,2 - Feb,GB,147
excitement/quality,2 - Feb,GB,44
shipping/packaging,2 - Feb,GB,29
value/price,2 - Feb,GB,28
other,3 - Mar,GB,163
personalization,3 - Mar,GB,132
repetitive,3 - Mar,GB,132
shipping/packaging,3 - Mar,GB,68


In [0]:
# Same aggregate displayed again (presumably so a different chart can be
# attached to this cell in the notebook UI — confirm).
df_grouped_by_label_month_subscription.display()

predicted_label,month,subscription,count
personalization,2 - Feb,GB,151
repetitive,2 - Feb,GB,151
other,2 - Feb,GB,147
excitement/quality,2 - Feb,GB,44
shipping/packaging,2 - Feb,GB,29
value/price,2 - Feb,GB,28
other,3 - Mar,GB,163
personalization,3 - Mar,GB,132
repetitive,3 - Mar,GB,132
shipping/packaging,3 - Mar,GB,68


In [0]:
# Same aggregate displayed a third time (presumably for another chart
# configuration — confirm).
df_grouped_by_label_month_subscription.display()

predicted_label,month,subscription,count
personalization,2 - Feb,GB,151
repetitive,2 - Feb,GB,151
other,2 - Feb,GB,147
excitement/quality,2 - Feb,GB,44
shipping/packaging,2 - Feb,GB,29
value/price,2 - Feb,GB,28
other,3 - Mar,GB,163
personalization,3 - Mar,GB,132
repetitive,3 - Mar,GB,132
shipping/packaging,3 - Mar,GB,68


#### Grouped by NPS

In [0]:
# Comment counts per NPS score, largest count first. The sort column is named
# by string, so the cell does not depend on `col`, which this notebook never
# imports.
df_grouped_by_nps = (
  df_labeled
  .groupBy(['nps'])
  .count()
  .sort('count', ascending=False)
)
df_grouped_by_nps.display()

nps,count
7,2223
8,1522


#### Grouped by NPS, Topic Type

In [0]:
# Comment counts per (topic, NPS score): score ascending, then largest count
# first. Columns are named by string with an `ascending` list, so the cell
# does not depend on `col`, which this notebook never imports.
df_grouped_by_label_nps = (
  df_labeled
  .groupBy(['predicted_label', 'nps'])
  .count()
  .sort('nps', 'count', ascending=[True, False])
)
df_grouped_by_label_nps.display()

predicted_label,nps,count
personalization,7,561
other,7,526
repetitive,7,513
shipping/packaging,7,271
excitement/quality,7,240
value/price,7,112
other,8,462
personalization,8,335
repetitive,8,321
shipping/packaging,8,162


In [0]:
# Same aggregate displayed again (presumably so a different chart can be
# attached to this cell in the notebook UI — confirm).
df_grouped_by_label_nps.display()

predicted_label,nps,count
personalization,7,561
other,7,526
repetitive,7,513
shipping/packaging,7,271
excitement/quality,7,240
value/price,7,112
other,8,462
personalization,8,335
repetitive,8,321
shipping/packaging,8,162


In [0]:
# Same aggregate displayed a third time (presumably for another chart
# configuration — confirm).
df_grouped_by_label_nps.display()

predicted_label,nps,count
personalization,7,561
other,7,526
repetitive,7,513
shipping/packaging,7,271
excitement/quality,7,240
value/price,7,112
other,8,462
personalization,8,335
repetitive,8,321
shipping/packaging,8,162


#### Grouped by NPS, Topic Type, Month

In [0]:
# Comment counts per (topic, NPS score, month): score and month ascending,
# then largest count first. Columns are named by string with an `ascending`
# list, so the cell does not depend on `col`, which this notebook never
# imports.
df_grouped_by_label_month_nps = (
  df_labeled
  .groupBy(['predicted_label', 'nps', 'month'])
  .count()
  .sort('nps', 'month', 'count', ascending=[True, True, False])
)
df_grouped_by_label_month_nps.display()

predicted_label,nps,month,count
personalization,7,2 - Feb,142
repetitive,7,2 - Feb,132
other,7,2 - Feb,102
excitement/quality,7,2 - Feb,49
shipping/packaging,7,2 - Feb,37
value/price,7,2 - Feb,26
personalization,7,3 - Mar,132
other,7,3 - Mar,115
repetitive,7,3 - Mar,109
shipping/packaging,7,3 - Mar,58


In [0]:
# Same aggregate displayed again (presumably so a different chart can be
# attached to this cell in the notebook UI — confirm).
df_grouped_by_label_month_nps.display()

predicted_label,nps,month,count
personalization,7,2 - Feb,142
repetitive,7,2 - Feb,132
other,7,2 - Feb,102
excitement/quality,7,2 - Feb,49
shipping/packaging,7,2 - Feb,37
value/price,7,2 - Feb,26
personalization,7,3 - Mar,132
other,7,3 - Mar,115
repetitive,7,3 - Mar,109
shipping/packaging,7,3 - Mar,58


In [0]:
# Same aggregate displayed a third time (presumably for another chart
# configuration — confirm).
df_grouped_by_label_month_nps.display()

predicted_label,nps,month,count
personalization,7,2 - Feb,142
repetitive,7,2 - Feb,132
other,7,2 - Feb,102
excitement/quality,7,2 - Feb,49
shipping/packaging,7,2 - Feb,37
value/price,7,2 - Feb,26
personalization,7,3 - Mar,132
other,7,3 - Mar,115
repetitive,7,3 - Mar,109
shipping/packaging,7,3 - Mar,58


#### Grouped by NPS, Topic Type, Subscription

In [0]:
# Comment counts per (topic, NPS score, subscription): subscription and score
# ascending, then largest count first. Columns are named by string with an
# `ascending` list, so the cell does not depend on `col`, which this notebook
# never imports.
df_grouped_by_label_subscription_nps = (
  df_labeled
  .groupBy(['predicted_label', 'nps', 'subscription'])
  .count()
  .sort('subscription', 'nps', 'count', ascending=[True, True, False])
)
df_grouped_by_label_subscription_nps.display()

predicted_label,nps,subscription,count
other,7,GB,391
personalization,7,GB,375
repetitive,7,GB,354
shipping/packaging,7,GB,161
excitement/quality,7,GB,132
value/price,7,GB,73
other,8,GB,354
repetitive,8,GB,226
personalization,8,GB,216
shipping/packaging,8,GB,102


In [0]:
# Same aggregate displayed again (presumably so a different chart can be
# attached to this cell in the notebook UI — confirm).
df_grouped_by_label_subscription_nps.display()

predicted_label,nps,subscription,count
other,7,GB,391
personalization,7,GB,375
repetitive,7,GB,354
shipping/packaging,7,GB,161
excitement/quality,7,GB,132
value/price,7,GB,73
other,8,GB,354
repetitive,8,GB,226
personalization,8,GB,216
shipping/packaging,8,GB,102


In [0]:
# Same aggregate displayed a third time (presumably for another chart
# configuration — confirm).
df_grouped_by_label_subscription_nps.display()

predicted_label,nps,subscription,count
other,7,GB,391
personalization,7,GB,375
repetitive,7,GB,354
shipping/packaging,7,GB,161
excitement/quality,7,GB,132
value/price,7,GB,73
other,8,GB,354
repetitive,8,GB,226
personalization,8,GB,216
shipping/packaging,8,GB,102


#### Grouped by NPS, Topic Type, Subscription, Month

In [0]:
# Comment counts per (topic, NPS score, month, subscription): subscription,
# month and score ascending, then largest count first. Columns are named by
# string with an `ascending` list, so the cell does not depend on `col`,
# which this notebook never imports.
df_grouped_by_label_subscription_month_nps = (
  df_labeled
  .groupBy(['predicted_label', 'nps', 'month', 'subscription'])
  .count()
  .sort('subscription', 'month', 'nps', 'count', ascending=[True, True, True, False])
)
df_grouped_by_label_subscription_month_nps.display()

predicted_label,nps,month,subscription,count
personalization,7,2 - Feb,GB,102
repetitive,7,2 - Feb,GB,92
other,7,2 - Feb,GB,73
excitement/quality,7,2 - Feb,GB,27
shipping/packaging,7,2 - Feb,GB,18
value/price,7,2 - Feb,GB,18
other,8,2 - Feb,GB,74
repetitive,8,2 - Feb,GB,59
personalization,8,2 - Feb,GB,49
excitement/quality,8,2 - Feb,GB,17
