In [1]:
import numpy as np
import pandas as pd

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import pipeline

In [2]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, config = config, truncation = True)
classifier("This community is so helpful!")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'label': 'positive', 'score': 0.9821994304656982}]

In [3]:
df_input = pd.read_csv('data_2023.csv', low_memory = False, on_bad_lines = "skip")

In [4]:
#Initial copy to avoid re-input
df = df_input.copy()
#Drop empty comment
df = df.dropna(subset = "Comment_Translation")
#Filter columns
df = df[['CGID', 'Survey_Completed_Date', 'CustomerNumber', 'HeinekenRegion', 'Country', 'CustomerCluster',
         'BusinessSegment', 'CustomerType', 'NetPromoterScore', 'Reasons_List', 'Comment', 'Comment_Translation']]
#Extract months
#df['Survey_Completed_Date'] = pd.to_datetime(df['Survey_Completed_Date'])
#df['Month'] = pd.DatetimeIndex(df['Survey_Completed_Date']).month
#Filter countries
#df = df[df['Country'] != 'VN'] 
df = df[df['Country'] != 'MX'] #Different operating system
df = df[df['Country'] != 'BI'] #Translation error
df = df[df['Country'] != 'KH'] #Translation error

#Data cleaning
#Convert everything to string type
df['Comment_Translation'] = df['Comment_Translation'].astype('str')
#Exclude numeric
df = df[~df['Comment_Translation'].str.isdigit()]
df = df[~df['Comment_Translation'].str.isnumeric()]
df = df[df['Comment_Translation'] != '100%']
#Exclue certain patterns that confuse the model (e.g. Nil vs NB (shorts for Nigerian breweries))
df = df[df['Comment_Translation'] != 'Nil']
df = df[df['Comment_Translation'] != 'Nill']
#Exclude CSAT
df = df[df['NetPromoterScore'] != -1]
#Remove certain patterns + HTML linebreak
rmv_lst = ['<br/>', '=', '=-']
for i in range(len(rmv_lst)):
    df = df.replace(rmv_lst[i], '', regex = True)
#Remove NULL/BLANK
df = df[df['Comment_Translation'] != '']
df = df[df['Comment_Translation'].apply(lambda x: len(x) < 512)]
#display(df)

In [None]:
#df = df.iloc[1:100]
df = (
    df
    .assign(sentiment = lambda x: x['Comment_Translation'].apply(lambda s: classifier(s)))
    .assign(
         label = lambda x: x['sentiment'].apply(lambda s: (s[0]['label'])),
         score = lambda x: x['sentiment'].apply(lambda s: (s[0]['score']))
    )
)

In [6]:
df.to_csv('SA_Results.csv', encoding = 'utf_8_sig', index = False)