To import the data from BQ we have used the tutorial "Visualizing BigQuery public data that was located in the tutorials folder of repo.

Additional modifications have been made to filter and remove duplicates in the data.

Information about the sentiment classification library can be found here: https://huggingface.co/marma/bert-base-swedish-cased-sentiment

In [25]:
from google.cloud import bigquery
import pandas as pd
from transformers import pipeline
import torch
import numpy as np
from sklearn.metrics import (precision_recall_fscore_support)

client = bigquery.Client()

In [None]:
# Get all the labeled comments that are labeled 1 or -1
sql_labeled = """
SELECT * FROM `BQ TABLE NAME` WHERE NOT sentiment_label=0
"""

'''
Pre-processing of the labeled data
'''
dfs = client.query(sql_labeled).to_dataframe() 
fb_comments_labeled_df = pd.concat(dfs)
fb_comments_labeled_df = fb_comments_labeled_df.drop_duplicates(subset='comment_id')

fb_comments_labeled_df.head()

In [26]:
def calculate_precision_recall(targets, predictions):
    scores = precision_recall_fscore_support(predictions, targets, average=None)
    p_r_f_for_class = dict()
    for i in range(2):
        label = 'Positive'
        if i == 0:
            label = 'Negative'
            
        p_r_f_for_class[label] = (scores[0][i], scores[1][i], scores[2][i])
    return  p_r_f_for_class

In [27]:
# Load the test set
test_df = pd.read_pickle("./test_labeled_data.pkl")
all_labeled_df = pd.read_pickle("./labeled_filtered_comments.pkl")

# Set the labels to be 1 or -1
test_df['sentiment_label'] = test_df['sentiment_label'].map({'pos':1, 'neg': 0})
all_labeled_df['sentiment_label'] = all_labeled_df['sentiment_label'].map({'pos':1, 'neg': 0})

print("There are ", len(test_df), " comments in the test set")
print("There are ", len(all_labeled_df), " labeled comments total")
n_neg, n_pos = all_labeled_df["sentiment_label"].value_counts()[0], all_labeled_df["sentiment_label"].value_counts()[1]
print("Negative: %d%% (%d)" %(n_neg*100/len(all_labeled_df), n_neg))
print("Positive: %d%% (%d)" %(n_pos*100/len(all_labeled_df), n_pos))

There are  280  comments in the test set
There are  982  labeled comments total
Negative: 74% (728)
Positive: 25% (254)


In [30]:
sa = pipeline('sentiment-analysis', model='marma/bert-base-swedish-cased-sentiment')

targets = list(all_labeled_df['sentiment_label'].values)
features = list(all_labeled_df['message'].values)
preds = []

# classify all texts in the labeled dataset
for text, label in zip(features, targets):
    # print('text', text)
    # print('label', label)
    if len(text) < 512:
        pred = sa(text)
        label = pred[0]['label']
        if label == 'NEGATIVE':
            preds.append(0)
        else:
            preds.append(1)
    else:
        preds.append(0) # classify as negative


In [31]:
# Calculate the accuracy of the predicitons

scores = calculate_precision_recall(targets, preds)
print('scores: ', scores)
count = 0
for i, (pred, target) in enumerate(zip(preds, targets)):
    if pred == target:
        count += 1
    # else:
        # print('Predicted label: ', pred)
        # print('Target label: ', target)
        # print('Message: ', features[i])
        
test_accuracy = count / len(preds)
print("Accuracy: {0:.3f}".format(test_accuracy))

scores:  {'Negative': (0.8832417582417582, 0.9455882352941176, 0.9133522727272727), 'Positive': (0.8543307086614174, 0.7185430463576159, 0.7805755395683454)}
Accuracy: 0.876
