In [None]:
# Fetch the dataset CSV read later by pd.read_csv.
# -nc (no-clobber) makes wget skip the download when the file is already present.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [None]:
!pip install transformers

In [None]:
import textwrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import torch
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from transformers import pipeline

In [None]:
# Build the Hugging Face zero-shot classification pipeline.
# - The checkpoint is named explicitly (it is the library's default for this
#   task) so results do not silently change if that default is ever updated.
# - Use the first GPU when CUDA is available; otherwise fall back to CPU (-1)
#   instead of failing on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=device,
)

In [None]:
# Smoke-test the pipeline on a short sentence with two candidate labels.
classifier(
    "This is a great movie",
    candidate_labels=["positive", "negative"],
)

In [None]:
text = """5' AMP-activated protein kinase or AMPK or 5' adenosine monophosphate-activated protein kinase is an enzyme (EC 2.7.11.31) that plays a role in cellular energy homeostasis, largely to activate glucose and fatty acid uptake and oxidation when cellular energy is low. It belongs to a highly conserved eukaryotic protein family and its orthologues are SNF1 in yeast, and SnRK1 in plants. It consists of three proteins (subunits) that together make a functional enzyme, conserved from yeast to humans. It is expressed in a number of tissues, including the liver, brain, and skeletal muscle. In response to binding AMP and ADP,[1] the net effect of AMPK activation is stimulation of hepatic fatty acid oxidation, ketogenesis, stimulation of skeletal muscle fatty acid oxidation and glucose uptake, inhibition of cholesterol synthesis, lipogenesis, and triglyceride synthesis, inhibition of adipocyte lipogenesis, inhibition of adipocyte lipolysis, and modulation of insulin secretion by pancreatic β-cells.[2]

It should not be confused with cyclic AMP-activated protein kinase (protein kinase A).[3]"""

# Score the passage stored in `text` against three subject-area labels.
classifier(
    text,
    candidate_labels=["biology", "math", "geology"],
)

In [None]:
# Load the downloaded CSV; later cells read its 'text' and 'labels' columns.
df = pd.read_csv('bbc_text_cls.csv')

In [None]:
# Number of rows in the dataset.
df.shape[0]

In [None]:
# Peek at a few random rows. Sampling n rows directly avoids shuffling the whole
# frame just to display five of them, and the fixed random_state keeps the
# displayed sample identical across re-runs. min() guards frames with < 5 rows.
df.sample(n=min(5, len(df)), random_state=0)

In [None]:
# Distinct class names, sorted. Iterating a bare set of strings can yield a
# different order in every kernel session, which would reorder the candidate
# labels, the probability columns and the confusion-matrix axes between runs.
labels = sorted(set(df['labels']))
labels

In [None]:
# Print the article at position 1024, wrapped to textwrap's default line width.
print(textwrap.fill(df['text'].iloc[1024]))

In [None]:
# Ground-truth label of the article at position 1024.
df['labels'].iloc[1024]

In [None]:
# Zero-shot classify the article at position 1024, using the dataset's own
# class names as the candidate labels.
classifier(
    df['text'].iloc[1024],
    candidate_labels=labels,
)

In [None]:
# Takes about 55 mins
# Run the zero-shot classifier over every document in one call; `preds` ends up
# with one result per input text.
preds = classifier(
    list(df['text']),
    candidate_labels=labels,
)

In [None]:
# Show only the first few results; displaying the whole list would dump one
# dict per document into the notebook output.
preds[:3]

In [None]:
# The predicted class for each document is the first entry of that result's
# 'labels' list.
predicted_labels = []
for result in preds:
    top_label = result['labels'][0]
    predicted_labels.append(top_label)

In [None]:
# Store the predictions next to the ground truth so they can be compared row by row.
# Note: this adds a column to `df` in place.
df['predicted_labels'] = predicted_labels

In [None]:
# Accuracy = fraction of rows whose predicted label equals the true label.
is_correct = df['predicted_labels'] == df['labels']
print("Acc:", is_correct.mean())

In [None]:
# Arrange the per-document scores into an N x K array whose columns follow the
# order of `labels`. Each result carries its own 'labels'/'scores' lists in
# parallel, so every score is routed to its column by label name.
N, K = len(df), len(labels)
label2idx = {name: col for col, name in enumerate(labels)}

probs = np.zeros((N, K))
for row in range(N):
    result = preds[row]
    for name, score in zip(result['labels'], result['scores']):
        probs[row, label2idx[name]] = score


In [None]:
# Encode the ground-truth label names as their column indices in `probs`.
int_labels = df['labels'].map(label2idx).tolist()

In [None]:
# Predicted class index = column with the highest score in each row.
int_preds = probs.argmax(axis=1)
# normalize='true' scales each row (true class) of the matrix to sum to 1.
cm = confusion_matrix(
    y_true=int_labels,
    y_pred=int_preds,
    normalize='true',
)

In [None]:
def plot_cm(cm, class_names=None, ax=None):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cm : array-like of shape (K, K)
        Confusion matrix; rows are true classes, columns are predicted classes.
    class_names : sequence of str, optional
        Tick labels for both axes. Defaults to the notebook-level `labels` list,
        which is what the original single-argument call relied on.
    ax : matplotlib Axes, optional
        Axes to draw into. When omitted, seaborn uses the current Axes.

    Returns
    -------
    matplotlib Axes
        The Axes holding the heatmap, so callers can customise it further.
    """
    if class_names is None:
        class_names = labels
    df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
    ax = sn.heatmap(df_cm, annot=True, fmt='.2g', ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Target")
    return ax

# Trailing semicolon suppresses the Axes repr in the cell output.
plot_cm(cm);

In [None]:
# Micro-averaged F1 over the string labels.
# Note: for single-label multiclass problems, micro-averaged F1 is numerically
# equal to accuracy, so this should match the "Acc:" value printed earlier.
f1_score(
    y_true=df['labels'],
    y_pred=predicted_labels,
    average='micro',
)

In [None]:
# Multiclass ROC AUC using the one-vs-one scheme, computed from the integer
# class indices and the N x K score matrix.
roc_auc_score(
    y_true=int_labels,
    y_score=probs,
    multi_class='ovo',
)