# Objective
- the objective of is to classify a news to ten categories
- First we install the cohere package which will help us connect ot the cohere API
    - ***pip install cohere***
- Then we generate API key for the cohere

# 1. Get the packages

In [115]:
import pandas as pd
import cohere
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# 2. Get the dataset

In [116]:
# ADD YOUR API KEY HERE
api_key = pd.read_json('../config/apikey.json')['cohere_key'][0]
co =  cohere.Client(api_key)

In [117]:
df = pd.read_csv('../data/news/news_data.csv')
df.shape


(10, 9)

In [118]:
df['Analyst_Rank'] = df['Analyst_Rank'].apply(lambda x: 0 if x<4 else 1)
df['Analyst_Rank'].value_counts()

1    7
0    3
Name: Analyst_Rank, dtype: int64

In [119]:
# Split the dataset into training and test portions
# Training = For use in Sections 2 and 3
# Test = For evaluating the classifier performance
X, y = df["Title"], df["Analyst_Rank"].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=21)


In [120]:
# View the list of all available categories
intents = y_train.unique().tolist()
print(intents)


['1', '0']


# 3. Few-shot classification with the Classify endpoint

### Preparing Examble

In [121]:
# Set the number of examples per category
EX_PER_CAT = 6

# Create list of examples containing texts and labels - sample from the dataset
ex_texts, ex_labels = [], []
for intent in intents:
  y_temp = y_train[y_train == intent]
  sample_indexes = y_temp.index#.sample(n=EX_PER_CAT, random_state=42).index
  ex_texts += X_train[sample_indexes].tolist()
  ex_labels += y_train[sample_indexes].tolist()

# print(f'Number of examples per class: {EX_PER_CAT}')
print(f'Number of classes: {len(intents)}')
print(f'Total number of examples: {len(ex_texts)}')


Number of classes: 2
Total number of examples: 9


## 4. Get classifications via the Classify endpoint

In [122]:
# Collate the examples via the Example module
from cohere.classify import Example

examples = list()
for txt, lbl in zip(X_train,y_train):
  examples.append(Example(txt,lbl))

In [123]:
def classify_text(text, examples):
  classifications = co.classify(
      model='medium',  # model version - medium-22020720
      inputs=[text],
      examples=examples
  )
  return classifications.classifications[0].prediction


In [124]:
examples


[cohere.Example {
 	text: Marché Résines dans les peintures et revêtements 2021 avec les données des meilleurs pays et l’analyse Covid-19, la portée future, l’estimation de la taille, les revenus, les tendances des prix et les prévisions d’ici 2026
 	label: 1
 },
 cohere.Example {
 	text: The construction sector is expected to be boosted by riots and looting repairs
 	label: 0
 },
 cohere.Example {
 	text: AI drives data analytics surge, study finds
 	label: 1
 },
 cohere.Example {
 	text: Male arrested for the murder of an elderly female in Cofimvaba – SAPS Crime Report: 2021-09-09 13:22:58
 	label: 0
 },
 cohere.Example {
 	text: 7th Anniversary of SCOAN Collapse in Nigeria-SABC News
 	label: 1
 },
 cohere.Example {
 	text: Boris Johnson using a taxpayer-funded jet for an election campaign fits a long history of taking things he didn't pay for
 	label: 1
 },
 cohere.Example {
 	text: Triacetin Vertrieb Markt 2021: Globale Unternehmensanalyse, Merkmale, Marktplatzlänge und -prognosen 

In [125]:
# Generate classification predictions on the test dataset (this will take a few minutes)
y_pred = X_test.apply(classify_text, args=(examples,)).tolist()


In [126]:
# Compute metrics on the test dataset
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {100*accuracy:.2f}')
print(f'F1-score: {100*f1:.2f}')


Accuracy: 100.00
F1-score: 100.00


# 5. Get the embedding of the news title

In [127]:
# Embed the training set
embeddings_train = co.embed(texts=X_train.tolist(),
                            model="large",
                            truncate="LEFT").embeddings
# Embed the testing set
embeddings_test = co.embed(texts=X_test.tolist(),
                           model="large",
                           truncate="LEFT").embeddings


In [128]:
print(f"Review text: {X_train[0]}")
print(f"Embedding vector: {embeddings_train[0][:10]}")


Review text: Boris Johnson using a taxpayer-funded jet for an election campaign fits a long history of taking things he didn't pay for
Embedding vector: [1.5737003, 0.4941845, -1.4789015, 0.08786164, 0.322134, -0.22437036, 2.4170856, 0.18827693, -0.56460893, 1.5627791]


# 6. Train a classifier using the training set

In [129]:
# import SVM classifier code
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Initialize a support vector machine, with class_weight='balanced' because
# our training set has roughly an equal amount of positive and negative
# sentiment sentences
svm_classifier = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))

# fit the support vector machine
svm_classifier.fit(embeddings_train, y_train)


In [130]:
# get the score from the test set, and print it out to screen!
score = svm_classifier.score(embeddings_test, y_test)
print(f"Validation accuracy on Large is {100*score}%!")


Validation accuracy on Large is 100.0%!


Validation accuracy on Large is 66.7%!
This was a small scale example, meant as a proof of concept and designed to illustrate how you can build a custom classifier quickly using a small amount of labelled data and Cohere's embeddings. Increase the number of training examples to achieve better performance on this task.

In [131]:
# Collate the examples via the Example module
from cohere.classify import Example

examples = list()
for txt, lbl in zip(X_train, y_train):
  examples.append(Example(txt,lbl))
