# Objective
- the objective of is to classify a news to ten categories
- First we install the cohere package which will help us connect ot the cohere API
    - ***pip install cohere***
- Then we generate API key for the cohere

# 1. Get the packages

In [12]:
import pandas as pd
import cohere
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


# 2. Get the dataset

In [13]:
# ADD YOUR API KEY HERE
api_key = pd.read_json('../config/apikey.json')['cohere_key'][0]
co =  cohere.Client(api_key)

In [14]:
df = pd.read_csv('../data/news/news_data.csv')
df


Unnamed: 0,Domain,Title,Description,Body,Link,timestamp,Analyst_Average_Score,Analyst_Rank,Reference_Final_Score
0,rassegnastampa.news,Boris Johnson using a taxpayer-funded jet for ...,…often trigger a protest vote that can upset…t...,Boris Johnson using a taxpayer-funded jet for ...,https://rassegnastampa.news/boris-johnson-usin...,2021-09-09T18:17:46.258006,0.0,4,1.96
1,twitter.com,"Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...",http://twitter.com/CoruscaKhaya/status/1435585...,2021-09-08T13:02:45.802298,0.0,4,12.0
2,atpe-tchad.info,Marché Résines dans les peintures et revêtemen...,…COVID-19…COVID…COVID…COVID-19 et Post COVID…C...,Le rapport d’étude de marché Résines dans les ...,http://atpe-tchad.info/2021/09/13/marche-resin...,2021-09-13T07:32:46.244403,0.0,4,0.05
3,badbluetech.bitnamiapp.com,"AI drives data analytics surge, study finds",…hate raiders' linked to automated harassment ...,How to drive the funnel through content market...,http://badbluetech.bitnamiapp.com/p.php?sid=21...,2021-09-11T00:17:45.962605,0.0,4,6.1
4,kryptogazette.com,Triacetin Vertrieb Markt 2021: Globale Unterne...,…Abschnitten und Endanwendungen / Organisation...,Global Triacetin Vertrieb-Markt 2021 von Herst...,https://kryptogazette.com/2021/09/08/triacetin...,2021-09-08T12:47:46.078369,0.0,4,0.13
5,mype.co.za,Male arrested for the murder of an elderly fem...,…Crime Stamp Out…N1 and R101 roads appear in c...,South African Police Service Office of the Pro...,https://mype.co.za/new/male-arrested-for-the-m...,2021-09-10T00:17:46.055622,1.33,2,11.0
6,eminetra.co.za,7th Anniversary of SCOAN Collapse in Nigeria-S...,"…in Lagos, Nigeria, 84 South Africans were kil...",Today is the 7th anniversary [Tragic collapse ...,https://eminetra.co.za/7th-anniversary-of-scoa...,2021-09-12T05:17:50.279081,0.0,4,10.1
7,eminetra.co.za,The construction sector is expected to be boos...,"…additional spending on buildings, repairs and...",Construction activity grew steadily by 4% in t...,https://eminetra.co.za/the-construction-sector...,2021-09-09T09:02:46.320793,1.66,1,1.36
8,news24.com,News24.com | Court dismisses attempt by former...,…Lawsuit Against Public Participation) designe...,- Former Eskom CEO Matshela Moses Koko sought ...,https://www.news24.com/news24/southafrica/news...,2021-09-09T19:32:46.239682,0.33,3,2.4
9,manometcurrent.com,Global and Regional Beta-Carotene Market Resea...,…key players! – DSM – BASF – Allied Biotech – ...,Global and Regional Beta-Carotene Market Resea...,https://manometcurrent.com/global-and-regional...,2021-09-13T03:02:45.609228,0.0,4,0.22


In [15]:
df['Analyst_Rank'] = df['Analyst_Rank'].apply(lambda x: 0 if x<4 else 1)
df['Analyst_Rank'].value_counts()

1    7
0    3
Name: Analyst_Rank, dtype: int64

In [16]:
# Split the dataset into training and test portions
# Training = For use in Sections 2 and 3
# Test = For evaluating the classifier performance
X, y = df["Description"], df["Analyst_Rank"].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=21)

In [17]:
# View the list of all available categories
intents = y_train.unique().tolist()
print(intents)


['1', '0']


# 3. Few-shot classification with the Classify endpoint

### Preparing Examble

In [18]:
# Set the number of examples per category
EX_PER_CAT = 6

# Create list of examples containing texts and labels - sample from the dataset
ex_texts, ex_labels = [], []
for intent in intents:
  y_temp = y_train[y_train == intent]
  sample_indexes = y_temp.index#.sample(n=EX_PER_CAT, random_state=42).index
  ex_texts += X_train[sample_indexes].tolist()
  ex_labels += y_train[sample_indexes].tolist()

# print(f'Number of examples per class: {EX_PER_CAT}')
print(f'Number of classes: {len(intents)}')
print(f'Total number of examples: {len(ex_texts)}')


Number of classes: 2
Total number of examples: 9


## 4. Get classifications via the Classify endpoint

In [19]:
# Collate the examples via the Example module
from cohere.classify import Example

examples = list()
for txt, lbl in zip(X_train,y_train):
  examples.append(Example(txt,lbl))
  

In [20]:
def classify_text(text, examples):
  classifications = co.classify(
      model='xlarge',  # model version - medium-22020720
      inputs=[text],
      examples=examples
  )
  return classifications.classifications[0].prediction


In [21]:
with open('../data/output/news_examples.txt', 'w') as f:
    for item in examples:
        # write each item on a new line
        f.write("%s\n" % str(item).strip())
    print('Done')


Done


In [32]:
for ex in examples:
    print()


cohere.Example 

text: …COVID-19…COVID…COVID…COVID-19 et Post COVID…COVID…Covid…BASF…COVID…COVID-19 3.4.1 Évaluation des risques liés au COVID-19 3.4.2 Évaluation de l’impact global du COVID-19 sur l’industrie 3.4.3 Scénario de marché avant et après COVID…covid…covid…
label: 1


cohere.Example 

text: …additional spending on buildings, repairs and security in KwaZulu-Natal and Houten could boost construction activity in the third…starting block.” He hopes that further relaxation of blockade regulations and tax jackpots against the backdrop of record…
label: 0


cohere.Example 

text: …hate raiders' linked to automated harassment campaigns [engadget](d.php?did=engadget.com) Texas law could force social media to host misinformation and hate speech [ars](d.php?did=arstechnica.com)…breach…shot on…stolen data…Killed…M1…Toxic…Attack…
label: 1


cohere.Example 

text: …Crime Stamp Out…N1 and R101 roads appear in court – SAPS Crime Report:…Crime Report:…robbery – SAPS Crime Report: 2021-09-09 

In [23]:
# Generate classification predictions on the test dataset (this will take a few minutes)
y_pred = X_test.apply(classify_text, args=(examples,)).tolist()

In [24]:
# Compute metrics on the test dataset
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {100*accuracy:.2f}')
print(f'F1-score: {100*f1:.2f}')


Accuracy: 100.00
F1-score: 100.00


# 5. Get the embedding of the news title

In [25]:
# Embed the training set
embeddings_train = co.embed(texts=X_train.tolist(),
                            model="large",
                            truncate="LEFT").embeddings
# Embed the testing set
embeddings_test = co.embed(texts=X_test.tolist(),
                           model="large",
                           truncate="LEFT").embeddings


In [26]:
print(f"Review text: {X_train[0]}")
print(f"Embedding vector: {embeddings_train[0][:10]}")


Review text: …often trigger a protest vote that can upset…that ministers who breach the Ministerial Code…plane up the M1 in order to…serious questions about abuse of taxpayers' money which is a clear breach of the code.…should not permit abuse of taxpayers' money…
Embedding vector: [0.80549407, 1.0933834, 0.70318717, 1.4982257, -0.53660864, -0.78288454, 0.86601657, -2.3292928, 0.3933644, 0.010739213]


# 6. Train a classifier using the training set

In [27]:
# import SVM classifier code
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Initialize a support vector machine, with class_weight='balanced' because
# our training set has roughly an equal amount of positive and negative
# sentiment sentences
svm_classifier = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))

# fit the support vector machine
svm_classifier.fit(embeddings_train, y_train)


In [28]:
# get the score from the test set, and print it out to screen!
score = svm_classifier.score(embeddings_test, y_test)
print(f"Validation accuracy on Large is {100*score}%!")


Validation accuracy on Large is 100.0%!


Validation accuracy on Large is 66.7%!
This was a small scale example, meant as a proof of concept and designed to illustrate how you can build a custom classifier quickly using a small amount of labelled data and Cohere's embeddings. Increase the number of training examples to achieve better performance on this task.