# Objective
- The objective is to classify news articles into ten categories
- First we install the cohere package, which will help us connect to the Cohere API
    - ***pip install cohere***
- Then we generate an API key for Cohere

# 1. Get the packages

In [14]:
import os

import cohere
import pandas as pd
from sklearn.model_selection import train_test_split


# 2. Get the dataset

In [15]:
# Do not truncate long cell contents when pandas displays a frame
pd.set_option("display.max_colwidth", None)

# Load the news dataset from the CSV stored next to the notebook folder
news_csv_path = '../data/news_data.csv'
df = pd.read_csv(news_csv_path)

# Show which columns are available
df.columns


Index(['Domain', 'Title', 'Description', 'Body', 'Link', 'timestamp',
       'Analyst_Average_Score', 'Analyst_Rank', 'Reference_Final_Score'],
      dtype='object')

In [18]:
# Inputs are the news titles; targets are the analyst ranks
titles = list(df['Title'])
analyst_ranks = list(df['Analyst_Rank'])

# Hold out 25% of the rows for testing; random_state=0 keeps the split repeatable
split_parts = train_test_split(
    titles,
    analyst_ranks,
    test_size=0.25,
    random_state=0,
)
sentences_train, sentences_test, labels_train, labels_test = split_parts


# 3. Get the embedding of the news title

In [2]:
# Read the Cohere API key from the environment instead of hardcoding it, so the
# secret never ends up in version control or in shared copies of the notebook.
# SECURITY: a literal key was previously committed in this cell; treat it as
# compromised and revoke it from the Cohere dashboard.
api_key = os.environ.get("COHERE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running this notebook."
    )

# Client used by the embedding and classification cells that follow
co = cohere.Client(api_key)


In [21]:
def embed_texts(texts):
    """Return the embeddings Cohere produces for ``texts``.

    Every request uses model="large" and truncate="LEFT".
    NOTE(review): truncate="LEFT" presumably trims the start of over-long
    inputs -- confirm against the Cohere embed documentation.
    """
    response = co.embed(texts=texts, model="large", truncate="LEFT")
    return response.embeddings


# Embed the training titles first, then the held-out test titles
embeddings_train = embed_texts(sentences_train)
embeddings_test = embed_texts(sentences_test)


In [22]:
# Sanity check: show the first training title and the first ten components of
# its embedding vector.
# NOTE(review): the "Review text" label is leftover wording; the value printed
# is a news title taken from the 'Title' column.
first_title = sentences_train[0]
first_vector_head = embeddings_train[0][:10]
print(f"Review text: {first_title}")
print(f"Embedding vector: {first_vector_head}")


Review text: Global and Regional Beta-Carotene Market Research 2020 Report | Growth Forecast 2025 key players! – DSM – BASF – Allied Biotech – Chr Hansen – LYCORED
Embedding vector: [-2.1059608, 1.4654341, 1.3921894, 1.8436499, -2.7594426, 1.2028096, 1.6470028, 0.5763277, 0.11021129, 0.7930053]


# 4. Train a classifier using the training set

In [23]:
# Classifier building blocks from scikit-learn
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Standardise the embedding features, then classify with a support vector
# machine. class_weight='balanced' asks scikit-learn to weight each class
# inversely to its frequency in the training labels, so rarer analyst ranks
# are not drowned out by the common ones.
feature_scaler = StandardScaler()
svm_model = SVC(class_weight='balanced')
svm_classifier = make_pipeline(feature_scaler, svm_model)

# Fit the whole pipeline on the training embeddings and their labels
svm_classifier.fit(embeddings_train, labels_train)


In [24]:
# Mean accuracy of the fitted pipeline on the held-out test embeddings
score = svm_classifier.score(embeddings_test, labels_test)

# Report it as a percentage
accuracy_percent = 100*score
print(f"Validation accuracy on Large is {accuracy_percent}%!")


Validation accuracy on Large is 66.66666666666666%!


Validation accuracy on Large is 66.7%!
This was a small-scale example, meant as a proof of concept and designed to illustrate how you can build a custom classifier quickly using a small amount of labelled data and Cohere's embeddings. Increase the number of training examples to achieve better performance on this task.

In [29]:
import cohere
from cohere.classify import Example

# Labelled customer-support examples, grouped by the intent they illustrate
support_examples_by_label = {
    "Shipping and handling policy": [
        "Do you offer same day shipping?",
        "Can you ship to Italy?",
        "How long does shipping take?",
        "Can I buy online and pick up in store?",
        "What are your shipping options?",
    ],
    "Start return or exchange": [
        "My order arrived damaged, can I get a refund?",
        "You sent me the wrong item",
        "I want to exchange my item for another colour",
        "I ordered something and it wasn’t what I expected. Can I return it?",
        "What’s your return policy?",
    ],
    "Track order": [
        "Where’s my package?",
        "When will my order arrive?",
        "What’s my shipping number?",
        "Which carrier is my package with?",
        "Is my package delayed?",
    ],
}

# Flatten the mapping into Example objects, keeping the label order above
support_examples = [
    Example(text, label)
    for label, texts in support_examples_by_label.items()
    for text in texts
]

# Queries to classify against the examples
support_queries = [
    "Am I still able to return my order?",
    "When can I expect my package?",
]

response = co.classify(
    model='medium',
    inputs=support_queries,
    examples=support_examples,
)
print('The confidence levels of the labels are: {}'.format(response.classifications))


The confidence levels of the labels are: [cohere.Classification {
	input: Am I still able to return my order?
	prediction: Start return or exchange
	confidence: [cohere.Confidence {
	label: Shipping and handling policy
	confidence: 0.22736539
}, cohere.Confidence {
	label: Start return or exchange
	confidence: 0.60092175
}, cohere.Confidence {
	label: Track order
	confidence: 0.17171283
}]
}, cohere.Classification {
	input: When can I expect my package?
	prediction: Track order
	confidence: [cohere.Confidence {
	label: Shipping and handling policy
	confidence: 0.24907826
}, cohere.Confidence {
	label: Start return or exchange
	confidence: 0.27320468
}, cohere.Confidence {
	label: Track order
	confidence: 0.47771704
}]
}]


In [3]:
import cohere
from cohere.classify import Example

# Labelled short posts, grouped by topic
topic_examples_by_label = {
    "Business news": [
        "Elon Musk says Twitter Blue subscribers should be able to pay with dogecoin",
        "Probability of a US recession in the next 12 months, via WSJ",
        "European futures slide",
        "NASDAQ rises 2% to ATH",
        "FTX Founder is one of the world\'s richest crypto billionaires, with a fortune valued at $20 billion.",
    ],
    "Cooking": [
        "Sweet Potato Macaroni Cheese is #RecipeOfTheDay, and I’m very happy about it!",
        "3-Ingredient Slow Cooker recipes",
        "This is by far the BEST biscuit recipe I’ve ever tried",
        "Baking my first loaf of banana bread...",
        "From the queen of Italian cooking, this is one of the most iconic tomato sauce recipes ever",
    ],
    "Arts & Culture": [
        "I’ve actually read this book and it was extremely insightful. A quick read as well and available as a free audiobook through many libraries.",
        "Today’s Daily Cartoon",
        "Get a glimpse of the stage adaptation of Hayao Miyazaki’s 2001 Oscar-winning animated feature Spirited Away",
        "The #Banksy Exhibit in Cambridge, MA is absolutely terrific.",
        "“A Whisper in Time” large abstract paining 48’ x 48’",
    ],
}

# Flatten the mapping into Example objects, keeping the label order above
topic_examples = [
    Example(text, label)
    for label, texts in topic_examples_by_label.items()
    for text in texts
]

# Posts to classify against the examples
topic_queries = [
    "Any decent trader spends 90% of their efforts exploring how they could be wrong. That should apply to everyone’s decision making.",
    "Congratulations to all of this years @OliverAwards winners!",
]

response = co.classify(
    model='large',
    inputs=topic_queries,
    examples=topic_examples,
)
print('The confidence levels of the labels are: {}'.format(response.classifications))


The confidence levels of the labels are: [cohere.Classification {
	input: Any decent trader spends 90% of their efforts exploring how they could be wrong. That should apply to everyone’s decision making.
	prediction: Business news
	confidence: [cohere.Confidence {
	label: Business news
	confidence: 0.8030835
}, cohere.Confidence {
	label: Cooking
	confidence: 0.0006364698
}, cohere.Confidence {
	label: Arts & Culture
	confidence: 0.19628008
}]
}, cohere.Classification {
	input: Congratulations to all of this years @OliverAwards winners!
	prediction: Arts & Culture
	confidence: [cohere.Confidence {
	label: Business news
	confidence: 0.19653323
}, cohere.Confidence {
	label: Cooking
	confidence: 0.0039493362
}, cohere.Confidence {
	label: Arts & Culture
	confidence: 0.79951745
}]
}]
