In [1]:
%pip install kneed

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [21]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from kMeans import kmeans
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN, MeanShift, estimate_bandwidth, SpectralClustering
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator

I. Clustering messages in spam and ham
II. Recognize the emotion from a text

1. Load data

In [3]:
# load some data
def load_data_spam(fileName):
    file = pd.read_csv(fileName, encoding='ISO-8859-1')
    inputs = file['emailText']
    outputs = file['emailType']
    labelNames = list(set(outputs))
    return inputs, outputs, labelNames

def load_data_emotions(filename):
    file = pd.read_csv(filename, encoding = "ISO-8859-1")
    input_data = file["Text"]
    output_data = file["Sentiment"]
    label_names = list(set(output_data))
    return input_data, output_data, label_names

crtDir =  os.getcwd()

# fileName = os.path.join(crtDir, 'data', 'spam.csv')
# inputs, labels, labelNames = load_data_spam(fileName)

fileName = os.path.join(crtDir, 'data', 'reviews_mixed.csv')
inputs, labels, labelNames = load_data_emotions(fileName)

print(inputs[:2])
print(labelNames[:2])

0    The rooms are extremely small, practically onl...
1                              Room safe did not work.
Name: Text, dtype: object
['positive', 'negative']


2. Split into train and test

In [4]:
def train_and_test(input_data, output_data):
    indexes = [i for i in range(len(input_data))]
    train_sample = np.random.choice(indexes, int(0.8 * len(input_data)), replace=False)
    test_sample = [i for i in indexes if i not in train_sample]
    train_inputs = [input_data[i] for i in train_sample]
    train_outputs = [output_data[i] for i in train_sample]
    test_inputs = [input_data[i] for i in test_sample]
    test_outputs = [output_data[i] for i in test_sample]
    return train_inputs, train_outputs, test_inputs, test_outputs

trainInputs, trainOutputs, testInputs, testOutputs = train_and_test(inputs, labels)
print(trainInputs[:3])

['Rooms quite small for price, especially if you use a travel cot.', 'The bathroom functioned o.k.', 'Loved the elegant art deco ambience.']


3. Extract caracteristics

In [5]:
def extract_features_bag_of_words(train_inputs, test_inputs):  # BAG OF WORDS
    vec = CountVectorizer()
    train_features = vec.fit_transform(train_inputs)
    test_features = vec.transform(test_inputs)
    # vocabulaty size
    print("vocab size: ", len(vec.vocabulary_),  " words")
    # no of emails (Samples)
    print("traindata size: ", len(train_inputs), " texts")
    # shape of feature matrix
    print("trainFeatures shape: ", train_features.shape)
    # vocabbulary from the train data 
    print('some words of the vocab: ', vec.get_feature_names_out()[-20:])
    # extracted features
    print('some features: ', train_features.toarray()[:3])
    return vec, train_features.toarray(), test_features.toarray()


def extract_features_tf_idf(train_inputs, test_inputs, max_features):  # TF-IDF
    vec = TfidfVectorizer(max_features=max_features)
    train_features = vec.fit_transform(train_inputs)
    test_features = vec.transform(test_inputs)
    # vocabbulary from the train data 
    print('vocab: ', vec.get_feature_names_out()[:10])
    # extracted features
    print('features: ', train_features.toarray()[:3])
    return vec, train_features.toarray(), test_features.toarray()


def extract_features_hashing(train_inputs, test_inputs, n_features):  # HASHING - bag of words that uses hash codes
    vec = HashingVectorizer(n_features=n_features)
    train_features = vec.fit_transform(train_inputs)
    test_features = vec.transform(test_inputs)
    return vec, train_features.toarray(), test_features.toarray()

4. Train and test model

In [6]:
def predict_by_tool(train_features, test_features, label_names, classes):
    unsupervisedClassifier = KMeans(n_clusters=classes, random_state=0)
    unsupervisedClassifier.fit(train_features)
    computed_indexes = unsupervisedClassifier.predict(test_features)
    computed_outputs = [label_names[value] for value in computed_indexes]
    return unsupervisedClassifier, computed_outputs


def predict_by_me(train_features, test_features, label_names, classes):
    unsupervisedClassifier = kmeans(n_clusters=classes)
    unsupervisedClassifier.fit(train_features)
    my_centroids, computed_indexes = unsupervisedClassifier.evaluate(test_features)
    computed_outputs = [label_names[value] for value in computed_indexes]
    return computed_outputs, my_centroids, computed_indexes

Alternatives for kmeans:
- Hierarchical Clustering (Agglomerative and Divisive)
- DBSCAN (Density-Based Spatial Clustering of Applications with Noise)
- Mean Shift
- Gaussian Mixture Models (GMM)
- OPTICS (Ordering Points To Identify the Clustering Structure)
- Self-Organizing Maps (SOM)
- BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies)

In [34]:
# with DBSCAN
def predict_with_dbscan(train_features, test_features, label_names):
    nbrs = NearestNeighbors(n_neighbors=4).fit(train_features)
    neigh_dist, _= nbrs.kneighbors(train_features)
    sort_neigh_dist = np.sort(neigh_dist, axis = 0)
    kneedle = KneeLocator(x = range(1, len(neigh_dist)+1), y = sort_neigh_dist, S = 1.0, 
                      curve = "concave", direction = "increasing", online=True)
    clf = DBSCAN(eps=kneedle.knee_y, min_samples=4)
    clf.fit(train_features)
    computed_indexes = clf.fit_predict(test_features)
    computed_outputs = [label_names[value] for value in computed_indexes]
    return computed_outputs

# with MeanShift
def predict_with_meanshift(train_features, test_features, label_names, n_samples):
    bandwith = estimate_bandwidth(train_features, quantile=0.2, n_samples=n_samples)
    print("bandwith: ", bandwith)
    clf = MeanShift(bandwidth=bandwith)
    clf.fit(train_features)
    computed_indexes = clf.predict(test_features)
    print("computed indexes: ", computed_indexes)
    computed_outputs = [label_names[value] for value in computed_indexes]
    return computed_outputs

# with SpectralClustering
def predict_with_spectral(train_features, test_features, label_names, classes):
    clf = SpectralClustering(n_clusters=classes, random_state=0)
    clf.fit(train_features)
    computed_indexes = clf.fit_predict(test_features)
    computed_outputs = [label_names[value] for value in computed_indexes]
    return computed_outputs

5. Main function

In [37]:
print("\nSPAM")
crtDir =  os.getcwd()
fileName = os.path.join(crtDir, 'data', 'spam.csv')
inputs, outputs, labelsNames = load_data_spam(fileName)
trainInputs, trainOutputs, testInputs, testOutputs = train_and_test(inputs, outputs)
# 1. BAG OF WORDS
_, trainFeatures, testFeatures = extract_features_bag_of_words(trainInputs, testInputs)
# 2. TF-IDF
# _, trainFeatures, testFeatures = extract_features_tf_idf(trainInputs, testInputs, 50)
# 3. HASHING
# _, trainFeatures, testFeatures = extract_features_hashing(trainInputs, testInputs, 2 ** 10)
_, computedOutputs = predict_by_tool(trainFeatures, testFeatures, labelsNames, len(set(labelsNames)))
myComputedOutputs, centroids, computedIndexes = predict_by_me(trainFeatures, testFeatures, labelsNames, len(set(labelsNames)))

accuracyByTool = accuracy_score(testOutputs, computedOutputs)
accuracyByMe = accuracy_score(testOutputs, myComputedOutputs)

print('Accuracy score by tool:', accuracyByTool)
print('Accuracy score by me:', accuracyByMe)
print('Output computed by tool:  ', computedOutputs)
print('Output computed by me:    ', myComputedOutputs)
print('Real output:              ', testOutputs)

print("\nEMOTIONS")
crtDir =  os.getcwd()
fileName = os.path.join(crtDir, 'data', 'reviews_mixed.csv')
inputs, outputs, labelsNames = load_data_emotions(fileName)
trainInputs, trainOutputs, testInputs, testOutputs = train_and_test(inputs, outputs)
# 1. BAG OF WORDS
vec, trainFeatures, testFeatures = extract_features_bag_of_words(trainInputs, testInputs)
# 2. TF-IDF
# vec, trainFeatures, testFeatures = extract_features_tf_idf(trainInputs, testInputs, 150)
# 3. HASHING
# vec, trainFeatures, testFeatures = extract_features_hashing(trainInputs, testInputs, 2 ** 10)
classifier, computedOutputs = predict_by_tool(trainFeatures, testFeatures, labelsNames, len(set(labelsNames)))
myComputedOutputs, centroids, computedIndexes = predict_by_me(trainFeatures, testFeatures, labelsNames, len(set(labelsNames)))

accuracyByTool = accuracy_score(testOutputs, computedOutputs)
accuracyByMe = accuracy_score(testOutputs, myComputedOutputs)

print('Accuracy score by tool:', accuracyByTool)
print('Accuracy score by me:', accuracyByMe)
print('Output computed by tool:  ', computedOutputs)
print('Output computed by me:                   ', myComputedOutputs)
print('Real output:                             ', testOutputs)

# computedOutputs = predict_with_meanshift(trainFeatures, testFeatures, labelsNames, 100)
# accuracyByMeanShift = accuracy_score(testOutputs, computedOutputs)
# print('Output computed by Mean Shift:       ', computedOutputs)

# computedOutputs = predict_with_dbscan(trainFeatures, testFeatures, labelsNames)
# accuracyByDBSCAN = accuracy_score(testOutputs, computedOutputs)
# print('Output computed by DBSCAN:  ', computedOutputs)

computedOutputs = predict_with_spectral(trainFeatures, testFeatures, labelsNames, len(set(labelsNames)))
accuracyBySpectral = accuracy_score(testOutputs, computedOutputs)
print('Output computed by Spectral Clustering:  ', computedOutputs)
print('Accuracy score by Spectral Clustering:', accuracyBySpectral)
# print('Accuracy score by DBSCAN:', accuracyByDBSCAN)
# print('Accuracy score by Mean Shift:', accuracyByMeanShift)


SPAM
vocab size:  7729  words
traindata size:  4457  texts
trainFeatures shape:  (4457, 7729)
some words of the vocab:  ['zouk' 'zyada' 'åè10' 'åð' 'åòharry' 'åòit' 'åômorrow' 'åôrents' 'ì_'
 'ì¼1' 'ìä' 'ìï' 'ó_' 'û_' 'ûªm' 'ûªt' 'ûªve' 'ûï' 'ûïharry' 'ûò']
some features:  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Accuracy score by tool: 0.21434977578475337
Accuracy score by me: 0.8600896860986547
Output computed by tool:   ['spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam'

What is the feeling conveyed by the following message: "By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."  ?

In [29]:
%pip install azure-ai-textanalytics==5.2.0

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [30]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

In [31]:
endpoint = os.environ["LANGUAGE_ENDPOINT"]
key = os.environ["LANGUAGE_KEY"]

client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

In [32]:
# with azure client
documents = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."]
result = client.analyze_sentiment(documents, show_opinion_mining=True)

doc = result[0]

print(f"Document text: {documents[0]}")
print("With azure client:")
print(f"Overall sentiment: {doc.sentiment}")

# with kmeans
doc_features = vec.transform(documents)
idx = classifier.predict(doc_features)
rez = [labelNames[idx[0]]]
print("With kmeans algorithm:")
print(f"Overall sentiment: {rez}")

Document text: By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.
With azure client:
Overall sentiment: positive
With kmeans algorithm:
Overall sentiment: ['positive']
