Incarcarea datelor de input

In [1]:
import csv
import os

crtDir =  os.getcwd()
fileName = os.path.join(crtDir, 'data', 'reviews_mixed.csv')

data = []
with open(fileName) as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            dataNames = row
        else:
            data.append(row)
        line_count += 1

inputs = [data[i][0] for i in range(len(data))][:100]
outputs = [data[i][1] for i in range(len(data))][:100]
labelNames = list(set(outputs))

print(inputs[:2])
print(labelNames[:2])

['The rooms are extremely small, practically only a bed.', 'Room safe did not work.']
['negative', 'positive']


Impartirea datelor in trainData and testData

In [2]:
import numpy as np

np.random.seed(5) #for generate random numbers
noSamples = len(inputs) #number of inputs
indexes = [i for i in range(noSamples)] #select random indexes (80%)
trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace = False)
testSample = [i for i in indexes  if not i in trainSample] #the other indexes

# split into train data and test data 
# 80% train data, 20% test data
trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [inputs[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]

print(trainInputs[:3])

['Just to give you an idea: the shutters of the windows were not working, did not go neither up or down - just hanging down only one side and the other up....', 'and hip and CLEAN!', "Toilet paper wasn't replaced everyday!"]


Extract the features

In [3]:
# Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# 1. tokenizare cuvinte (by spaces and punctuatie)
# 2. atribure index per cuvant unic
# 3. vect frecv per cuvant
# 4. normalizare
# 5. stop-words
vectorizer = CountVectorizer() # into reprezentare numerica - Bag of Words

# transform the train and test data
trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.transform(testInputs)

# vocabulary size - unique words
print("vocab size: ", len(vectorizer.vocabulary_),  " words")
# no of messages (Samples)
print("traindata size: ", len(trainInputs), " messages")
# shape of feature matrix
print("trainFeatures shape: ", trainFeatures.shape)

# vocabbulary from the train data 
print('some words of the vocab: ', vectorizer.get_feature_names_out()[-20:])
# extracted features
print('some features: ', trainFeatures.toarray()[:3])

vocab size:  341  words
traindata size:  80  messages
trainFeatures shape:  (80, 341)
some words of the vocab:  ['was' 'wasn' 'water' 'we' 'wear' 'well' 'were' 'wet' 'which' 'whole'
 'window' 'windows' 'winter' 'with' 'work' 'working' 'workout' 'would'
 'you' 'your']
some features:  [[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Train the model (clustering)

In [4]:
from sklearn.cluster import KMeans

#Cluster - organizeaza obiectele similare in grupuri
# - se identif pe baza vectorului de frecventa (MA)
# K-means
# centroid - centrul geometric al unui cluster (o prop)
# calculez pt fiecare punct distanta euclidiana fata de centroid
# update centroizi si se repeta pana se converge la solutie

unsupervisedClassifier = KMeans(n_clusters=2, random_state=0) # n_clusters - nr clustere
unsupervisedClassifier.fit(trainFeatures)

Test the model

In [5]:
computedTestIndexes = unsupervisedClassifier.predict(testFeatures)
computedTestOutputs = [labelNames[value] for value in computedTestIndexes]
for i in range(0, len(testInputs)):
    print(testInputs[i], " -> ", computedTestOutputs[i])

The bed is very comfortable.  ->  negative
Very spacious rooms, quiet and very comfortable.  ->  negative
Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive  ->  negative
walls seem to have no sound insulation  ->  negative
The building was under renovation,  ->  negative
no elevator might be a challenge for some people  ->  negative
The bed was highly uncomfortable, although the engineer fixed it  ->  negative
bed, smell.  ->  negative
Detest the glass "door" if shower/tub .. with?  ->  negative
this was expected, clean towels and room cleaned every day.  ->  negative
More plug outlets with surge protectors.  ->  negative
Room was very spacious  ->  negative
Roof terrace great  ->  negative
No tea or coffee making facilities in the rooms  ->  negative
the room had aircon and we had earplugs and slept soundly.  ->  negative
Also, when the bright bathroom lights are turned on, it lights up the whole hotel room, shining thru the frosted

Test the performance

In [6]:
from sklearn.metrics import accuracy_score

print("acc: ", accuracy_score(testOutputs, computedTestOutputs))

acc:  0.7


Stabiliti care este sentimentul transmis prin mesajul By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement..

In [7]:
msg = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."]

msg_vectorized = vectorizer.transform(msg).toarray()

prediction = unsupervisedClassifier.predict(msg_vectorized)

# Check the prediction
if prediction[0] == 0:
    print("Negative")
else:
    print("Positive")


Negative


My KMeans

In [8]:
from MyKMeans import MyKMeans

model = MyKMeans(n_clusters=2)
model.fit(trainFeatures)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(trainInputs)

test_tfidf = vectorizer.transform(testInputs)
test_dense = test_tfidf.todense()
cluster_labels = model.predict(test_dense)

for i, label in enumerate(cluster_labels):
    text_string = testInputs[i]
    print(f"Text: '{text_string}' -> {testOutputs[label]}")


Cluster Labels for Test Set:
Text: 'The bed is very comfortable.' -> positive
Text: 'Very spacious rooms, quiet and very comfortable.' -> positive
Text: 'Corridors filthy
Room filthy
Electrical cables in room not safe
Whole building smelly
Shower repulsive' -> positive
Text: 'walls seem to have no sound insulation' -> positive
Text: 'The building was under renovation,' -> positive
Text: 'no elevator might be a challenge for some people' -> positive
Text: 'The bed was highly uncomfortable, although the engineer fixed it' -> positive
Text: 'bed, smell.' -> positive
Text: 'Detest the glass "door" if shower/tub .. with?' -> positive
Text: 'this was expected, clean towels and room cleaned every day.' -> positive
Text: 'More plug outlets with surge protectors.' -> positive
Text: 'Room was very spacious' -> positive
Text: 'Roof terrace great' -> positive
Text: 'No tea or coffee making facilities in the rooms' -> positive
Text: 'the room had aircon and we had earplugs and slept soundly.' -> po

In [10]:
print("acc: ", accuracy_score(testOutputs, computedTestOutputs))

acc:  0.7


In [11]:
msg = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."]

msg_vectorized = vectorizer.transform(msg).toarray()

prediction = model.predict(msg_vectorized)

# Check the prediction
if prediction[0] == 0:
    print("Negative")
else:
    print("Positive")


Negative


Connecting azureClient

In [12]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

os.environ["AZURE_TEXT_ANALYTICS_ENDPOINT"] = "https://cosmin-andrei-ai.cognitiveservices.azure.com/"
os.environ["AZURE_TEXT_ANALYTICS_KEY"] = "4534a1eeaf564f36926149e01f1474fe"


endpoint = os.environ["AZURE_TEXT_ANALYTICS_ENDPOINT"]
key = os.environ["AZURE_TEXT_ANALYTICS_KEY"]

client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

Verificam sentimentul

In [13]:
msg = ["By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement."]

result = client.analyze_sentiment(msg)[0]

print("Sentiment mesaj: {}".format(result.sentiment))

Sentiment mesaj: positive
