In [1]:
# import necessary library
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss

In [2]:
df = pd.read_csv('semeval2014.csv')

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   text                     3044 non-null   object
 1   service                  3044 non-null   int64 
 2   food                     3044 non-null   int64 
 3   anecdotes/miscellaneous  3044 non-null   int64 
 4   price                    3044 non-null   int64 
 5   ambience                 3044 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 142.8+ KB


Unnamed: 0,text,service,food,anecdotes/miscellaneous,price,ambience
0,but the staff was so horrible to us,1,0,0,0,0
1,to be completely fair the only redeeming facto...,0,1,1,0,0
2,the food is uniformly exceptional with a very ...,0,1,0,0,0
3,where gabriela personaly greets you and recomm...,1,0,0,0,0
4,for those that go once and dont enjoy it all i...,0,0,1,0,0


# in order for our Scikit-Learn model to accept the training data, convert the text data into TF-IDF representation.

In [3]:
X = df["text"]
y = np.asarray(df[df.columns[1:]])

# initializing TfidfVectorizer
vect = TfidfVectorizer(max_features=3000, max_df=0.85)
# fitting the tf-idf on the given data
vect.fit(X)

# Training and Testing process

In [4]:
# splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# transforming the data
X_train_tfidf = vect.transform(X_train)
X_test_tfidf = vect.transform(X_test)

# Involving Machine Learning algorithm for predicition

# 1. Algorithm involve : KNN Classifier

In [5]:
from skmultilearn.adapt import MLkNN

classifier = MLkNN()
classifier.fit(X_train_tfidf, y_train)

In [6]:
new_sentences = ["I like the food but I hate the place"]
new_sentence_tfidf = vect.transform(new_sentences)

predicted_sentences = classifier.predict(new_sentence_tfidf)
print(predicted_sentences.toarray())

[[0 1 0 0 1]]


In [7]:
predicted = classifier.predict(X_test_tfidf)

print("Accuracy score:", round(accuracy_score(y_test, predicted),2))
print("Hamming Loss:", round(hamming_loss(y_test, predicted), 2))

Accuracy score: 0.48
Hamming Loss: 0.17


# 2. Algorithm involve : Logistic Regression

In [8]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

clf = MultiOutputClassifier(LogisticRegression()).fit(X_train_tfidf, y_train)

In [9]:
prediction = clf.predict(X_test_tfidf)
print(prediction)

[[0 1 0 0 0]
 [0 0 0 0 0]
 [0 0 1 0 0]
 ...
 [1 0 0 0 0]
 [0 0 1 0 0]
 [0 1 0 0 0]]


In [10]:
print('Accuracy Score: ', accuracy_score(y_test, prediction))

Accuracy Score:  0.47155361050328226


test the prediction and the actual, we can try to modify the value

In [11]:
prediction[0]

array([0, 1, 0, 0, 0], dtype=int64)

In [12]:
y_test[0]

array([0, 1, 0, 0, 0], dtype=int64)

test again with different value

In [13]:
prediction[9]

array([0, 1, 0, 0, 0], dtype=int64)

In [14]:
y_test[9]

array([1, 1, 0, 0, 0], dtype=int64)

In [15]:
new_sentences1 = ["I like the food but I hate the place"]
new_sentence1_tfidf = vect.transform(new_sentences1)
predicted_sentences1 = clf.predict(new_sentence_tfidf)
print(predicted_sentences1)

[[0 1 0 0 0]]


There are shortcomings in the accuracy score for a multilabel prediction assessment. For a sentence to receive a high accuracy score, all labels must appear in exactly the right place; otherwise, the sentence would be deemed incorrect.

Given that the label combination is different, the accuracy score would indicate that the prediction was incorrect. Our model has a low metric score because of this.

We must assess the label prediction as opposed to their label combination in order to lessen this issue. Since the Hamming Loss evaluation metric is derived by dividing the fraction of incorrect predictions by the total number of labels, it is a reliable option in this situation. Since Hamming Loss is a loss function, a lower score indicates better performance.

source : kdnuggets.com

In [16]:
print('Hamming Loss: ', round(hamming_loss(y_test, prediction),2))

Hamming Loss:  0.14


0 indicates no wrong prediction and 1 indicates all the prediction is wrong. In our case, with a Multilabel Classifier Hamming Loss model of 0.14, 14% of independent predictions made by our model would be incorrect. This indicates that 14% of the time, each label prediction may be incorrect.
