# Classifiers

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from smart_open import smart_open 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.stem import PorterStemmer

import json
import re
import os


%matplotlib inline

[nltk_data] Downloading package punkt to /home/ze/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


From the "yelp_academic_dataset_business.json" dataset keep businesses from the city of Toronto with the categories "Beauty & Spas", "Shopping" and "Bars", which they have at least 10 reviews.

For each business on the list get all the reviews for the business from the "yelp_academic_dataset_review.json" dataset and merge them in a great text for business.

In [2]:
business = []

with open("yelp_academic_dataset_business.json", encoding = "utf8", errors = "ignore") as f:
    for line in f:
        get_line = json.loads(line)
        business_id = get_line["business_id"]
        cat = get_line["categories"]
        # split categories from text to array
        if cat is not None:
            category = cat.split(", ")
        city = get_line["city"]
        count = get_line["review_count"]
        beauty = "Beauty & Spas"
        shopping = "Shopping"
        bars = "Bars"
        if city == "Toronto" and count>=10:                    
            if beauty in category:
                # if Beauty & Spas
                business.append((business_id,beauty))
            elif shopping in category:
                # if Shopping
                business.append((business_id,shopping))
            elif bars in category:
                # if Bars
                business.append((business_id,bars))
               

In [3]:
# get reviews where business in Toronto
only_bs = np.array(business)
check_for_business = list(only_bs[:,0])

reviews = []

with open("yelp_academic_dataset_review.json", encoding = "utf8", errors = "ignore") as f: 
    for line in f:
        get_line = json.loads(line)
        # get business id
        business_id = get_line["business_id"]
        # get text of business
        text = get_line["text"]

        if business_id in check_for_business:
            reviews.append((business_id,text))
                              

In [4]:
# Implementing stemming

# for each business keep all the reviews given together in a text
reviews_per_business = []
ps = PorterStemmer()

for b in range(len(business)):

    # empty text for each business
    text_per_business=""
    for r in range(len(reviews)):
        if business[b][0]==reviews[r][0]:
            
            # remove punctuation
            text = re.sub("[^a-zA-Z]"," ",reviews[r][1])
            # remove tags
            text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
            # remove special characters and digits
            text = re.sub("(\\d|\\W)+"," ",text)
            # lower cases only
            text = text.lower()
            text = word_tokenize(text)
            stem = [ps.stem(w) for w in text]
            # do the stemming
            for w in stem:
                text_per_business+= " " + w
            
    reviews_per_business.append((business[b][0],business[b][1],text_per_business))

In [5]:
# array to dataframe
df = pd.DataFrame(data=reviews_per_business,columns=["business_id","category","text"])
df

Unnamed: 0,business_id,category,text
0,cicPsia8Wj-DNRkmLbD_xg,Bars,consist good as the keg tend to be highlight ...
1,xVXyrTWbG8U3szze-aA7eg,Bars,i would give zero star i came here with a gro...
2,e-tRKAC-q40SqQfAOwYa-A,Beauty & Spas,a bliss experi i highli recommend thi place i...
3,C9keC4mWuXdl2mYFHZXudQ,Shopping,if you re a boy and you want to wear some hot...
4,PFS9kf3U-ZCvpqay3AaNnQ,Shopping,as a countri girl i often find myself miss th...
...,...,...,...
2986,Cesnh6fIsAUO8D4jfGhOIw,Shopping,good taco in the downtown core are hard to co...
2987,lkq6i2x3vUsR7ZNrIFqoIw,Bars,thi use to be my favourit place it wa alway p...
2988,wjqOdj0XJUDOOtU9LjRlWQ,Bars,veri welcom place great setup and super frien...
2989,AqpB2IoLkUupDCuH-hmVdg,Shopping,i can t beleiv i am say thi but i left thi sh...


In [6]:
# get text per business
corpus = df["text"].tolist()

In [7]:
# set 0,1,2 labels for each category
labelEnc = LabelEncoder()
categories = labelEnc.fit_transform(df["category"])
df["label"] = categories
df

Unnamed: 0,business_id,category,text,label
0,cicPsia8Wj-DNRkmLbD_xg,Bars,consist good as the keg tend to be highlight ...,0
1,xVXyrTWbG8U3szze-aA7eg,Bars,i would give zero star i came here with a gro...,0
2,e-tRKAC-q40SqQfAOwYa-A,Beauty & Spas,a bliss experi i highli recommend thi place i...,1
3,C9keC4mWuXdl2mYFHZXudQ,Shopping,if you re a boy and you want to wear some hot...,2
4,PFS9kf3U-ZCvpqay3AaNnQ,Shopping,as a countri girl i often find myself miss th...,2
...,...,...,...,...
2986,Cesnh6fIsAUO8D4jfGhOIw,Shopping,good taco in the downtown core are hard to co...,2
2987,lkq6i2x3vUsR7ZNrIFqoIw,Bars,thi use to be my favourit place it wa alway p...,0
2988,wjqOdj0XJUDOOtU9LjRlWQ,Bars,veri welcom place great setup and super frien...,0
2989,AqpB2IoLkUupDCuH-hmVdg,Shopping,i can t beleiv i am say thi but i left thi sh...,2


In [8]:
# get target labels
target_labels = df["label"].tolist()

##  Get TF-IDF and experiment with Logistic Regression, SVM and K-NN. Using 5-fold cross validation for the evaluation. 

In [9]:
# get true values
y = df.label
y

0       0
1       0
2       1
3       2
4       2
       ..
2986    2
2987    0
2988    0
2989    2
2990    0
Name: label, Length: 2991, dtype: int64

In [10]:
# get text
X = df.text
X

0        consist good as the keg tend to be highlight ...
1        i would give zero star i came here with a gro...
2        a bliss experi i highli recommend thi place i...
3        if you re a boy and you want to wear some hot...
4        as a countri girl i often find myself miss th...
                              ...                        
2986     good taco in the downtown core are hard to co...
2987     thi use to be my favourit place it wa alway p...
2988     veri welcom place great setup and super frien...
2989     i can t beleiv i am say thi but i left thi sh...
2990     we had pizza and fish chip the pizza wa not d...
Name: text, Length: 2991, dtype: object

In [11]:
# do the train_test_split method

from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(X,y)

In [12]:
# Implementing TfidfVectorizer

# The lower the value of a word, the less unique it is to any particular document.

# min_df = 20, ignore terms that appeared in less than 20 documents 
# max_df = 0.7, ignore terms that appear in 70% of the documents

vectorizer = sk_text.TfidfVectorizer(stop_words = 'english',min_df=20,max_df=0.70,max_features = 1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.fit_transform(X_test)

### Logistic Regression

In [13]:
import sklearn.linear_model as linear_model

lr_clf = linear_model.LogisticRegression(solver='lbfgs')
lr_clf.fit(X_train_tfidf, y_train)

y_pred = lr_clf.predict(X_test_tfidf)

# accuracy
accuracy = metrics.accuracy_score(y_test,y_pred)
print("\nAccuracy:",accuracy)

# confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
print("\nConfusion matrix")
print(confusion_matrix)

# average precision score
average_precision_score = metrics.precision_score(y_test,y_pred,average='weighted')
print("\nAverage Precision Score:\t",average_precision_score)

# average recall score
average_recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
print("\nAverage Recall Score:\t",average_recall_score)

# average f1 score
average_f1_score = metrics.f1_score(y_test,y_pred,average='weighted')
print("\nAverage F1 Score:\t",average_f1_score)

# keep a dataframe for all
data_total = []

data_total.append(("Logistic Regression",accuracy,average_precision_score,average_recall_score,average_f1_score))


Accuracy: 0.5788770053475936

Confusion matrix
[[146   2 164]
 [  0  91 121]
 [ 11  17 196]]

Average Precision Score:	 0.7443829583514178

Average Recall Score:	 0.5788770053475936

Average F1 Score:	 0.5864011239486261


In [17]:
# 5-fold cross validation
import sklearn.model_selection as model_selection

scores = model_selection.cross_validate(lr_clf, X_train_tfidf,y_train,
                                    scoring=["precision_weighted","recall_weighted","f1_weighted"],cv=5)

print ("Test precision weighted mean:\t",scores['test_precision_weighted'].mean(),
       "\nTest recall weighted mean:\t",scores['test_recall_weighted'].mean(),
      "\nTest f1 score weighted mean:\t",scores['test_f1_weighted'].mean())

data_total_k_fold = []
data_total_k_fold.append(("Logistic Regression",scores['test_precision_weighted'].mean(),
                         scores['test_recall_weighted'].mean(),scores['test_f1_weighted'].mean()))

Test precision weighted mean:	 0.9644548115688611 
Test recall weighted mean:	 0.9638899936366527 
Test f1 score weighted mean:	 0.9638018390607941


### SVM Classification

In [18]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(X_train_tfidf,y_train)

y_pred = svm_clf.predict(X_test_tfidf)

# accuracy
accuracy = metrics.accuracy_score(y_test,y_pred)
print("\nAccuracy:",accuracy)

# confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
print("\nConfusion matrix")
print(confusion_matrix)

# average precision score
average_precision_score = metrics.precision_score(y_test,y_pred,average='weighted')
print("\nAverage Precision Score:\t",average_precision_score)

# average recall score
average_recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
print("\nAverage Recall Score:\t",average_recall_score)

# average f1 score
average_f1_score = metrics.f1_score(y_test,y_pred,average='weighted')
print("\nAverage F1 Score:\t",average_f1_score)

data_total.append(("SVM Classification",accuracy,average_precision_score,average_recall_score,average_f1_score))


Accuracy: 0.5200534759358288

Confusion matrix
[[115   0 197]
 [  0  70 142]
 [ 17   3 204]]

Average Precision Score:	 0.7476745309941913

Average Recall Score:	 0.5200534759358288

Average F1 Score:	 0.5145950834674139


In [19]:
# 5-fold cross validation

scores = model_selection.cross_validate(svm_clf, X_train_tfidf,y_train,
                                    scoring=["precision_weighted","recall_weighted","f1_weighted"],cv=5)

print ("Test precision weighted mean:\t",scores['test_precision_weighted'].mean(),
       "\nTest recall weighted mean:\t",scores['test_recall_weighted'].mean(),
      "\nTest f1 score weighted mean:\t",scores['test_f1_weighted'].mean())

data_total_k_fold.append(("SVM Classification",scores['test_precision_weighted'].mean(),
                         scores['test_recall_weighted'].mean(),scores['test_f1_weighted'].mean()))

Test precision weighted mean:	 0.963029843818167 
Test recall weighted mean:	 0.9625546850143175 
Test f1 score weighted mean:	 0.9624272735251171


### k-NN Classification

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_tfidf,y_train)

y_pred = knn.predict(X_test_tfidf)

# accuracy
accuracy = metrics.accuracy_score(y_test,y_pred)
print("\nAccuracy:",accuracy)

# confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
print("\nConfusion matrix")
print(confusion_matrix)

# average precision score
average_precision_score = metrics.precision_score(y_test,y_pred,average='weighted')
print("\nAverage Precision Score:\t",average_precision_score)

# average recall score
average_recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
print("\nAverage Recall Score:\t",average_recall_score)

# average f1 score
average_f1_score = metrics.f1_score(y_test,y_pred,average='weighted')
print("\nAverage F1 Score:\t",average_f1_score)

data_total.append(("k-NN Classification",accuracy,average_precision_score,average_recall_score,average_f1_score))


Accuracy: 0.6537433155080213

Confusion matrix
[[273   7  32]
 [ 32 119  61]
 [ 89  38  97]]

Average Precision Score:	 0.6475533477891394

Average Recall Score:	 0.6537433155080213

Average F1 Score:	 0.6423121091432232


In [21]:
# 5-fold cross validation

scores = model_selection.cross_validate(knn, X_train_tfidf,y_train,
                                    scoring=["precision_weighted","recall_weighted","f1_weighted"],cv=5)

print ("Test precision weighted mean:\t",scores['test_precision_weighted'].mean(),
       "\nTest recall weighted mean:\t",scores['test_recall_weighted'].mean(),
      "\nTest f1 score weighted mean:\t",scores['test_f1_weighted'].mean())

data_total_k_fold.append(("k-NN Classification",scores['test_precision_weighted'].mean(),
                         scores['test_recall_weighted'].mean(),scores['test_f1_weighted'].mean()))

Test precision weighted mean:	 0.948969703640224 
Test recall weighted mean:	 0.9487352847597836 
Test f1 score weighted mean:	 0.9482673034553715


### Classificator's Results

In [22]:
data_total = pd.DataFrame(data=data_total,columns=["Classificator","Accuracy","Average Precision Score",
                                                   "Average Recall Score","Average F1 Score"])
data_total

Unnamed: 0,Classificator,Accuracy,Average Precision Score,Average Recall Score,Average F1 Score
0,Logistic Regression,0.578877,0.744383,0.578877,0.586401
1,SVM Classification,0.520053,0.747675,0.520053,0.514595
2,k-NN Classification,0.653743,0.647553,0.653743,0.642312


We implemented a better stemming (in addition with clustering-previous exercise) for our data so for the results, the overall accuracy is high -Logistic Regression and SVM ~0.74. Recall, is low and only k-NN seems to be better for applying classification, and f1 score is low as expected (precision-recall are low)

As comparison with clustering, clustering was better, only Agglomerative - Single precision and recall was 0.4203 and 0.4215 respectively. K-means, Agglomerative - Complete, Agglomerative - Average, Agglomerative - Ward had values close to 1 (>=0.81).

### 5-fold cross validation

In [23]:
data_total_k_fold = pd.DataFrame(data=data_total_k_fold,
                                 columns=["5-fold cross validation","Test Precicion Weighted",
                                          "Test Recall Weighted","Test F1 Weighted"])

data_total_k_fold

Unnamed: 0,5-fold cross validation,Test Precicion Weighted,Test Recall Weighted,Test F1 Weighted
0,Logistic Regression,0.964455,0.96389,0.963802
1,SVM Classification,0.96303,0.962555,0.962427
2,k-NN Classification,0.94897,0.948735,0.948267


Now for 5-fold cross validation, all precision, recall and f1 score are high enough, it is very good result.

## Same implementation as before but now extracting features using Word Emdendings of Google.

In [24]:
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

/home/ze/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [25]:
g_model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)  

In [26]:
import string as string

X_train_nltk = []
y_train_nltk = []
for x,y in zip(X_train,y_train):
    wt = word_tokenize(x.lower())
    doc = [w for w in wt if w not in string.punctuation]
    if len(doc) == 0: continue
    X_train_nltk.append(doc)
    y_train_nltk.append(y)

In [27]:
from nltk.corpus import stopwords
nltk.download('stopwords')
english_stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/ze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
X_test_nltk = []
y_test_nltk = []

for x,y in zip(X_test,y_test):
    wt = word_tokenize(x.lower())
    doc = [w for w in wt if (w not in english_stop_words) and (w not in string.punctuation)]
    if len(doc) == 0: continue    
    X_test_nltk.append(doc)
    y_test_nltk.append(y)

### Transforming the train and test data

In [29]:
X_train_gmodel = []
for x in X_train_nltk:
    vx = np.zeros(300)
    length = 0
    for w in x: 
        if w in g_model.wv:
            length += 1
            vx += g_model[w]
    if length != 0: vx /= length
    X_train_gmodel.append(vx)

  if w in g_model.wv:


In [30]:
X_test_gmodel = []
for x in X_test_nltk:
    vx = np.zeros(300)
    length = 0
    for w in x: 
        if (w not in g_model.wv): continue
        length += 1
        vx += g_model.wv[w]
    if length != 0: vx /= length
    X_test_gmodel.append(vx)

  if (w not in g_model.wv): continue
  vx += g_model.wv[w]


### Logistic Regression

In [31]:
lr_clf = linear_model.LogisticRegression(solver='lbfgs')
lr_clf.fit(X_train_gmodel, y_train_nltk)

y_pred = lr_clf.predict(X_test_gmodel)

# accuracy
accuracy = metrics.accuracy_score(y_test,y_pred)
print("\nAccuracy:",accuracy)

# confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
print("\nConfusion matrix")
print(confusion_matrix)

# average precision score
average_precision_score = metrics.precision_score(y_test,y_pred,average='weighted')
print("\nAverage Precision Score:\t",average_precision_score)

# average recall score
average_recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
print("\nAverage Recall Score:\t",average_recall_score)

# average f1 score
average_f1_score = metrics.f1_score(y_test,y_pred,average='weighted')
print("\nAverage F1 Score:\t",average_f1_score)

# keep a dataframe for all
data_total_nltk = []

data_total_nltk.append(("Logistic Regression",accuracy,average_precision_score,average_recall_score,average_f1_score))


Accuracy: 0.9264705882352942

Confusion matrix
[[312   0   0]
 [  8 189  15]
 [ 32   0 192]]

Average Precision Score:	 0.9309004953041223

Average Recall Score:	 0.9264705882352942

Average F1 Score:	 0.9259602996343181


In [32]:
# 5-fold cross validation

scores = model_selection.cross_validate(lr_clf, X_train_tfidf,y_train,
                                    scoring=["precision_weighted","recall_weighted","f1_weighted"],cv=5)

print ("Test precision weighted mean:\t",scores['test_precision_weighted'].mean(),
       "\nTest recall weighted mean:\t",scores['test_recall_weighted'].mean(),
      "\nTest f1 score weighted mean:\t",scores['test_f1_weighted'].mean())

data_total_k_fold_nltk = []
data_total_k_fold_nltk.append(("Logistic Regression",scores['test_precision_weighted'].mean(),
                         scores['test_recall_weighted'].mean(),scores['test_f1_weighted'].mean()))

Test precision weighted mean:	 0.9644548115688611 
Test recall weighted mean:	 0.9638899936366527 
Test f1 score weighted mean:	 0.9638018390607941


### SVM Classification

In [33]:
svm_clf = svm.SVC()
svm_clf.fit(X_train_gmodel,y_train_nltk)

y_pred = svm_clf.predict(X_test_gmodel)

# accuracy
accuracy = metrics.accuracy_score(y_test,y_pred)
print("\nAccuracy:",accuracy)

# confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
print("\nConfusion matrix")
print(confusion_matrix)

# average precision score
average_precision_score = metrics.precision_score(y_test,y_pred,average='weighted')
print("\nAverage Precision Score:\t",average_precision_score)

# average recall score
average_recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
print("\nAverage Recall Score:\t",average_recall_score)

# average f1 score
average_f1_score = metrics.f1_score(y_test,y_pred,average='weighted')
print("\nAverage F1 Score:\t",average_f1_score)

data_total_nltk.append(("SVM Classification",accuracy,average_precision_score,average_recall_score,average_f1_score))


Accuracy: 0.9491978609625669

Confusion matrix
[[312   0   0]
 [  1 197  14]
 [ 23   0 201]]

Average Precision Score:	 0.9507062021425907

Average Recall Score:	 0.9491978609625669

Average F1 Score:	 0.9489172623651538


In [34]:
# 5-fold cross validation

scores = model_selection.cross_validate(svm_clf, X_train_tfidf,y_train,
                                    scoring=["precision_weighted","recall_weighted","f1_weighted"],cv=5)

print ("Test precision weighted mean:\t",scores['test_precision_weighted'].mean(),
       "\nTest recall weighted mean:\t",scores['test_recall_weighted'].mean(),
      "\nTest f1 score weighted mean:\t",scores['test_f1_weighted'].mean())

data_total_k_fold_nltk.append(("SVM Classification",scores['test_precision_weighted'].mean(),
                         scores['test_recall_weighted'].mean(),scores['test_f1_weighted'].mean()))

Test precision weighted mean:	 0.963029843818167 
Test recall weighted mean:	 0.9625546850143175 
Test f1 score weighted mean:	 0.9624272735251171


### k-NN Classification

In [35]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_gmodel,y_train_nltk)

y_pred = knn.predict(X_test_gmodel)

# accuracy
accuracy = metrics.accuracy_score(y_test,y_pred)
print("\nAccuracy:",accuracy)

# confusion matrix
confusion_matrix = metrics.confusion_matrix(y_test,y_pred)
print("\nConfusion matrix")
print(confusion_matrix)

# average precision score
average_precision_score = metrics.precision_score(y_test,y_pred,average='weighted')
print("\nAverage Precision Score:\t",average_precision_score)

# average recall score
average_recall_score = metrics.recall_score(y_test,y_pred,average='weighted')
print("\nAverage Recall Score:\t",average_recall_score)

# average f1 score
average_f1_score = metrics.f1_score(y_test,y_pred,average='weighted')
print("\nAverage F1 Score:\t",average_f1_score)

data_total_nltk.append(("k-NN Classification",accuracy,average_precision_score,average_recall_score,average_f1_score))


Accuracy: 0.9398395721925134

Confusion matrix
[[312   0   0]
 [  0 196  16]
 [ 28   1 195]]

Average Precision Score:	 0.9415026179551339

Average Recall Score:	 0.9398395721925134

Average F1 Score:	 0.939327643394975


In [36]:
# 5-fold cross validation

scores = model_selection.cross_validate(knn, X_train_tfidf,y_train,
                                    scoring=["precision_weighted","recall_weighted","f1_weighted"],cv=5)

print ("Test precision weighted mean:\t",scores['test_precision_weighted'].mean(),
       "\nTest recall weighted mean:\t",scores['test_recall_weighted'].mean(),
      "\nTest f1 score weighted mean:\t",scores['test_f1_weighted'].mean())

data_total_k_fold_nltk.append(("k-NN Classification",scores['test_precision_weighted'].mean(),
                         scores['test_recall_weighted'].mean(),scores['test_f1_weighted'].mean()))

Test precision weighted mean:	 0.948969703640224 
Test recall weighted mean:	 0.9487352847597836 
Test f1 score weighted mean:	 0.9482673034553715


### Classificator's Results - Word Embeddigs Google (WEG)

In [37]:
data_total_nltk = pd.DataFrame(data=data_total_nltk,columns=["Classificator","Accuracy","Average Precision Score",
                                                   "Average Recall Score","Average F1 Score"])
data_total_nltk

Unnamed: 0,Classificator,Accuracy,Average Precision Score,Average Recall Score,Average F1 Score
0,Logistic Regression,0.926471,0.9309,0.926471,0.92596
1,SVM Classification,0.949198,0.950706,0.949198,0.948917
2,k-NN Classification,0.93984,0.941503,0.93984,0.939328


### 5-fold cross validation - Word Embeddigs Google

In [38]:
data_total_k_fold_nltk = pd.DataFrame(data=data_total_k_fold_nltk,
                                 columns=["5-fold cross validation","Test Precicion Weighted",
                                          "Test Recall Weighted","Test F1 Weighted"])

data_total_k_fold_nltk

Unnamed: 0,5-fold cross validation,Test Precicion Weighted,Test Recall Weighted,Test F1 Weighted
0,Logistic Regression,0.964455,0.96389,0.963802
1,SVM Classification,0.96303,0.962555,0.962427
2,k-NN Classification,0.94897,0.948735,0.948267


### Classificator's Results - Without WEG

In [39]:
data_total

Unnamed: 0,Classificator,Accuracy,Average Precision Score,Average Recall Score,Average F1 Score
0,Logistic Regression,0.578877,0.744383,0.578877,0.586401
1,SVM Classification,0.520053,0.747675,0.520053,0.514595
2,k-NN Classification,0.653743,0.647553,0.653743,0.642312


### 5-fold cross validation - Without WEG

In [40]:
data_total_k_fold

Unnamed: 0,5-fold cross validation,Test Precicion Weighted,Test Recall Weighted,Test F1 Weighted
0,Logistic Regression,0.964455,0.96389,0.963802
1,SVM Classification,0.96303,0.962555,0.962427
2,k-NN Classification,0.94897,0.948735,0.948267


As expected, the results using word embeddings of Google, are better and they have close value to 1, all metrics, accuracy, precicion, recall and f1 score are high enough. 
With 5-fold cross validation we are getting same results.