In [89]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

import spacy
import gensim.downloader as api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from collections import Counter

from tqdm.notebook import tqdm

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
print(df_train.info())
print(df_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21057 entries, 0 to 21056
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   21057 non-null  int64 
 1   text    21057 non-null  object
dtypes: int64(1), object(1)
memory usage: 329.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8943 entries, 0 to 8942
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   _id     8943 non-null   int64 
 1   text    8943 non-null   object
dtypes: int64(1), object(1)
memory usage: 139.9+ KB
None


In [4]:
nlp = spacy.load("en_core_web_lg")

In [5]:
docs_train = [[tkn.lemma_.lower() for tkn in nlp(doc) if (not tkn.is_stop) & (tkn.is_alpha) & (not tkn.is_oov)] 
              for doc in tqdm(df_train.text)]

  0%|          | 0/21057 [00:00<?, ?it/s]

In [34]:
df_train['Tokens'] = docs_train
df_train.head()

Unnamed: 0,label,text,Tokens
0,0,Batch #5\n\nAppearance: Pours a slightly hazy ...,"[batch, appearance, pour, slightly, hazy, aubu..."
1,0,Murky peach color with off-white head. Aroma h...,"[murky, peach, color, white, head, aroma, tart..."
2,0,Can poured into a Spiegelau IPA glass\n\nA: Po...,"[pour, ipa, glass, pour, golden, amber, kinda,..."
3,0,A big thanks to Jeff for this one. 750ml cappe...,"[big, thank, jeff, cap, bottle, brooklyn, brew..."
4,0,On tap into a shaker pint.\n\nAppearance is go...,"[tap, shaker, pint, appearance, golden, amber,..."


In [None]:
# docs_test = [[tkn.lemma_.lower() for tkn in nlp(doc) if (not tkn.is_stop) & (tkn.is_alpha) & (not tkn.is_oov)] 
#              for doc in tqdm(df_test.text)]

In [None]:
# Count of typos in the train dataset
typos_train = [[tkn.text for tkn in nlp(doc) if (tkn.is_oov) & (tkn.is_alpha)] for doc in tqdm(df_train.text)]

train_typos_list = []
for lst in typos_train:
    for typo in lst:
        train_typos_list.append(typo)

print(len(train_typos_list))

train_typo_count = Counter(train_typos_list)
print(train_typo_count)

df_train_typos = pd.DataFrame.from_dict(data=train_typo_count, orient='index', columns=['Count'])
df_train_typos.to_csv('train_typos.csv')

  0%|          | 0/21057 [00:00<?, ?it/s]

In [None]:
# Count of typos in the test dataset
typos_test = [[tkn.text for tkn in nlp(doc) if (tkn.is_oov) & (tkn.is_alpha)] for doc in tqdm(df_test.text)]

test_typos_list = []
for lst in typos_test:
    for typo in lst:
        test_typos_list.append(typo)

print(len(test_typos_list))

test_typo_count = Counter(test_typos_list)
print(test_typo_count)

df_test_typos = pd.DataFrame.from_dict(data=test_typo_count, orient='index', columns=['Count'])
df_test_typos.to_csv('test_typos.csv')

  0%|          | 0/8943 [00:00<?, ?it/s]

In [36]:
api.info()['models'].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [169]:
wv = api.load('glove-wiki-gigaword-300')



In [170]:
def word_vectoriser(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)

    counter = 1
    for word in sent:
        if word in wv:
            counter += 1
            wv_res += wv[word]

    wv_res = wv_res/counter
    return wv_res

In [171]:
word2vec = []
for doc_tkn in docs_train:
    try:
        word2vec.append(word_vectoriser(doc_tkn))
    except:
        continue

In [177]:
def split_data(X, y):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    return X_train, X_valid, y_train, y_valid

In [178]:
def naive_bayes(X_train, X_valid, y_train, y_valid):    
    nb = MultinomialNB()
    nb.fit(X_train, y_train)
    y_pred_nb = nb.predict(X_valid)
    print(f'Accuracy of Naive Bayes: {accuracy_score(y_valid, y_pred_nb)}')
    print(f'Classification report:\n{classification_report(y_valid, y_pred_nb)}')

In [179]:
def svc(X_train, X_valid, y_train, y_valid):
    svc = SVC(random_state=42)
    svc.fit(X_train, y_train)
    y_pred_svc = svc.predict(X_valid)
    print(f'Accuracy of SVC: {accuracy_score(y_valid, y_pred_svc)}')
    print(f'Classification report:\n{classification_report(y_valid, y_pred_svc)}')

In [180]:
def lr(X_train, X_valid, y_train, y_valid):
    lr = LogisticRegression(random_state=42, solver='newton-cg')
    lr.fit(X_train, y_train)
    y_pred_lr = lr.predict(X_valid)
    print(f'Accuracy of Logistic Regression: {accuracy_score(y_valid, y_pred_lr)}')
    print(f'Classification report:\n{classification_report(y_valid, y_pred_lr)}')

In [213]:
vectorizer = TfidfVectorizer()
tfidf_X_train, tfidf_X_valid, y_train, y_valid = split_data(df_train.Tokens, df_train.label)
tfidf_train = vectorizer.fit_transform(tfidf_X_train.apply(lambda token: " ".join(token)))
tfidf_valid = vectorizer.transform(tfidf_X_valid.apply(lambda token: " ".join(token)))
naive_bayes(tfidf_train, tfidf_valid, y_train, y_valid)
svc(tfidf_train, tfidf_valid, y_train, y_valid)
lr(tfidf_train, tfidf_valid, y_train, y_valid)

Accuracy of Naive Bayes: 0.5726495726495726
Classification report:
              precision    recall  f1-score   support

           0       0.61      0.65      0.63      1409
           1       0.45      0.47      0.46      1397
           2       0.67      0.59      0.63      1406

    accuracy                           0.57      4212
   macro avg       0.58      0.57      0.57      4212
weighted avg       0.58      0.57      0.57      4212

Accuracy of SVC: 0.5914055080721747
Classification report:
              precision    recall  f1-score   support

           0       0.63      0.65      0.64      1409
           1       0.47      0.44      0.45      1397
           2       0.67      0.68      0.67      1406

    accuracy                           0.59      4212
   macro avg       0.59      0.59      0.59      4212
weighted avg       0.59      0.59      0.59      4212

Accuracy of Logistic Regression: 0.587369420702754
Classification report:
              precision    recall  f1-

In [224]:
doc2vec_X_train, doc2vec_X_valid, y_train, y_valid = split_data(df_train.Tokens, df_train.label)

X_tagged_docs_train = [TaggedDocument(doc, [i]) for i, doc in enumerate(doc2vec_X_train)]
model = Doc2Vec(X_tagged_docs_train, vector_size=100, window=2, min_count=10, workers=4, epochs=30)
model.build_vocab(X_tagged_docs_train)
model.train(X_tagged_docs_train, total_examples=model.corpus_count, epochs=model.epochs)
doc2vec_X_train = [model.infer_vector(doc.words) for doc in tqdm(X_tagged_docs_train)]

X_tagged_docs_valid = [TaggedDocument(doc, [i]) for i, doc in enumerate(doc2vec_X_valid)]
doc2vec_X_valid = [model.infer_vector(doc.words) for doc in tqdm(X_tagged_docs_valid)]

  0%|          | 0/16845 [00:00<?, ?it/s]

  0%|          | 0/4212 [00:00<?, ?it/s]

In [225]:
svc(doc2vec_X_train, doc2vec_X_valid, y_train, y_valid)
lr(doc2vec_X_train, doc2vec_X_valid, y_train, y_valid)

Accuracy of SVC: 0.5776353276353277
Classification report:
              precision    recall  f1-score   support

           0       0.62      0.68      0.65      1409
           1       0.47      0.35      0.40      1397
           2       0.61      0.70      0.65      1406

    accuracy                           0.58      4212
   macro avg       0.57      0.58      0.57      4212
weighted avg       0.57      0.58      0.57      4212

Accuracy of Logistic Regression: 0.5797720797720798
Classification report:
              precision    recall  f1-score   support

           0       0.61      0.68      0.65      1409
           1       0.49      0.30      0.37      1397
           2       0.59      0.75      0.66      1406

    accuracy                           0.58      4212
   macro avg       0.57      0.58      0.56      4212
weighted avg       0.57      0.58      0.56      4212



In [227]:
from scipy.sparse import hstack
merged_features_train = hstack((tfidf_train, doc2vec_X_train))

merged_features_valid = hstack((tfidf_valid, doc2vec_X_valid))

In [123]:
svc(merged_features_train, merged_features_valid, y_train, y_valid)
lr(merged_features_train, merged_features_valid, y_train, y_valid)

Accuracy of SVC: 0.5610161443494777
Classification report:
              precision    recall  f1-score   support

           0       0.59      0.64      0.62      1409
           1       0.45      0.38      0.41      1397
           2       0.62      0.66      0.64      1406

    accuracy                           0.56      4212
   macro avg       0.55      0.56      0.55      4212
weighted avg       0.55      0.56      0.56      4212

Accuracy of Logistic Regression: 0.592355175688509
Classification report:
              precision    recall  f1-score   support

           0       0.63      0.67      0.65      1409
           1       0.47      0.42      0.45      1397
           2       0.65      0.68      0.67      1406

    accuracy                           0.59      4212
   macro avg       0.59      0.59      0.59      4212
weighted avg       0.59      0.59      0.59      4212



In [165]:
# word2vec = np.array(word2vec)
# doc2vec = np.array(doc2vec)
combined_embeddings = np.concatenate([word2vec, doc2vec], axis=1)

In [166]:
X_train, X_valid, y_train, y_valid = split_data(combined_embeddings, df_train.label)
svc(X_train, X_valid, y_train, y_valid)
lr(X_train, X_valid, y_train, y_valid)

Accuracy of SVC: 0.5738366571699905
Classification report:
              precision    recall  f1-score   support

           0       0.60      0.67      0.63      1409
           1       0.47      0.37      0.42      1397
           2       0.62      0.67      0.65      1406

    accuracy                           0.57      4212
   macro avg       0.56      0.57      0.57      4212
weighted avg       0.56      0.57      0.57      4212

Accuracy of Logistic Regression: 0.5885565052231719
Classification report:
              precision    recall  f1-score   support

           0       0.61      0.68      0.64      1409
           1       0.48      0.40      0.44      1397
           2       0.65      0.68      0.67      1406

    accuracy                           0.59      4212
   macro avg       0.58      0.59      0.58      4212
weighted avg       0.58      0.59      0.58      4212



In [167]:
from joblib import dump
dump(vectorizer, 'TFIDF_vectorizer.joblib')

['TFIDF_vectorizer.joblib']