## Testing

First, we load the cleaned dataset and split it into training and test sets.

In [129]:
# Load dataset
import pandas as pd
df = pd.read_csv('Datasets/train_cleaned.csv')

# Split dataset into training and testing
from sklearn.model_selection import train_test_split
# X holds the raw post text, y the subreddit name each post came from.
X = df['body']
y = df['subreddit']
# Encode the four subreddit names as integer class ids.
y = y.map({'Toronto': 0, 'London': 1, 'Montreal': 2, 'Paris': 3})
# Series.map turns every label missing from the dict into NaN; fail here with a
# clear message instead of letting NaN targets reach the classifiers.
assert y.notna().all(), "found 'subreddit' values outside the label mapping"
# Fixed seed keeps the split reproducible. X_train / X_test are still raw text;
# the lowercase x_train / x_test assigned in later cells hold the encoded features.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Next, we define a set of off-the-shelf scikit-learn classifiers to compare.

In [140]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifiers with scikit-learn defaults. The random forest is the
# only estimator here whose fit is randomized by default, so it gets the same
# seed as the train/test split to make its score repeatable across runs.
multi_nb = MultinomialNB()
random_forest = RandomForestClassifier(random_state=42)
log_reg = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

# Order matters: a later cell picks a model by position (models[3] is the SVC).
models = [multi_nb, random_forest, log_reg, svc, knn]

def evaluate_models(models):
    """Fit every model on the current training features and print its test accuracy.

    The function takes no data arguments: it reads the notebook-level globals
    ``x_train``, ``x_test``, ``y_train`` and ``y_test`` at call time, so those
    must be (re)assigned before each call. Each model object is refit in place.

    Args:
        models: iterable of estimators exposing ``fit`` and ``predict``.

    Returns:
        None. One line per model is printed: the model repr followed by its
        accuracy on the test split.
    """
    # Imported locally so the function also works on a freshly restarted
    # kernel; no visible cell imports accuracy_score at module level.
    from sklearn.metrics import accuracy_score

    for model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print(model, accuracy_score(y_test, y_pred))

We first try a `CountVectorizer` (bag-of-words) encoding of the text, fitted on the training data only.

In [142]:
# Vectorize training and testing data with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
# max_features=3000 caps the vocabulary size.
# NOTE(review): 3000 looks hand-picked — confirm or move to a config constant.
vectorizer = CountVectorizer(max_features=3000)
# Fit on the training text only, then reuse the fitted vectorizer for the test text.
x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)

# evaluate_models reads x_train / x_test / y_train / y_test as globals, so the
# assignments above must run before this call.
evaluate_models(models)

MultinomialNB() 0.6944444444444444
RandomForestClassifier() 0.5944444444444444
LogisticRegression() 0.6555555555555556
SVC() 0.4722222222222222
KNeighborsClassifier() 0.29444444444444445


We can also try a TF-IDF encoding, using the same vocabulary cap as above.

In [143]:
# Vectorize training and testing data with TfidfVectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
# Same vocabulary cap (max_features=3000) as the CountVectorizer cell.
# Rebinding `vectorizer`, `x_train` and `x_test` replaces the CountVectorizer versions.
vectorizer = TfidfVectorizer(max_features=3000)
# Fit on the training text only, then reuse the fitted vectorizer for the test text.
x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)

# evaluate_models reads the freshly assigned globals; the same model objects
# from the previous cell are refit on the new features.
evaluate_models(models)

MultinomialNB() 0.7
RandomForestClassifier() 0.5444444444444444
LogisticRegression() 0.6777777777777778
SVC() 0.6333333333333333
KNeighborsClassifier() 0.6333333333333333


We can also import the tokenizer models we previously trained

In [None]:
from tokenizers import Tokenizer
import sentencepiece as spm

class TokenizerWrapper:
    """Give different tokenizing callables a common ``tokenize(text)`` method.

    Args:
        encoding_func: callable invoked as ``encoding_func(text, **arguments)``.
        arguments: optional dict of keyword arguments forwarded on every call.
            Defaults to no extra arguments.
    """

    def __init__(self, encoding_func, arguments=None):
        self.encoding_func = encoding_func
        # A fresh dict per instance: a literal ``{}`` default would be one
        # shared object, so mutating it on one wrapper would affect all others.
        self.arguments = {} if arguments is None else arguments

    def tokenize(self, text):
        """Tokenize ``text`` and return the tokens.

        If the wrapped callable returns a list it is returned as is; any other
        result is expected to expose the tokens on a ``.tokens`` attribute.
        """
        tokens = self.encoding_func(text, **self.arguments)
        if isinstance(tokens, list):
            return tokens
        else:
            return tokens.tokens

# Load the two serialized tokenizers from disk. Paths are relative, so the
# notebook must be run from the directory that contains `Tokenizers/`.
bpe_tokenizer = Tokenizer.from_file("Tokenizers/bpe_tokenizer.json")
sp_tokenizer = spm.SentencePieceProcessor(model_file="Tokenizers/sp_model.model")

def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words in ``text``.

    Each n-gram is a single string with its words joined by one space, listed
    in order of appearance. A text with fewer than ``n`` words yields ``[]``.
    """
    words = text.split()
    window_count = len(words) - n + 1
    return [' '.join(words[start:start + n]) for start in range(window_count)]

def whitespace_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting words."""
    words = text.split()
    return words

# Load them all into a list
# Every entry exposes the same .tokenize(text) method. Order matters: a later
# cell selects a tokenizer by position (tokenizer_models[0]).
tokenizer_models = [
    TokenizerWrapper(bpe_tokenizer.encode),  # presumably returns a non-list result read via .tokens — verify
    TokenizerWrapper(sp_tokenizer.encode, {'out_type': str}),  # out_type=str is forwarded to encode()
    TokenizerWrapper(generate_ngrams, {'n': 2}),  # word bigrams
    TokenizerWrapper(generate_ngrams, {'n': 3}),  # word trigrams
    TokenizerWrapper(whitespace_tokenizer)  # plain whitespace split
]

In [None]:
from gensim.models import Word2Vec, FastText
import numpy as np

class EmbeddingTransformer:
    """Uniform ``transform`` front-end over a key-indexable embedding lookup."""

    def __init__(self, transform_method):
        # Any object supporting ``obj[key]`` works here.
        self.transform_method = transform_method

    def transform(self, X):
        """Return ``transform_method[X]``, or None when the lookup raises KeyError."""
        try:
            vector = self.transform_method[X]
        except KeyError:
            # Unknown key: signal "no embedding" to the caller instead of raising.
            return None
        return vector
    
# Load the saved gensim models from disk (paths are relative to the working directory).
# NOTE(review): gensim's load() is pickle-based as far as I know — only load
# model files from a trusted source.
word_vec = Word2Vec.load('Embeddings/word2vec_model.bin')
fast_vec = FastText.load('Embeddings/fasttext_model.bin')

# Load them all into a list
# Only each model's `.wv` attribute is wrapped; EmbeddingTransformer indexes it
# by token. Order matters: a later cell selects embedding_models[0].
embedding_models = [
    EmbeddingTransformer(word_vec.wv),
    EmbeddingTransformer(fast_vec.wv)
]

Now, we can start analyzing the performance of various tokenizations, embeddings, and models

In [None]:
import numpy as np

def generate_embeddings(tokenizer_model, embedding_model, corpus):
    """Encode each text in ``corpus`` as the mean of its token embeddings.

    Args:
        tokenizer_model: object with ``tokenize(text)`` returning an iterable of tokens.
        embedding_model: object with ``transform(token)`` returning a vector,
            or None (or raising KeyError) when the token has no embedding.
        corpus: iterable of texts.

    Returns:
        A numpy array with one row per text, in corpus order. A text with no
        embeddable token gets an all-zero row, so rows always stay aligned
        with the labels and the array is never ragged or NaN-filled. If no
        text has any embeddable token, the array has shape ``(len(corpus), 0)``.
    """
    sample_vectors = []
    for text in corpus:
        token_vectors = []
        for token in tokenizer_model.tokenize(text):
            try:
                vector = embedding_model.transform(token)
            except KeyError:
                # Tolerate embedding models that raise instead of returning None.
                continue
            if vector is not None:
                token_vectors.append(vector)

        # np.mean over an empty list yields a scalar NaN (plus a warning), so
        # mark empty samples and fill them once the vector width is known.
        sample_vectors.append(np.mean(token_vectors, axis=0) if token_vectors else None)

    # Any real sample vector serves as a shape/dtype template for the zero rows.
    template = next((vec for vec in sample_vectors if vec is not None), None)
    if template is None:
        return np.zeros((len(sample_vectors), 0))

    zero_vector = np.zeros_like(template)
    return np.array([zero_vector if vec is None else vec for vec in sample_vectors])

def get_scores(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Returns:
        A ``(train_score, test_score)`` tuple of ``model.score`` results; the
        model is fitted in place.
    """
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return train_score, test_score

# Try and train one model
# Uses the first tokenizer (the bpe_tokenizer wrapper) with the first embedding
# model (word_vec.wv). This rebinds the global x_train / x_test to the
# mean-embedding features.
x_train = generate_embeddings(tokenizer_models[0], embedding_models[0], X_train.to_list())
x_test = generate_embeddings(tokenizer_models[0], embedding_models[0], X_test.to_list())

# models[3] is the SVC; get_scores returns (train score, test score).
print(get_scores(models[3], x_train, x_test, y_train, y_test))