In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [3]:
categories = ["rec.autos", "sci.electronics","talk.politics.misc"]

In [4]:
train_data = fetch_20newsgroups(subset='train', categories=categories)
test_data = fetch_20newsgroups(subset='test',categories=categories)

In [5]:
#Count Vectorizer
def vectorizer_func(X_train,X_test):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    return X_train, X_test

In [6]:
#Tfidf Vectorizer
def tfidf_func(X_train,X_test):
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(X_train)
    X_test = tfidf.transform(X_test)
    return X_train, X_test


In [7]:
#Word2Vec
def word2vec_func(X_train, X_test):
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        tokens = word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        return " ".join(filtered_tokens)

    X_train = [preprocess_text(doc).split() for doc in X_train]
    X_test = [preprocess_text(doc).split() for doc in X_test]
    word2vec_model = Word2Vec(X_train, vector_size=100, window=5, min_count=1, sg=0)

    def vectorize(sentence,w2v_model):
        words_vecs = [w2v_model.wv[word] for word in sentence
                    if word in w2v_model.wv]
        if len(words_vecs) == 0:
            return np.zeros(100)
        words_vecs = np.array(words_vecs)
        return words_vecs.mean(axis = 0)

    X_train = np.array([vectorize(sentence,word2vec_model) for sentence in X_train])
    X_test = np.array([vectorize(sentence,word2vec_model) for sentence in X_test])
    
    scaler = MinMaxScaler() #MultinomialNB doesn't accept negative values. Therefore scaling has been applied.
    X_train = scaler.fit_transform(X_train)
    X_test= scaler.fit_transform(X_test)
    return X_train, X_test


In [8]:
#Doc2Vec
def doc2vec_func(X_train, X_test):
    stop_words = set(stopwords.words('english'))

    def preprocess_text(text):
        tokens = word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        return " ".join(filtered_tokens)

    X_train = [preprocess_text(doc).split() for doc in X_train]
    X_test = [preprocess_text(doc).split() for doc in X_test]
    tagged_X_train = [TaggedDocument(doc, tags=[str(i)]) for i, doc in enumerate(X_train)]

    model = Doc2Vec(vector_size=100, window=5, min_count=1, dm=1, epochs=20)
    model.build_vocab(tagged_X_train)
    model.train(tagged_X_train, total_examples=model.corpus_count, epochs=model.epochs)
    X_train = [model.infer_vector(doc) for doc in X_train] 
    X_test = [model.infer_vector(doc) for doc in X_test]
    
    scaler = MinMaxScaler() #MultinomialNB doesn't accept negative values. Therefore scaling has been applied.
    X_train = scaler.fit_transform(X_train)
    X_test= scaler.fit_transform(X_test)
    return X_train, X_test

In [9]:
methods = [("Count_Vectorizer",vectorizer_func),
                 ("Tf-Idf",tfidf_func), 
                 ("Word2Vec", word2vec_func), 
                 ("Doc2Vec",doc2vec_func)]

methods_X_values = []
for method in methods:
    func = method[1]
    X_train, X_test = func(train_data.data, test_data.data)
    methods_X_values.append([method[0],X_train,X_test])

In [32]:
nb = MultinomialNB()
lr = LogisticRegression()
svc = SVC()
dt =  DecisionTreeClassifier()

classifiers = [('MultinomialNB', nb),
               ('LogisticRegression', lr),
               ('SVC', svc),
               ('DecisionTree',dt)]
grid_nb = {'alpha':[0.1, 1.0, 2.0], 'fit_prior':[True,False]}
grid_lr = {'penalty':['l2'], 'C':[0.01, 0.1, 1.0, 10.0], 'max_iter' :[999999]}
grid_svc = {'C':[0.1, 1.0, 10],'degree': [2, 3, 4]}
grid_dt = {'max_depth':[2,5,10,20,None], 'max_features':['sqrt',None]}
param_grids = {'MultinomialNB':grid_nb, 'LogisticRegression':grid_lr,
               'SVC':grid_svc, 'DecisionTree':grid_dt}

In [28]:
scores_df = pd.DataFrame(index = [i[0] for i in methods],columns = [i[0] for i in classifiers])

In [35]:
i = 0
total_len = len(methods_X_values) * len(classifiers)
for X_values in methods_X_values:
    for classifier in classifiers:
        grid_cv = GridSearchCV(classifier[1],param_grid = param_grids[classifier[0]])
        grid_cv.fit(X_values[1],train_data.target)
        #classifier[1].fit(X_values[1],train_data.target)
        #predictions = classifier[1].predict(X_values[2])
        predictions = grid_cv.predict(X_values[2])
        accuracy = accuracy_score(test_data.target, predictions)
        scores_df.at[X_values[0], classifier[0]] = accuracy
        i+=1
        print(f"%{100*i/total_len} Completed")

%6.25 Completed
%12.5 Completed
%18.75 Completed
%25.0 Completed
%31.25 Completed
%37.5 Completed
%43.75 Completed
%50.0 Completed
%56.25 Completed
%62.5 Completed
%68.75 Completed
%75.0 Completed
%81.25 Completed
%87.5 Completed
%93.75 Completed
%100.0 Completed


In [38]:
scores_df

Unnamed: 0,MultinomialNB,LogisticRegression,SVC,DecisionTree
Count_Vectorizer,0.961783,0.902639,0.878981,0.770701
Tf-Idf,0.957234,0.942675,0.929026,0.731574
Word2Vec,0.406733,0.435851,0.466788,0.424932
Doc2Vec,0.898999,0.833485,0.864422,0.590537
