In [None]:
### load data set and preprocessing
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import nltk

# Fetch the predefined train / test partitions of the 20 Newsgroups corpus.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# NLTK resources needed below: tokenizer model, WordNet, and stop-word lists.
for _resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_resource)

# A set gives constant-time membership checks while filtering tokens.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Function to preprocess
def lemmatizing(text):
    """Lower-case, tokenize, filter and lemmatize one document.

    Args:
        text: Raw document string.

    Returns:
        A single string holding the lemmatized tokens joined by spaces.
        Tokens found in ``stop_words`` and tokens made up solely of
        punctuation characters are dropped.
    """
    punctuation_chars = set(string.punctuation)
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    kept = []
    for token in word_tokenize(text.lower()):
        # `token in string.punctuation` is a substring test, so multi-character
        # punctuation tokens such as "..." or "--" were not filtered; checking
        # every character removes them as well.
        if all(ch in punctuation_chars for ch in token):
            continue
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

def stemming(text):
    """Lower-case, tokenize, filter and Porter-stem one document.

    Args:
        text: Raw document string.

    Returns:
        A single string holding the stemmed tokens joined by spaces.
        Tokens found in ``stop_words`` and tokens made up solely of
        punctuation characters are dropped.
    """
    punctuation_chars = set(string.punctuation)
    # Build the stemmer once per call rather than once per token.
    stemmer = PorterStemmer()
    kept = []
    for token in word_tokenize(text.lower()):
        # `token in string.punctuation` is a substring test, so multi-character
        # punctuation tokens such as "..." or "--" were not filtered; checking
        # every character removes them as well.
        if all(ch in punctuation_chars for ch in token):
            continue
        if token in stop_words:
            continue
        kept.append(stemmer.stem(token))
    return ' '.join(kept)

# Pre-compute both normalised variants of the corpus once so that the
# experiment cells below can switch between them without re-tokenizing.
newsgroups_train_lemmatized = [lemmatizing(x) for x in newsgroups_train.data]
newsgroups_test_lemmatized  = [lemmatizing(x) for x in newsgroups_test.data]

newsgroups_train_stemmed = [stemming(x) for x in newsgroups_train.data]
newsgroups_test_stemmed  = [stemming(x) for x in newsgroups_test.data]

# Report the size of each split (the second line used to be mislabelled "train").
print('train dataset length: ', len(newsgroups_train.data))
print('test dataset length: ', len(newsgroups_test.data))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


train dataset length:  11314
train dataset length:  7532


In [None]:
### Naive Bayes, LogR


from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import  GridSearchCV
from sklearn import metrics
import time

# Pick the text representation used by the experiments in this cell.
# Alternatives (swap the right-hand sides to try them):
#   newsgroups_train.data       / newsgroups_test.data        -> raw text
#   newsgroups_train_lemmatized / newsgroups_test_lemmatized  -> lemmatized text
train = newsgroups_train_stemmed
test = newsgroups_test_stemmed


def my_grid_search(input_pipeline, input_parameters, train, test,
                   train_target=None, test_target=None, target_names=None):
    """Grid-search a pipeline on ``train`` and report its performance on ``test``.

    Runs ``GridSearchCV`` with ``cv=5, n_jobs=-1, verbose=1``, prints the best
    parameter set, the mean test score (+/- two standard deviations) of every
    candidate, a classification report for the predictions on ``test`` and
    finally the elapsed wall-clock time in minutes.

    Args:
        input_pipeline: Estimator / pipeline handed to ``GridSearchCV``.
        input_parameters: Parameter grid handed to ``GridSearchCV``.
        train: Training documents (or features) to fit on.
        test: Held-out documents (or features) to predict on.
        train_target: Labels for ``train``. Defaults to
            ``newsgroups_train.target`` (the previous hard-wired behaviour).
        test_target: Labels for ``test``. Defaults to
            ``newsgroups_test.target``.
        target_names: Class names shown in the report. Defaults to
            ``newsgroups_test.target_names``.

    Returns:
        The fitted ``GridSearchCV`` object, so callers can inspect or reuse it.
        Existing callers that ignore the return value are unaffected.
    """
    # Fall back to the notebook-level datasets only when labels are not given,
    # so the function no longer depends on hidden global state.
    if train_target is None:
        train_target = newsgroups_train.target
    if test_target is None:
        test_target = newsgroups_test.target
    if target_names is None:
        target_names = newsgroups_test.target_names

    print('---------------------------------------')
    print( input_pipeline)
    print( input_parameters)
    tt=time.time()
    grid_search = GridSearchCV(input_pipeline, input_parameters, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(train, train_target)
    print("Best parameters set found on development set:",grid_search.best_params_)
    print("Grid scores on development set:")
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        # Report the spread as two standard deviations across the CV folds.
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    predicted = grid_search.predict(test)
    print(metrics.classification_report(test_target, predicted, target_names=target_names))
    # Elapsed time in minutes (search, prediction and reporting).
    print((time.time()-tt)/60)
    print('---------------------------------------')
    return grid_search



# Bag-of-words baselines: raw token counts are fed straight into each classifier.
for clf in (MultinomialNB(), LogisticRegression(max_iter=1000)):
    pipeline = Pipeline(steps=[
        ('vectorizer', CountVectorizer()),
        ('classifier', clf),
    ])
    # Vocabulary size is fixed; only the n-gram span is searched.
    # (There is no TF-IDF step in this pipeline, hence no 'transform__*' entry.)
    parameters = {
        'vectorizer__max_features': [10000],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    }
    my_grid_search(pipeline, parameters, train, test)

# TF-IDF variant: counts are re-weighted before classification.
# Only logistic regression is evaluated here.
for clf in (LogisticRegression(max_iter=1000),):
    pipeline = Pipeline(steps=[
        ('vectorizer', CountVectorizer()),
        ('transform', TfidfTransformer()),
        ('classifier', clf),
    ])
    # Search the n-gram span together with the TF-IDF row normalisation.
    parameters = {
        'vectorizer__max_features': [10000],
        'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'transform__norm': ['l1', 'l2'],
    }
    my_grid_search(pipeline, parameters, train, test)







---------------------------------------
Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', MultinomialNB())])
{'vectorizer__max_features': [10000], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)]}
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters set found on development set: {'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 1)}
Grid scores on development set:
0.843 (+/-0.009) for {'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 1)}
0.822 (+/-0.008) for {'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 2)}
0.811 (+/-0.008) for {'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 3)}
                          precision    recall  f1-score   support

             alt.atheism       0.75      0.81      0.78       319
           comp.graphics       0.56      0.79      0.65       389
 comp.os.ms-windows.misc       0.20      0.00      0.01       394
comp.sys.ibm.pc.ha

In [None]:
### Glove Embedding
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import  GridSearchCV
from sklearn import metrics
import time

import numpy as np

# Pick the text representation used by the GloVe experiment in this cell.
# Alternatives (swap the right-hand sides to try them):
#   newsgroups_train.data       / newsgroups_test.data        -> raw text
#   newsgroups_train_lemmatized / newsgroups_test_lemmatized  -> lemmatized text
train = newsgroups_train_stemmed
test = newsgroups_test_stemmed



# Load pre-trained GloVe vectors into a {word: float32 vector} dictionary.
# Each line of the file is expected to be "<word> <v1> <v2> ... <vN>".
glove_file = 'glove.6B.100d.txt'  # Adjust the path accordingly
word_vectors = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        word = values[0]
        try:
            vector = np.array(values[1:], dtype='float32')
        except ValueError:
            # Malformed entry (non-numeric component): show it and move on.
            # Only parse errors are caught, so interrupts and genuine
            # failures are no longer silently swallowed by a bare `except`.
            print(line)
            continue
        word_vectors[word] = vector


# Represent every document as the mean GloVe vector of its in-vocabulary tokens.
EMBEDDING_DIM = 100  # Adjust dimensions based on GloVe file used

# Only the vocabulary is needed, so fit without building the count matrices.
vectorizer = CountVectorizer(max_features=100000)
vectorizer.fit(train)

word_index = vectorizer.vocabulary_
# The extra trailing row is never indexed; it is kept so the matrix shape
# matches what this cell produced before.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
# NOTE: vocabulary words without a GloVe vector keep an all-zero row and
# therefore still count in the document mean below.

# Tokenize documents exactly like the vectorizer that produced `word_index`;
# a plain `str.split()` leaves case/punctuation differences that miss the vocabulary.
analyzer = vectorizer.build_analyzer()


def _mean_embeddings(docs):
    """Return an array with one averaged embedding row per document in ``docs``."""
    doc_vectors = []
    for doc in docs:
        rows = [word_index[token] for token in analyzer(doc) if token in word_index]
        if rows:
            doc_vectors.append(embedding_matrix[rows].mean(axis=0))
        else:
            doc_vectors.append(np.zeros(EMBEDDING_DIM))  # Zero vector if no embeddings found
    return np.array(doc_vectors)


# Embed the same texts the vocabulary was fitted on (`train` / `test`), not the
# raw `newsgroups_*.data`, so tokens and vocabulary come from one representation.
X_train_glove = _mean_embeddings(train)
X_test_glove = _mean_embeddings(test)

# Train a classifier directly on the averaged GloVe document vectors and
# report per-class metrics plus the elapsed time in minutes.
for clf in (LogisticRegression(max_iter=1000),):
    pipeline = Pipeline(steps=[('classifier', clf)])
    tt = time.time()
    pipeline.fit(X_train_glove, newsgroups_train.target)
    predicted = pipeline.predict(X_test_glove)
    report = metrics.classification_report(
        newsgroups_test.target,
        predicted,
        target_names=newsgroups_test.target_names,
    )
    print(report)
    print((time.time() - tt) / 60)






                          precision    recall  f1-score   support

             alt.atheism       0.20      0.07      0.10       319
           comp.graphics       0.42      0.36      0.39       389
 comp.os.ms-windows.misc       0.34      0.18      0.23       394
comp.sys.ibm.pc.hardware       0.41      0.37      0.39       392
   comp.sys.mac.hardware       0.35      0.07      0.12       385
          comp.windows.x       0.40      0.46      0.43       395
            misc.forsale       0.54      0.71      0.61       390
               rec.autos       0.54      0.57      0.55       396
         rec.motorcycles       0.41      0.58      0.48       398
      rec.sport.baseball       0.46      0.55      0.50       397
        rec.sport.hockey       0.60      0.63      0.62       399
               sci.crypt       0.48      0.55      0.51       396
         sci.electronics       0.50      0.39      0.44       393
                 sci.med       0.54      0.66      0.59       396
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# from sklearn.datasets import fetch_20newsgroups
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.feature_extraction.text import CountVectorizer
# from gensim.models import Word2Vec
# from sklearn import metrics
# import numpy as np



# train,test = newsgroups_train.data , newsgroups_test.data
# # train,test = newsgroups_train_lemmatized , newsgroups_test_lemmatized
# # train,test = newsgroups_train_stemmed    , newsgroups_test_stemmed


# # Tokenize the text and train Word2Vec embeddings
# tokenized_train_text = [doc.split() for doc in train]
# word2vec_model = Word2Vec(sentences=tokenized_train_text, vector_size=100, window=5, min_count=1, workers=4)

# class Word2VecTransformer:
#     def __init__(self, word2vec):
#         self.word2vec = word2vec
#         self.vector_size = word2vec.vector_size
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         return np.array([
#             np.mean([self.word2vec.wv[word] for word in doc.split() if word in self.word2vec.wv] or [np.zeros(self.vector_size)], axis=0)
#             for doc in X
#         ])

# # Build the pipeline
# pipeline = Pipeline([
#     ('word2vec', Word2VecTransformer(word2vec_model)),
#     ('classifier', LogisticRegression(max_iter=1000))
# ])

# # Train the model
# pipeline.fit(train, newsgroups_train.target)
# predicted = pipeline.predict(test)
# print(metrics.classification_report(newsgroups_test.target, predicted, target_names=newsgroups_test.target_names))


                          precision    recall  f1-score   support

             alt.atheism       0.27      0.33      0.30       319
           comp.graphics       0.29      0.31      0.30       389
 comp.os.ms-windows.misc       0.33      0.22      0.26       394
comp.sys.ibm.pc.hardware       0.31      0.39      0.35       392
   comp.sys.mac.hardware       0.26      0.24      0.25       385
          comp.windows.x       0.47      0.44      0.45       395
            misc.forsale       0.69      0.70      0.70       390
               rec.autos       0.27      0.32      0.29       396
         rec.motorcycles       0.44      0.39      0.42       398
      rec.sport.baseball       0.31      0.29      0.30       397
        rec.sport.hockey       0.51      0.50      0.50       399
               sci.crypt       0.36      0.50      0.41       396
         sci.electronics       0.30      0.19      0.23       393
                 sci.med       0.29      0.27      0.28       396
         