# Data Parsing: 
Newsgroups: Loaded with scikit-learn (`fetch_20newsgroups`)

Reuters: Parsed using Beautiful Soup

In [None]:
import tarfile 
import bs4
from bs4 import BeautifulSoup
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the predefined train and test subsets, asking scikit-learn to strip
# headers, footers and quoted replies from every post.
metadata_to_strip = ('headers', 'footers', 'quotes')
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name, remove=metadata_to_strip)
    for split_name in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Download the Reuters-21578 archive into the working directory.
# -N turns on wget's timestamping, so an up-to-date local copy is not fetched again.
!wget -N https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz

--2021-06-02 19:07:43--  https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz
Resolving kdd.ics.uci.edu (kdd.ics.uci.edu)... 128.195.1.86
Connecting to kdd.ics.uci.edu (kdd.ics.uci.edu)|128.195.1.86|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8150596 (7.8M) [application/x-gzip]
Saving to: ‘reuters21578.tar.gz’


2021-06-02 19:07:43 (17.7 MB/s) - ‘reuters21578.tar.gz’ saved [8150596/8150596]



In [None]:
#Import and preprocess reuters
import os
import glob

# Directory the archive is unpacked into. Despite the ".tar.gz" suffix this is
# a directory, not a file; the original location is kept so existing
# extractions are reused.
reuters_dir = "./content/reuters21578.tar.gz"

# Label given to articles that carry no topic. It is a string so that every
# entry of y_reuters has the same type as the real topic labels.
NO_TOPIC_LABEL = "0"

# tarfile alone unpacks the archive (the extra `!tar xvzf` shell call only
# produced a second, unused copy in the working directory). The context
# manager closes the archive even if extraction fails.
# NOTE(review): extractall trusts the member paths stored in the archive;
# acceptable for this known dataset, not for arbitrary downloads.
with tarfile.open('reuters21578.tar.gz') as reuters_data:
    reuters_data.extractall(reuters_dir)

#X_reuters are the articles
#y values are the labels, in this case topics
X_reuters = []
y_reuters = []

# sorted() makes the article order independent of the filesystem's listing order.
for filename in sorted(glob.glob(os.path.join(reuters_dir, "*.sgm"))): #find all files that end in sgm
    with open(filename, "rb") as sgm_file:
        #tried using lxml and xml but neither worked, whereas this did
        soup = BeautifulSoup(sgm_file.read(), 'html.parser')

    # Read text and topics from the SAME <reuters> element so that
    # X_reuters[i] and y_reuters[i] always describe the same article, even if
    # some article is missing one of the tags.
    for article in soup.find_all("reuters"):
        body = article.find("text")
        if body is None:
            # No text to classify: skip the article entirely (no label is
            # appended either, so the two lists stay aligned).
            continue

        topics_tag = article.find("topics")
        # Each topic sits in its own <d> tag inside <topics>.
        topics = [d.text for d in topics_tag.find_all("d")] if topics_tag is not None else []

        X_reuters.append(body.text)
        # Multiple topics of one article are merged into a single
        # space-separated label; an empty topic list gets the placeholder label.
        y_reuters.append(" ".join(topics) if topics else NO_TOPIC_LABEL)

# Guard against any future drift between articles and labels.
assert len(X_reuters) == len(y_reuters)



# Feature Extraction for both Newsgroups and Reuters datasets 

In [None]:
# Newsgroups Feature Extraction

import sklearn 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
#countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english') #can be enabled to use count vectorizer instead of tfidf 

# Fixed seed so that both train/test splits -- and therefore every score
# reported further down -- are reproducible after Restart & Run All.
RANDOM_SEED = 42


def make_tfidf_vectorizer():
    """Return a new, unfitted word-level TF-IDF vectorizer (English stop words removed, unigrams only)."""
    #using tfidf in favor of it being more finely grained
    return TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 1))


# One vectorizer per corpus: re-fitting a single instance on Reuters would
# throw away the vocabulary learned from Newsgroups.
tfidfvectorizer = make_tfidf_vectorizer()          # fitted on Newsgroups below
reuters_tfidfvectorizer = make_tfidf_vectorizer()  # fitted on Reuters below

#Splitting newsgroups into train and test
# NOTE(review): unlike the earlier train/test fetch, this call does not pass
# remove=('headers', 'footers', 'quotes') -- confirm that is intended.
newsgroups_dataset = fetch_20newsgroups(subset='all') #retrieving all data and breaking it up into train and test using sklearn train_test_split
X_news = newsgroups_dataset.data
y_news = newsgroups_dataset.target
# Newsgroups Feature Extraction
X_news_train, X_news_test, y_news_train, y_news_test = train_test_split(
    X_news, y_news, test_size=0.2, random_state=RANDOM_SEED) #80/20 train: test proportion
# The vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the test split.
X_news_train = tfidfvectorizer.fit_transform(X_news_train) #vectorizing data newsgroups
X_news_test = tfidfvectorizer.transform(X_news_test) #vectorizing test data newsgroups


# Reuters Feature Extraction
X_reuters_train, X_reuters_test, y_reuters_train, y_reuters_test = train_test_split(
    X_reuters, y_reuters, test_size=0.2, random_state=RANDOM_SEED)

X_reuters_train = reuters_tfidfvectorizer.fit_transform(X_reuters_train) #vectorizing training data reuters
X_reuters_test = reuters_tfidfvectorizer.transform(X_reuters_test) #vectorizing test data reuters


# **Fitting the Model** 

# SVC Probabilistic, SVC Non-Probabilistic, Multinomial Naive Bayes, K-Neighbors Classifier 


In [None]:
########################################## SVC PROBABILISTIC NEWSGROUPS DATASET ############################################

import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# Linear-kernel SVC with probability estimates switched on, used as the
# "probabilistic" counterpart of the plain SVC fitted in a later cell.
# NOTE(review): the report below is built from predict(), not predict_proba().
svm = SVC(kernel='linear', probability=True).fit(X_news_train, y_news_train)
news_predictions = svm.predict(X_news_test)
print(classification_report(y_news_test, news_predictions))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       155
           1       0.84      0.88      0.86       192
           2       0.88      0.91      0.89       193
           3       0.80      0.83      0.81       194
           4       0.91      0.88      0.90       193
           5       0.90      0.93      0.91       202
           6       0.87      0.88      0.88       213
           7       0.94      0.93      0.93       191
           8       0.97      0.97      0.97       197
           9       0.96      0.96      0.96       204
          10       0.98      0.96      0.97       202
          11       0.98      0.96      0.97       193
          12       0.85      0.89      0.87       176
          13       0.93      0.94      0.93       199
          14       0.97      0.94      0.96       211
          15       0.98      0.96      0.97       206
          16       0.97      0.96      0.96       180
          17       0.99    

In [None]:
######################################### SVC PROBABILISTIC REUTERS DATASET ############################################
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# Linear-kernel SVC with probability estimates switched on, fitted on the
# Reuters training split for comparison with the non-probabilistic setting.
svm = SVC(kernel='linear', probability=True)
svm.fit(X_reuters_train, y_reuters_train) #fit model with train data
reuters_predictions = svm.predict(X_reuters_test)
# Many Reuters labels are rare topic combinations that never get predicted.
# zero_division=0 reports their precision as 0.00 (the same value as before)
# without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_reuters_test, reuters_predictions, zero_division=0))

                                                                                                    precision    recall  f1-score   support

                                                                                                 0       0.82      0.93      0.87      1985
                                                                                               acq       0.87      0.85      0.86       490
                                                                                          acq alum       0.00      0.00      0.00         2
                                                                                        acq copper       0.00      0.00      0.00         2
                                                                                         acq crude       0.00      0.00      0.00         1
                                                                                          acq gold       0.00      0.00      0.00         1
                   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
################################################. SVC NON-PROBABILISTIC NEWSGROUPS DATASET ############################################
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Plain linear-kernel SVC (no probability estimates), the non-probabilistic
# baseline for the Newsgroups comparison.
svmnon = SVC(kernel='linear').fit(X_news_train, y_news_train)
news_predictions_nonprob = svmnon.predict(X_news_test)
print(classification_report(y_news_test, news_predictions_nonprob))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       191
           1       0.82      0.89      0.85       193
           2       0.87      0.88      0.87       190
           3       0.80      0.84      0.82       176
           4       0.93      0.87      0.90       207
           5       0.90      0.88      0.89       202
           6       0.85      0.87      0.86       190
           7       0.93      0.95      0.94       201
           8       0.97      0.96      0.96       211
           9       0.97      0.97      0.97       182
          10       0.99      0.98      0.99       194
          11       0.98      0.95      0.96       210
          12       0.86      0.92      0.89       179
          13       0.94      0.97      0.95       188
          14       0.99      0.97      0.98       204
          15       0.94      0.94      0.94       203
          16       0.96      0.94      0.95       200
          17       0.98    

In [None]:
################################# SVC NON-PROBABILISTIC REUTERS DATASET ############################################
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Plain linear-kernel SVC (no probability estimates), the non-probabilistic
# baseline for the Reuters comparison.
svmnon = SVC(kernel='linear')
svmnon.fit(X_reuters_train, y_reuters_train) #fit model with train data
reuters_predictions_nonprob = svmnon.predict(X_reuters_test)
# Rare topic combinations are never predicted; zero_division=0 keeps their
# scores at 0.00 (unchanged) but suppresses the UndefinedMetricWarning noise.
print(classification_report(y_reuters_test, reuters_predictions_nonprob, zero_division=0))

['0' 'lumber' '0' ... 'acq' '0' '0']
                                                                                                    precision    recall  f1-score   support

                                                                                                 0       0.82      0.93      0.87      2008
                                                                                               acq       0.88      0.82      0.85       494
                                                                                        acq copper       0.00      0.00      0.00         1
                                                                                         acq crude       0.00      0.00      0.00         1
                                                                                 acq crude nat-gas       0.00      0.00      0.00         5
                                                                                          acq earn       0.00      0.00   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
################################################ K-NEIGHBORS CLASSIFIER NEWSGROUPS DATASET#################################################
from sklearn.neighbors import KNeighborsClassifier #Trying another non-probabilistic classifier that is not SVM for experimental purposes 
from sklearn.metrics import classification_report
# K-nearest-neighbours baseline on Newsgroups: 11 neighbours, each with an equal vote.
kneighbors = KNeighborsClassifier(n_neighbors=11, weights='uniform').fit(X_news_train, y_news_train)
news_predictions_knn = kneighbors.predict(X_news_test)
print(classification_report(y_news_test, news_predictions_knn))

              precision    recall  f1-score   support

           0       0.73      0.87      0.79       191
           1       0.69      0.65      0.67       193
           2       0.66      0.72      0.69       190
           3       0.58      0.65      0.62       176
           4       0.69      0.66      0.67       207
           5       0.81      0.73      0.77       202
           6       0.66      0.48      0.56       190
           7       0.81      0.84      0.82       201
           8       0.85      0.87      0.86       211
           9       0.76      0.88      0.82       182
          10       0.84      0.93      0.89       194
          11       0.90      0.92      0.91       210
          12       0.79      0.73      0.76       179
          13       0.92      0.78      0.84       188
          14       0.85      0.89      0.87       204
          15       0.89      0.80      0.84       203
          16       0.86      0.89      0.87       200
          17       0.86    

In [None]:
################################################# K-NEIGHBORS CLASSIFIER REUTERS DATASET#########################################
from sklearn.neighbors import KNeighborsClassifier #Trying another non probabilistic classifier that is not SVM for experimental purposes 
from sklearn.metrics import classification_report
# K-nearest-neighbours baseline on Reuters: 11 neighbours, each with an equal vote.
# NOTE(review): leaf_size only matters for tree-based neighbour search;
# presumably the sparse TF-IDF input is searched by brute force, in which case
# it has no effect -- confirm.
kneighbors = KNeighborsClassifier(n_neighbors=11, leaf_size=15, weights='uniform')
kneighbors.fit(X_reuters_train, y_reuters_train)
reuters_predictions_knn = kneighbors.predict(X_reuters_test)
# Rare topic combinations are never predicted; zero_division=0 keeps their
# scores at 0.00 (unchanged) but suppresses the UndefinedMetricWarning noise.
print(classification_report(y_reuters_test, reuters_predictions_knn, zero_division=0))

                                                                                                    precision    recall  f1-score   support

                                                                                                 0       0.73      0.93      0.82      2008
                                                                                               acq       0.81      0.39      0.53       494
                                                                                        acq copper       0.00      0.00      0.00         1
                                                                                         acq crude       0.00      0.00      0.00         1
                                                                                 acq crude nat-gas       0.00      0.00      0.00         5
                                                                                          acq earn       0.00      0.00      0.00         3
                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Model Comparison and Results Comparison: 

SVM Probabilistic Reuters: Accuracy:83% Macro:20% Weighted:81%

SVM Probabilistic Newsgroups: Accuracy:93% Macro:92% Weighted:93%


SVM Non Probablistic Reuters: Accuracy:83% Macro:20% Weighted:81%

SVM Non Probablistic Newsgroups: Accuracy:92% Macro:92% Weighted:93%


KNN Reuters: Accuracy:75% Macro:16% Weighted:72%

KNN Newsgroups: Accuracy:79% Macro:78% Weighted:79%

KNN seems to be much more computationally efficient than the SVM on these particular datasets. A more detailed account of the models I used is included in the PDF report. In short, KNN classifies an unlabeled data point based on its distance to the labeled points: it finds the K labeled points closest to it and assigns the class that is most common among them, so K specifies how many nearest neighbors take part in the vote. In two dimensions this can be pictured as a circle drawn around the unknown point that is just large enough to contain its K nearest neighbors. An SVM, on the other hand, tries to classify points by plotting them in a space whose number of dimensions is given by the structure of the data and separating the classes with a hyperplane. In 2-D the points would be plotted on an XY axis and the hyperplane would simply be a line. This works on data that can easily be classified in binary terms or is linearly separable in some way. 

SVM worked pretty well on both Newsgroups and Reuters. After hyperparameter tuning, KNN returned 83 percent accuracy on Newsgroups (up from 79 percent) and 77 percent on Reuters (up from 75 percent). The former is a noticeable improvement, although still below the SVM's 92 percent, and KNN remains more computationally efficient. 

# Tuning Hyperparameters for Each Model on Each Dataset 

# Hyperparameter Search

This section finds the best parameters for each model and then shows what they are. I ran a grid search for the probabilistic version of the SVC classifier and for K-Neighbors, so that both a probabilistic and a non-probabilistic classifier are represented in the hyperparameter search and each of the two pipelines is followed all the way through. Based on the trials above, the accuracy of the linear SVC is very similar whether it is probabilistic or non-probabilistic. 
The hyperparameters that worked best for the SVC (linear, probabilistic) on the Reuters data set were {'C': 2, 'kernel': 'linear'}. Due to the computational expense of running this classifier I chose to look at only a few parameters. Running it on the entire Reuters data set took approximately 12 hours. 
The hyperparameters that worked best for the same classifier on the Newsgroups data set were: {'C': 300, 'kernel': 'linear'}

This took considerably less time, but there seem to be memory issues with this particular classifier, and running a grid search with it on a large dataset is very computationally expensive. 


# Hyperparameter Search for Probabilistic Version of SVC on both Newsgroups and Reuters Datasets 



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Candidate settings: linear kernel only, with three values of the penalty parameter C.
svmgrid = [dict(kernel=['linear'], C=[1, 2, 300])]
# Base estimator: linear SVC with probability estimates switched on.
svm = SVC(kernel='linear', probability=True)
# Candidates are ranked by cross-validated accuracy; nothing is fitted in this cell.
SVC_hyperparameter_clsfr = GridSearchCV(estimator=svm, param_grid=svmgrid, scoring='accuracy')

In [None]:
# Run the cross-validated grid search for the SVC on the vectorised Reuters training split.
SVC_hyperparameter_clsfr.fit(X_reuters_train,y_reuters_train)




GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=True, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [1, 2, 300], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [None]:
# Show the parameter combination that the Reuters SVC grid search selected.
print(SVC_hyperparameter_clsfr.best_params_)

{'C': 2, 'kernel': 'linear'}


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Same candidate settings as the Reuters SVC search: linear kernel, three values of C.
svcgrid = [dict(kernel=['linear'], C=[1, 2, 300])]

# verbose=1 prints progress while the search runs; n_jobs=-1 uses every available core.
# NOTE(review): the base estimator here is a default SVC(), i.e. without
# probability=True, unlike the Reuters search -- confirm that is intended.
svmgridsearch = GridSearchCV(estimator=SVC(), param_grid=svcgrid, verbose=1, n_jobs=-1)

# fit() returns the fitted search object itself, so both names refer to it.
gridsearchsvmresults_news = svmgridsearch.fit(X_news_train, y_news_train)
print(gridsearchsvmresults_news.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   34.6s finished


{'C': 300, 'kernel': 'linear'}


I first ran the grid search on the full Reuters dataset, and I estimate it took about 12 hours. I ran it before I knew you could include the verbose setting in the grid search, and it hadn't occurred to me to include a timer for comparison.
However, after I changed the grid search settings for the Newsgroups dataset (same hyperparameters), it finished much more quickly. Once again, that is most likely due to the n_jobs = -1 setting, as the computer can use more processing power when it uses all available processors for the task. 



# Hyperparameter search for K Neighbors Classifier on both Newsgroups and Reuters (Non-probabilistic classifier) 

For this grid search, I chose to tune n_neighbors, which I kept to odd numbers: with an even number of neighbors and weights set to 'uniform', a tie vote would leave the classification of a given element to chance. 
I also chose to see whether it was better to use uniform weights or to weight each neighbor with respect to its distance. Additionally, the distance can be measured as either Manhattan or Euclidean distance, and I included both in the search. When I ran this classifier's grid search I saw how useful including verbose in the parameters is, as it updates the output while the grid search is running. When I ran the first instance of the SVM hyperparameter search, I often didn't know which stage of the process it had reached. Because it took 12 hours, it was difficult to know what was happening when, and whether or not things were working as they should. 
I set n_jobs to -1 so that the computer will use all of the processors to fit the model, which also speeds up the process quite a bit compared with my first attempt. Using this setting for the first grid search would probably have significantly sped up my processing time, given that both the SVM and KNN classifiers appear to be slow when fed large datasets. I ran the KNN classifier on the full Reuters data set with these 16 candidate combinations (80 fits with 5-fold cross-validation) and the hyperparameters were tuned in 19.9 minutes, as opposed to the extremely long 12 hours of the SVC classifier grid search. It is perhaps the case that the SVC classifier is much slower than K-Neighbors, and perhaps also that it was slower because it was set to probabilistic, which adds work when fitting. 

The hyperparameter setting for Reuters in the KNN classifier after the grid search was {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}

The hyperparameter setting for Newsgroups in the KNN classifier after the grid search was {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}


In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for KNN: 4 neighbourhood sizes x 2 weighting schemes x 2 metrics.
knngrid = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
# verbose=1 prints progress while the search runs; n_jobs=-1 uses every available core.
knngridsearch = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knngrid, verbose=1, n_jobs=-1)

# fit() returns the fitted search object itself, so both names refer to it.
gridsearchknnresults_reuters = knngridsearch.fit(X_reuters_train, y_reuters_train)
print(gridsearchknnresults_reuters.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.1min


{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 19.9min finished


In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for KNN: 4 neighbourhood sizes x 2 weighting schemes x 2 metrics.
knngrid = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
# A fresh search object is created for Newsgroups; verbose=1 prints progress,
# n_jobs=-1 uses every available core.
knngridsearch = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knngrid, verbose=1, n_jobs=-1)

# fit() returns the fitted search object itself, so both names refer to it.
gridsearchknnresults_news = knngridsearch.fit(X_news_train, y_news_train)
print(gridsearchknnresults_news.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.6min


{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 29.4min finished


If we then use the best hyperparameters for each model as they were determined for each dataset, we should get better results. To test this, I fit the model again in order to compare the results of the model with the arbitrarily chosen parameters I tried at first against the parameters returned from the grid search. The results follow: 

In [None]:
from sklearn.metrics import classification_report
# Predict with the fitted Newsgroups grid search object and report per-class metrics.
news_predictions_knn_tuned = gridsearchknnresults_news.predict(X_news_test)
print(classification_report(y_news_test, news_predictions_knn_tuned))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       165
           1       0.75      0.72      0.73       211
           2       0.72      0.69      0.70       193
           3       0.75      0.78      0.76       203
           4       0.75      0.77      0.76       177
           5       0.84      0.79      0.81       206
           6       0.68      0.58      0.63       190
           7       0.86      0.84      0.85       180
           8       0.91      0.91      0.91       201
           9       0.83      0.92      0.87       170
          10       0.84      0.95      0.89       198
          11       0.89      0.95      0.92       189
          12       0.87      0.80      0.83       233
          13       0.90      0.85      0.87       191
          14       0.89      0.91      0.90       208
          15       0.90      0.88      0.89       210
          16       0.86      0.87      0.87       156
          17       0.92    

In [None]:
from sklearn.model_selection import GridSearchCV
# This cell repeats the Reuters KNN grid search that was already run earlier
# in the notebook, with the same search space and settings.
knngrid = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)
# verbose=1 prints progress while the search runs; n_jobs=-1 uses every available core.
knngridsearch = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knngrid, verbose=1, n_jobs=-1)

# fit() returns the fitted search object itself, so both names refer to it.
gridsearchknnresults_reuters = knngridsearch.fit(X_reuters_train, y_reuters_train)
print(gridsearchknnresults_reuters.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.1min


{'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 19.8min finished


In [None]:
from sklearn.metrics import classification_report
# Predict with the fitted Reuters grid search object and report per-class metrics.
reuters_predictions_knn_tuned = gridsearchknnresults_reuters.predict(X_reuters_test)
# Rare topic combinations are never predicted; zero_division=0 keeps their
# scores at 0.00 (unchanged) but suppresses the UndefinedMetricWarning noise.
print(classification_report(y_reuters_test, reuters_predictions_knn_tuned, zero_division=0))

                                                                                                                      precision    recall  f1-score   support

                                                                                                                   0       0.77      0.92      0.84      2053
                                                                                                                 acq       0.81      0.48      0.61       482
                                                                                                   acq crude nat-gas       0.00      0.00      0.00         1
                                                                                                            acq earn       0.00      0.00      0.00         1
                                                                                                            acq gold       0.00      0.00      0.00         1
                                                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Newsgroups SVC Probablistic Optimized Hyperparameters
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# Refit the probabilistic linear SVC on Newsgroups with C=300, the value the
# Newsgroups grid search selected, and report per-class metrics on the test split.
svm = SVC(C=300, kernel='linear', probability=True).fit(X_news_train, y_news_train)
news_predictions_tuned = svm.predict(X_news_test)
print(classification_report(y_news_test, news_predictions_tuned))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92       171
           1       0.79      0.87      0.82       179
           2       0.89      0.88      0.89       198
           3       0.78      0.83      0.80       189
           4       0.92      0.87      0.90       182
           5       0.86      0.85      0.86       184
           6       0.84      0.87      0.86       199
           7       0.96      0.93      0.94       210
           8       0.98      0.96      0.97       187
           9       0.96      0.97      0.97       215
          10       0.98      0.99      0.98       207
          11       0.98      0.93      0.96       205
          12       0.84      0.92      0.88       196
          13       0.94      0.94      0.94       203
          14       0.97      0.96      0.96       187
          15       0.98      0.93      0.95       215
          16       0.94      0.96      0.95       166
          17       0.99    

For comparison, the initial KNN model on Newsgroups achieved 79 percent accuracy, a 78 percent macro average and a 79 percent weighted average. When I used the grid search hyperparameters, the model achieved 83 percent accuracy, an 84 percent macro average and an 83 percent weighted average. 

For Reuters, without hyperparameter tuning the KNN model achieved 75% accuracy, a 16% macro average and a 72% weighted average. 
With the hyperparameters from the grid search, the model achieved 77% accuracy, a 20% macro average and a 75% weighted average. 
