## Artificial Neural Networks and Text Data

Gabriel Kiprono
Lewis William

In [1]:
from sklearn.datasets import fetch_20newsgroups

# select categories and load the training and test data

categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med', 'sci.space',
              'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

# load training and test data
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

### Import and download NLTK (natural language toolkit for more expanded libraries)

In [2]:
import nltk
# nltk.download()

In [4]:
# convert to vectors of word counts

# import and use CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
#count_vect = CountVectorizer(stop_words='english')

X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)

### Generate label vectors for training and multiclass ANN

In [5]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(twenty_train.target)
train_vTarget=lb.transform(twenty_train.target)
test_vTarget=lb.transform(twenty_test.target)

In [6]:
twenty_train.target

array([6, 9, 2, ..., 1, 7, 6], dtype=int64)

In [7]:
train_vTarget

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Run ANN (discuss parameters)

In [8]:
from sklearn.neural_network import MLPClassifier
import numpy as np

clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_counts, train_vTarget)
print(clf.classes_)
print(clf.n_outputs_)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_counts)

print(predicted[1])
print('-----------')
print(test_vTarget[1])
predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 


[0 1 2 3 4 5 6 7 8 9]
10
[0 0 0 0 1 0 0 0 0 0]
-----------
[0 0 0 0 1 0 0 0 0 0]
0.8526831785345718




In [9]:
from sklearn import metrics

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.8526831785345718
                        precision    recall  f1-score   support

           alt.atheism       0.38      0.92      0.54       319
         comp.graphics       0.94      0.84      0.89       389
          misc.forsale       0.94      0.88      0.91       390
             rec.autos       0.97      0.82      0.89       396
       rec.motorcycles       1.00      0.89      0.94       398
    rec.sport.baseball       0.98      0.86      0.92       397
      rec.sport.hockey       0.99      0.95      0.97       399
               sci.med       0.97      0.67      0.79       396
             sci.space       1.00      0.80      0.89       394
soc.religion.christian       0.95      0.91      0.93       398

              accuracy                           0.85      3876
             macro avg       0.91      0.85      0.87      3876
          weighted avg       0.92      0.85      0.87      3876

[[293   1   0   0   0   1   0   5   0  19]
 [ 59 325   3   0   0   1   0   0   1  

### Different architecture

In [10]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(25,), random_state=1)
clf.fit(X_train_counts, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_counts)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 


0.9045407636738906




In [11]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.9045407636738906
                        precision    recall  f1-score   support

           alt.atheism       0.56      0.88      0.69       319
         comp.graphics       0.93      0.88      0.90       389
          misc.forsale       0.92      0.93      0.93       390
             rec.autos       0.96      0.89      0.92       396
       rec.motorcycles       1.00      0.93      0.96       398
    rec.sport.baseball       0.97      0.93      0.95       397
      rec.sport.hockey       0.97      0.97      0.97       399
               sci.med       0.96      0.81      0.88       396
             sci.space       0.97      0.89      0.93       394
soc.religion.christian       0.92      0.94      0.93       398

              accuracy                           0.90      3876
             macro avg       0.92      0.90      0.91      3876
          weighted avg       0.92      0.90      0.91      3876

[[280   1   0   0   0   1   0   7   3  27]
 [ 27 341   8   3   0   4   2   0   4  

### Stop words

In [12]:
count_vect_sw = CountVectorizer(stop_words='english')

X_train_sw_counts = count_vect_sw.fit_transform(twenty_train.data)
X_test_sw_counts = count_vect_sw.transform(twenty_test.data)

In [13]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_sw_counts, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_sw_counts)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 


0.847265221878225




In [14]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.847265221878225
                        precision    recall  f1-score   support

           alt.atheism       0.36      0.91      0.52       319
         comp.graphics       0.96      0.79      0.87       389
          misc.forsale       0.97      0.83      0.89       390
             rec.autos       0.98      0.82      0.89       396
       rec.motorcycles       0.99      0.92      0.95       398
    rec.sport.baseball       0.98      0.86      0.92       397
      rec.sport.hockey       0.99      0.91      0.95       399
               sci.med       0.98      0.70      0.82       396
             sci.space       0.98      0.81      0.89       394
soc.religion.christian       0.94      0.93      0.93       398

              accuracy                           0.85      3876
             macro avg       0.91      0.85      0.86      3876
          weighted avg       0.92      0.85      0.87      3876

[[290   1   0   0   0   1   0   4   1  22]
 [ 75 309   1   0   0   1   0   0   2   

### Stemming

In [15]:
from nltk.stem.snowball import SnowballStemmer
#stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmer = SnowballStemmer("english")
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
#stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
stemmed_count_vect = StemmedCountVectorizer()
X_train_stem_counts = stemmed_count_vect.fit_transform(twenty_train.data)
X_test_stem_counts = stemmed_count_vect.transform(twenty_test.data)

In [16]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_stem_counts, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_stem_counts)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

0.8606811145510835




In [17]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.8606811145510835
                        precision    recall  f1-score   support

           alt.atheism       0.40      0.91      0.55       319
         comp.graphics       0.94      0.84      0.89       389
          misc.forsale       0.94      0.86      0.90       390
             rec.autos       0.97      0.86      0.91       396
       rec.motorcycles       1.00      0.89      0.94       398
    rec.sport.baseball       0.99      0.87      0.93       397
      rec.sport.hockey       0.99      0.92      0.95       399
               sci.med       0.97      0.73      0.83       396
             sci.space       0.98      0.83      0.90       394
soc.religion.christian       0.95      0.90      0.92       398

              accuracy                           0.86      3876
             macro avg       0.91      0.86      0.87      3876
          weighted avg       0.92      0.86      0.88      3876

[[291   0   0   0   0   1   0   5   3  19]
 [ 50 328   6   0   0   0   1   0   3  

### Convert the data to a TF-IDF representation (Note change to max_iter)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# clf_2 = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [19]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_tfidf, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_tfidf)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

0.8771929824561403




In [20]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.8771929824561403
                        precision    recall  f1-score   support

           alt.atheism       0.43      0.90      0.58       319
         comp.graphics       0.95      0.86      0.90       389
          misc.forsale       0.96      0.89      0.92       390
             rec.autos       0.97      0.86      0.91       396
       rec.motorcycles       0.99      0.89      0.94       398
    rec.sport.baseball       0.98      0.91      0.94       397
      rec.sport.hockey       0.98      0.96      0.97       399
               sci.med       0.97      0.73      0.83       396
             sci.space       0.99      0.85      0.91       394
soc.religion.christian       0.94      0.93      0.94       398

              accuracy                           0.88      3876
             macro avg       0.92      0.88      0.89      3876
          weighted avg       0.93      0.88      0.89      3876

[[287   0   0   0   1   1   0   4   2  24]
 [ 52 334   2   0   0   1   0   0   0  

## Automate the search for a good ANN model on just the comp.* subset of newsgroups

### Make the following choices sequentially (1) hidden layer sizes, (2) include or ignore stopwords, (3) count vectors vs tfidf vectors, and then (4) stemming or not. I suggest using max_iter of at least 100 (default is 200)

- gridsearch for hidden layer sizes
  -   save into variable
- try with and without stopwords and keep the best performing in a variable
- try with count vectors, tfidf and keep the best performing in a variable
- try with stemming and without and keep best performing in a variable
- max_iter >= 100
- don't hard code best values

In [21]:
# categories com.*
comp_categories = ['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x']
# load training data
twenty_train = fetch_20newsgroups(subset='train', categories=comp_categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', categories=comp_categories, shuffle=True, random_state=42)

In [22]:
# fit_transform
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)

lb = preprocessing.LabelBinarizer()
lb.fit(twenty_train.target)
train_vTarget=lb.transform(twenty_train.target)
test_vTarget=lb.transform(twenty_test.target)

clfa = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes= (15,), random_state=1)
clfb = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes= (30,), random_state=1)
clfc = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes= (55,), random_state=1)



In [23]:
def bestClf(a,b,c):
    a.fit(X_train_counts, train_vTarget)
    b.fit(X_train_counts, train_vTarget)
    c.fit(X_train_counts, train_vTarget)
    
    a.out_activation_ = 'softmax'
    b.out_activation_ = 'softmax'
    c.out_activation_ = 'softmax'
    
    predicted_a = a.predict(X_test_counts)
    predicted_b = b.predict(X_test_counts)
    predicted_c = c.predict(X_test_counts)
    
    predicted_a = lb.inverse_transform(predicted_a)
    predicted_b = lb.inverse_transform(predicted_b)
    predicted_c= lb.inverse_transform(predicted_c)

    
    acc_a = np.mean(predicted_a == twenty_test.target)
    acc_b = np.mean(predicted_b == twenty_test.target)
    acc_c = np.mean(predicted_c == twenty_test.target)
    best_acc = max(acc_a, acc_b, acc_c)
    best_hls = a
#     print(acc_a,acc_b,acc_c)
    if (best_acc == acc_b):
        best_acc = acc_b
        best_hls = b
#         print(best_acc,best_hls)
    if (best_acc == acc_c):
        best_acc = acc_c
        best_hls = c
#         print(best_acc,best_hls)
    else:
        best_acc = acc_a
        best_hls = a
#         print(best_acc,best_hls)
    return(best_hls)
best_clf = bestClf(clfa,clfb,clfc)
print(best_clf)



MLPClassifier(activation='logistic', alpha=1e-05, hidden_layer_sizes=(55,),
              max_iter=100, random_state=1)




In [24]:
best_predict = best_clf.predict(X_test_counts)
best_predict = lb.inverse_transform(best_predict)

In [25]:
# print accuracy
print (np.mean(best_predict == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, best_predict,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, best_predict))

0.7979539641943734
                          precision    recall  f1-score   support

           comp.graphics       0.70      0.84      0.76       389
 comp.os.ms-windows.misc       0.80      0.73      0.76       394
comp.sys.ibm.pc.hardware       0.80      0.79      0.80       392
   comp.sys.mac.hardware       0.84      0.86      0.85       385
          comp.windows.x       0.89      0.78      0.83       395

                accuracy                           0.80      1955
               macro avg       0.80      0.80      0.80      1955
            weighted avg       0.80      0.80      0.80      1955

[[325  17   7  18  22]
 [ 46 286  37  12  13]
 [ 19  29 311  29   4]
 [ 22   7  25 330   1]
 [ 55  20   7   5 308]]


## Try Stopwords

In [26]:
count_vect_sw = CountVectorizer(stop_words='english')

X_train_sw_counts = count_vect_sw.fit_transform(twenty_train.data)
X_test_sw_counts = count_vect_sw.transform(twenty_test.data)

sw_clf = best_clf
sw_clf.fit(X_train_sw_counts, train_vTarget)

# make predictions on test data
sw_clf.out_activation_ = 'softmax'
sw_predicted = sw_clf.predict(X_test_sw_counts)

sw_predicted = lb.inverse_transform(sw_predicted)

# print accuracy
print (np.mean(sw_predicted == twenty_test.target)) 

# print accuracy
print (np.mean(sw_predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, sw_predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, sw_predicted))

0.7933503836317135
0.7933503836317135
                          precision    recall  f1-score   support

           comp.graphics       0.68      0.84      0.75       389
 comp.os.ms-windows.misc       0.78      0.72      0.75       394
comp.sys.ibm.pc.hardware       0.80      0.80      0.80       392
   comp.sys.mac.hardware       0.85      0.85      0.85       385
          comp.windows.x       0.89      0.76      0.82       395

                accuracy                           0.79      1955
               macro avg       0.80      0.79      0.79      1955
            weighted avg       0.80      0.79      0.79      1955

[[327  19   8  16  19]
 [ 52 285  35  11  11]
 [ 22  29 313  24   4]
 [ 24   7  25 327   2]
 [ 55  25  10   6 299]]




### Stopwords or Normal?

In [27]:
if (np.mean(best_predict==twenty_test.target)<= np.mean(sw_predicted==twenty_test.target)):
    best_clf2 = sw_clf
    isSw = "Stopwords provide the best results"
else:
    isSw = "Ignore stopwords for best results"
    best_clf2 = best_clf
print(np.mean(best_predict==twenty_test.target),np.mean(sw_predicted==twenty_test.target), best_clf2, isSw)

0.7979539641943734 0.7933503836317135 MLPClassifier(activation='logistic', alpha=1e-05, hidden_layer_sizes=(55,),
              max_iter=100, random_state=1) Ignore stopwords for best results


## Try TFIDF Vectors

In [28]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

tfidf_clf = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
tfidf_clf.fit(X_train_tfidf, train_vTarget)

# make predictions on test data
tfidf_clf.out_activation_ = 'softmax'
tfidf_pred = tfidf_clf.predict(X_test_tfidf)

tfidf_pred = lb.inverse_transform(tfidf_pred)

# print accuracy
print (np.mean(tfidf_pred == twenty_test.target)) 

0.7278772378516624




In [29]:
# print accuracy
print (np.mean(tfidf_pred == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, tfidf_pred,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, tfidf_pred))

0.7278772378516624
                          precision    recall  f1-score   support

           comp.graphics       0.45      0.92      0.61       389
 comp.os.ms-windows.misc       0.89      0.56      0.69       394
comp.sys.ibm.pc.hardware       0.90      0.71      0.79       392
   comp.sys.mac.hardware       0.95      0.75      0.84       385
          comp.windows.x       0.92      0.70      0.80       395

                accuracy                           0.73      1955
               macro avg       0.82      0.73      0.74      1955
            weighted avg       0.82      0.73      0.74      1955

[[358   7   5   4  15]
 [147 222  17   1   7]
 [ 95  11 277   9   0]
 [ 85   1   9 289   1]
 [107   8   1   2 277]]


## Is TFIDF better?

In [30]:
best_acc = np.mean(best_predict == twenty_test.target)
tfidf_acc = np.mean(tfidf_pred == twenty_test.target)

if (best_acc<= tfidf_acc):
    best_clf3 = tfidf_clf
else:
    best_clf3 = best_clf
print(tfidf_acc,best_acc,best_clf3)

0.7278772378516624 0.7979539641943734 MLPClassifier(activation='logistic', alpha=1e-05, hidden_layer_sizes=(55,),
              max_iter=100, random_state=1)


## Try Stemming

In [31]:
X_train_stem_counts = stemmed_count_vect.fit_transform(twenty_train.data)
X_test_stem_counts = stemmed_count_vect.transform(twenty_test.data)

stem_clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(55,), random_state=1)
stem_clf.fit(X_train_stem_counts, train_vTarget)

# make predictions on test data
stem_clf.out_activation_ = 'softmax'
stem_pred = stem_clf.predict(X_test_stem_counts)

stem_pred = lb.inverse_transform(stem_pred)

# print accuracy
print (np.mean(stem_pred == twenty_test.target)) 

0.7918158567774936




In [32]:
# print accuracy
print (np.mean(stem_pred == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, stem_pred,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, stem_pred))

0.7918158567774936
                          precision    recall  f1-score   support

           comp.graphics       0.66      0.85      0.75       389
 comp.os.ms-windows.misc       0.82      0.69      0.75       394
comp.sys.ibm.pc.hardware       0.80      0.78      0.79       392
   comp.sys.mac.hardware       0.85      0.86      0.86       385
          comp.windows.x       0.87      0.77      0.82       395

                accuracy                           0.79      1955
               macro avg       0.80      0.79      0.79      1955
            weighted avg       0.80      0.79      0.79      1955

[[332  13   8  12  24]
 [ 54 270  38  14  18]
 [ 29  26 307  29   1]
 [ 24   6  21 333   1]
 [ 61  15   8   5 306]]


## Is Stemming Better?

In [33]:
if (best_acc<= np.mean(stem_pred == twenty_test.target)):
    best_clf = stem_clf
else:
    best_clf
print(np.mean(stem_pred == twenty_test.target), best_acc, best_clf)

0.7918158567774936 0.7979539641943734 MLPClassifier(activation='logistic', alpha=1e-05, hidden_layer_sizes=(55,),
              max_iter=100, random_state=1)


# Stemming is better technique