## Artificial Neural Networks and Text Data

In [1]:
from sklearn.datasets import fetch_20newsgroups

# select categories and load the training and test data

categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med', 'sci.space',
              'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey']

# load training and test data
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

### Import and download NLTK (natural language toolkit for more expanded libraries)

In [2]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# convert to vectors of word counts

# import and use CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
#count_vect = CountVectorizer(stop_words='english')

X_train_counts = count_vect.fit_transform(twenty_train.data)
X_test_counts = count_vect.transform(twenty_test.data)

### Generate label vectors for training and multiclass ANN

In [4]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(twenty_train.target)
train_vTarget=lb.transform(twenty_train.target)
test_vTarget=lb.transform(twenty_test.target)

In [5]:
twenty_train.target

array([6, 9, 2, ..., 1, 7, 6])

In [6]:
train_vTarget

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Run ANN (discuss parameters)

In [7]:
from sklearn.neural_network import MLPClassifier
import numpy as np

clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_counts, train_vTarget)
print(clf.classes_)
print(clf.n_outputs_)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_counts)

print(predicted[1])
print('-----------')
print(test_vTarget[1])
predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 


[0 1 2 3 4 5 6 7 8 9]
10
[0 0 0 0 1 0 0 0 0 0]
-----------
[0 0 0 0 1 0 0 0 0 0]
0.8526831785345718




In [8]:
from sklearn import metrics

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.8526831785345718
                        precision    recall  f1-score   support

           alt.atheism       0.38      0.92      0.54       319
         comp.graphics       0.94      0.84      0.89       389
          misc.forsale       0.94      0.88      0.91       390
             rec.autos       0.97      0.82      0.89       396
       rec.motorcycles       1.00      0.89      0.94       398
    rec.sport.baseball       0.98      0.86      0.92       397
      rec.sport.hockey       0.99      0.95      0.97       399
               sci.med       0.97      0.67      0.79       396
             sci.space       1.00      0.80      0.89       394
soc.religion.christian       0.95      0.91      0.93       398

              accuracy                           0.85      3876
             macro avg       0.91      0.85      0.87      3876
          weighted avg       0.92      0.85      0.87      3876

[[293   1   0   0   0   1   0   5   0  19]
 [ 59 325   3   0   0   1   0   0   1  

### Different architecture

In [9]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(25,), random_state=1)
clf.fit(X_train_counts, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_counts)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 


0.9045407636738906




In [10]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.9045407636738906
                        precision    recall  f1-score   support

           alt.atheism       0.56      0.88      0.69       319
         comp.graphics       0.93      0.88      0.90       389
          misc.forsale       0.92      0.93      0.93       390
             rec.autos       0.96      0.89      0.92       396
       rec.motorcycles       1.00      0.93      0.96       398
    rec.sport.baseball       0.97      0.93      0.95       397
      rec.sport.hockey       0.97      0.97      0.97       399
               sci.med       0.96      0.81      0.88       396
             sci.space       0.97      0.89      0.93       394
soc.religion.christian       0.92      0.94      0.93       398

              accuracy                           0.90      3876
             macro avg       0.92      0.90      0.91      3876
          weighted avg       0.92      0.90      0.91      3876

[[280   1   0   0   0   1   0   7   3  27]
 [ 27 341   8   3   0   4   2   0   4  

### Stop words

In [11]:
count_vect_sw = CountVectorizer(stop_words='english')

X_train_sw_counts = count_vect_sw.fit_transform(twenty_train.data)
X_test_sw_counts = count_vect_sw.transform(twenty_test.data)

In [12]:

clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_sw_counts, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_sw_counts)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 


0.847265221878225




In [13]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.847265221878225
                        precision    recall  f1-score   support

           alt.atheism       0.36      0.91      0.52       319
         comp.graphics       0.96      0.79      0.87       389
          misc.forsale       0.97      0.83      0.89       390
             rec.autos       0.98      0.82      0.89       396
       rec.motorcycles       0.99      0.92      0.95       398
    rec.sport.baseball       0.98      0.86      0.92       397
      rec.sport.hockey       0.99      0.91      0.95       399
               sci.med       0.98      0.70      0.82       396
             sci.space       0.98      0.81      0.89       394
soc.religion.christian       0.94      0.93      0.93       398

              accuracy                           0.85      3876
             macro avg       0.91      0.85      0.86      3876
          weighted avg       0.92      0.85      0.87      3876

[[290   1   0   0   0   1   0   4   1  22]
 [ 75 309   1   0   0   1   0   0   2   

### Stemming

In [14]:
from nltk.stem.snowball import SnowballStemmer
#stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmer = SnowballStemmer("english")
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
#stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
stemmed_count_vect = StemmedCountVectorizer()
X_train_stem_counts = stemmed_count_vect.fit_transform(twenty_train.data)
X_test_stem_counts = stemmed_count_vect.transform(twenty_test.data)

In [15]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_stem_counts, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_stem_counts)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

0.8606811145510835




In [16]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.8606811145510835
                        precision    recall  f1-score   support

           alt.atheism       0.40      0.91      0.55       319
         comp.graphics       0.94      0.84      0.89       389
          misc.forsale       0.94      0.86      0.90       390
             rec.autos       0.97      0.86      0.91       396
       rec.motorcycles       1.00      0.89      0.94       398
    rec.sport.baseball       0.99      0.87      0.93       397
      rec.sport.hockey       0.99      0.92      0.95       399
               sci.med       0.97      0.73      0.83       396
             sci.space       0.98      0.83      0.90       394
soc.religion.christian       0.95      0.90      0.92       398

              accuracy                           0.86      3876
             macro avg       0.91      0.86      0.87      3876
          weighted avg       0.92      0.86      0.88      3876

[[291   0   0   0   0   1   0   5   3  19]
 [ 50 328   6   0   0   0   1   0   3  

### Convert the data to a TF-IDF representation (Note change to max_iter)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# clf_2 = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [19]:
clf = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)
clf.fit(X_train_tfidf, train_vTarget)

# make predictions on test data
clf.out_activation_ = 'softmax'
predicted = clf.predict(X_test_tfidf)

predicted = lb.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted == twenty_test.target)) 

0.8771929824561403




In [20]:
# print accuracy
print (np.mean(predicted == twenty_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(twenty_test.target, predicted,
    target_names=twenty_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(twenty_test.target, predicted))

0.8771929824561403
                        precision    recall  f1-score   support

           alt.atheism       0.43      0.90      0.58       319
         comp.graphics       0.95      0.86      0.90       389
          misc.forsale       0.96      0.89      0.92       390
             rec.autos       0.97      0.86      0.91       396
       rec.motorcycles       0.99      0.89      0.94       398
    rec.sport.baseball       0.98      0.91      0.94       397
      rec.sport.hockey       0.98      0.96      0.97       399
               sci.med       0.97      0.73      0.83       396
             sci.space       0.99      0.85      0.91       394
soc.religion.christian       0.94      0.93      0.94       398

              accuracy                           0.88      3876
             macro avg       0.92      0.88      0.89      3876
          weighted avg       0.93      0.88      0.89      3876

[[287   0   0   0   1   1   0   4   2  24]
 [ 52 334   2   0   0   1   0   0   0  

## Automate the search for a good ANN model on just the comp.* subset of newsgroups

### Make the following choices sequentially (1) hidden layer sizes, (2) include or ignore stopwords, (3) count vectors vs tfidf vectors, and then (4) stemming or not. I suggest using max_iter of at least 100 (default is 200)

In [2]:
from sklearn.datasets import fetch_20newsgroups
cats = ['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x']

our_train = fetch_20newsgroups(subset='train', categories=cats, shuffle=True, random_state=42)
our_train.target[:25]
for p in our_train.target[:25]:
    target_id = our_train.target[p]
    print(target_id, our_train.target_names[target_id])

# print the first article
print("\n", our_train.data[:1])

2 comp.sys.ibm.pc.hardware
1 comp.os.ms-windows.misc
3 comp.sys.mac.hardware
3 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
4 comp.windows.x
3 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
1 comp.os.ms-windows.misc
4 comp.windows.x
4 comp.windows.x
1 comp.os.ms-windows.misc
1 comp.os.ms-windows.misc
2 comp.sys.ibm.pc.hardware
2 comp.sys.ibm.pc.hardware
1 comp.os.ms-windows.misc
2 comp.sys.ibm.pc.hardware
2 comp.sys.ibm.pc.hardware
1 comp.os.ms-windows.misc
2 comp.sys.ibm.pc.hardware
3 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 comp.sys.ibm.pc.hardware
2 comp.sys.ibm.pc.hardware
4 comp.windows.x

 ["From: lemons@cadsys.enet.dec.com\nSubject: Xremote into X11R6?\nReply-To: lemons@cadsys.enet.dec.com ()\nOrganization: Digital Equipment Corporation\nLines: 12\nX-Newsreader: mxrn 6.18\n\n\nHi!\n\nI remember reading (or hallucinating) that NCD's PC-Xremote functionality had \nbeen given, by NCD, to MIT for inclusion in X11R6.  Is this true?  If so,\n(set mode/cheap) can I just wa

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

ct_vec = CountVectorizer(stop_words='english')
X_Train_counts = ct_vec.fit_transform(our_train.data)

ct_vec.vocabulary_.get(u'algorithm')


15455

In [4]:
our_test = fetch_20newsgroups(subset='test', categories=cats, shuffle=True, random_state=42)

# note that I am using .transform instead of .fit_transform. this keeps the columns the same as the training set
X_Test_counts = ct_vec.transform(our_test.data)

print("\n", our_test.data[:1])


 ["From: lonewolf@muse.Corp.Sun.COM (Peter Pak)\nSubject: Re: 386 Motherboard advice needed\nOrganization: Sun Microsystems\nLines: 12\nDistribution: world\nReply-To: lonewolf@muse.Corp.Sun.COM\nNNTP-Posting-Host: muse.corp.sun.com\n\nMaybe I should have been clearer.  I have a Intel 386DX/25 that I would\nlike to use to put together a system however all the motherboards that\nthe local vendors are now selling are running either at 33 or 40 MHz.  I\nguess I can cross my fingers and hope the CPU runs at that speed. ;^)\n\nI think I'll take Mark's advice and see if any of the boards have\na socketed oscillator and head down to the local electronics store...\n\nThanks for the info...\n\n=B^)\n\n"]


In [5]:
from sklearn import preprocessing
lb_1 = preprocessing.LabelBinarizer()
lb_1.fit(our_train.target)
train_vTarget_1=lb_1.transform(our_train.target)
test_vTarget_1=lb_1.transform(our_test.target)

## SELECTING BEST HIDDEN LAYER SIZE USING GRIDSEARCHCV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
# finish the function below
gs_params = {"hidden_layer_sizes": [110,125,150]  
             }
est = MLPClassifier()
gs_clf = GridSearchCV(estimator=est, param_grid=gs_params)
gs_results = gs_clf.fit(X_Train_counts, our_train.target)

hl_size = gs_results.best_params_
print(hl_size)
# print(gs_results.best_params_)

print(gs_results.best_estimator_)

best_clf = gs_results.best_estimator_
gs_predicted = gs_clf.predict(X_Test_counts)

print (np.mean(gs_predicted == our_test.target))


print(metrics.classification_report(our_test.target, gs_predicted,
    target_names=our_test.target_names))


print(metrics.confusion_matrix(our_test.target, gs_predicted))



## USING COUNT VECTORS

In [None]:
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn import metrics

clf_3 = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf_3.fit(X_Train_counts, train_vTarget_1)
print(clf_3.classes_)
print(clf_3.n_outputs_)

# make predictions on test data
clf_3.out_activation_ = 'softmax'
predicted_3 = clf_3.predict(X_Test_counts)

print(predicted_3[1])
print('-----------')
print(test_vTarget_1[1])
predicted_3 = lb_1.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted_3 == our_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_3,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_3))

## USING TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

X_Train_tfidf = tfidf_transformer.fit_transform(X_Train_counts)

# clf_2 = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

X_Test_tfidf = tfidf_transformer.transform(X_Test_counts)

clf_4 = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf.fit(X_Train_tfidf, train_vTarget_1)

# make predictions on test data
clf_4.out_activation_ = 'softmax'
predicted_4 = clf.predict(X_Test_tfidf)

predicted_4 = lb_1.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted_4 == our_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_4,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_4))


## WITH STOP WORDS

In [None]:
ct_vect_sw = CountVectorizer(stop_words='english')

X_Train_sw_counts = ct_vect_sw.fit_transform(our_train.data)
X_Test_sw_counts = ct_vect_sw.transform(our_test.data)

clf_5 = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf_5.fit(X_Train_sw_counts, train_vTarget_1)

# make predictions on test data
clf_5.out_activation_ = 'softmax'
predicted_5 = clf_5.predict(X_Test_sw_counts)

predicted_5 = lb_1.inverse_transform(predicted_5)

# print accuracy
print (np.mean(predicted_5 == our_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_5,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_5))

## WITH STEMMING

In [None]:
import nltk
nltk.download()

In [None]:
from nltk.stem.snowball import SnowballStemmer
#stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmer = SnowballStemmer("english")
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
#stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
stemmed_ct_vect = StemmedCountVectorizer()
X_Train_stem_counts = stemmed_ct_vect.fit_transform(our_train.data)
X_Test_stem_counts = stemmed_ct_vect.transform(our_test.data)


clf_6 = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf_6.fit(X_Train_stem_counts, train_vTarget_1)

# make predictions on test data
clf_6.out_activation_ = 'softmax'
predicted_6 = clf_6.predict(X_Test_stem_counts)

predicted_6 = lb_1.inverse_transform(predicted_6)

# print accuracy
print (np.mean(predicted_6 == our_test.target)) 


# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_6,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_6))

## ONE SINGLE PIECE OF CODE FOR ONE-CLICK RUN

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
nltk.download()
from nltk.stem.snowball import SnowballStemmer

cats = ['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x']

our_train = fetch_20newsgroups(subset='train', categories=cats, shuffle=True, random_state=42)
our_train.target[:25]
for p in our_train.target[:25]:
    target_id = our_train.target[p]
    print(target_id, our_train.target_names[target_id])

# print the first article
print("\n", our_train.data[:1])

ct_vec = CountVectorizer(stop_words='english')
X_Train_counts = ct_vec.fit_transform(our_train.data)

ct_vec.vocabulary_.get(u'algorithm')

our_test = fetch_20newsgroups(subset='test', categories=cats, shuffle=True, random_state=42)

# note that I am using .transform instead of .fit_transform. this keeps the columns the same as the training set
X_Test_counts = ct_vec.transform(our_test.data)

print("\n", our_test.data[:1])

lb_1 = preprocessing.LabelBinarizer()
lb_1.fit(our_train.target)
train_vTarget_1=lb_1.transform(our_train.target)
test_vTarget_1=lb_1.transform(our_test.target)

# finish the function below
gs_params = {"hidden_layer_sizes": [110,125,150]  
             }
est = MLPClassifier()
gs_clf = GridSearchCV(estimator=est, param_grid=gs_params)
gs_results = gs_clf.fit(X_Train_counts, our_train.target)

hl_size = gs_results.best_params_
print(hl_size)
# print(gs_results.best_params_)

print(gs_results.best_estimator_)

best_clf = gs_results.best_estimator_
gs_predicted = gs_clf.predict(X_Test_counts)

print (np.mean(gs_predicted == our_test.target))


print(metrics.classification_report(our_test.target, gs_predicted,
    target_names=our_test.target_names))


print(metrics.confusion_matrix(our_test.target, gs_predicted))

clf_3 = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf_3.fit(X_Train_counts, train_vTarget_1)
print(clf_3.classes_)
print(clf_3.n_outputs_)

# make predictions on test data
clf_3.out_activation_ = 'softmax'
predicted_3 = clf_3.predict(X_Test_counts)

print(predicted_3[1])
print('-----------')
print(test_vTarget_1[1])
predicted_3 = lb_1.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted_3 == our_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_3,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_3))

tfidf_transformer = TfidfTransformer()

X_Train_tfidf = tfidf_transformer.fit_transform(X_Train_counts)

# clf_2 = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

X_Test_tfidf = tfidf_transformer.transform(X_Test_counts)

clf_4 = MLPClassifier(activation='logistic', solver='adam', max_iter=100, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf.fit(X_Train_tfidf, train_vTarget_1)

# make predictions on test data
clf_4.out_activation_ = 'softmax'
predicted_4 = clf.predict(X_Test_tfidf)

predicted_4 = lb_1.inverse_transform(predicted)

# print accuracy
print (np.mean(predicted_4 == our_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_4,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_4))

ct_vect_sw = CountVectorizer(stop_words='english')

X_Train_sw_counts = ct_vect_sw.fit_transform(our_train.data)
X_Test_sw_counts = ct_vect_sw.transform(our_test.data)

clf_5 = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf_5.fit(X_Train_sw_counts, train_vTarget_1)

# make predictions on test data
clf_5.out_activation_ = 'softmax'
predicted_5 = clf_5.predict(X_Test_sw_counts)

predicted_5 = lb_1.inverse_transform(predicted_5)

# print accuracy
print (np.mean(predicted_5 == our_test.target)) 

# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_5,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_5))

#stemmer = SnowballStemmer("english", ignore_stopwords=True)
stemmer = SnowballStemmer("english")
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
#stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
stemmed_ct_vect = StemmedCountVectorizer()
X_Train_stem_counts = stemmed_ct_vect.fit_transform(our_train.data)
X_Test_stem_counts = stemmed_ct_vect.transform(our_test.data)


clf_6 = MLPClassifier(activation='logistic', solver='adam', max_iter=50, alpha=1e-5, hidden_layer_sizes=(hl_size,), random_state=1)
clf_6.fit(X_Train_stem_counts, train_vTarget_1)

# make predictions on test data
clf_6.out_activation_ = 'softmax'
predicted_6 = clf_6.predict(X_Test_stem_counts)

predicted_6 = lb_1.inverse_transform(predicted_6)

# print accuracy
print (np.mean(predicted_6 == our_test.target)) 


# print precision and recall statistics
print(metrics.classification_report(our_test.target, predicted_6,
    target_names=our_test.target_names))

# print confusion matrix
print(metrics.confusion_matrix(our_test.target, predicted_6))