In [1]:
%config IPCompleter.greedy=True

In [2]:
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised through the
# `warnings` module by the libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
import sklearn

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import OneHotEncoder

print('sklearn:', sklearn.__version__)

sklearn: 0.21.3


In [4]:
# Load the 20 Newsgroups posts with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

# Raw post texts.
X = dataset.data

# One-hot encode the integer class ids. The encoder needs 2-D input, hence the
# reshape of the 1-D target vector into a single column.
target_column = dataset.target.reshape(-1, 1)
label_encoder = OneHotEncoder(categories = 'auto')
y = label_encoder.fit_transform(target_column)

# Number of distinct classes; used as the width of the network's output layer.
number_of_targets = len(set(dataset.target))

In [5]:
import re
import nltk

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

print('nltk:', nltk.__version__)
print('re:', re.__version__)

nltk: 3.4.1
re: 2.2.1


In [6]:
def clean_text(dirty):
    """Normalize and re-tokenize a collection of raw documents.

    For each document: runs of whitespace/commas are collapsed to one space,
    hyphens are removed, a space is inserted before a slash that directly
    follows a word character and precedes whitespace, and the text is then
    split with nltk's ``sent_tokenize`` / ``word_tokenize`` and re-joined
    with single spaces.

    Args:
        dirty: iterable of strings, one per document.

    Returns:
        list of cleaned strings, one per input document, in the same order.
    """
    cleaned_docs = []

    for document in dirty:
        # Collapse any run of whitespace and/or commas into a single space.
        normalized = re.sub(r'[\s,]+', ' ', document)
        # Drop hyphens entirely (this joins the parts of hyphenated words).
        normalized = re.sub(r'[-]+', '', normalized)
        # "word/ " -> "word / " so the slash becomes its own token.
        normalized = re.sub(r'(\w)(\/)(\s)', r'\1 \2\3', normalized)

        # Tokenize sentence by sentence, keeping the tokens in document order.
        tokens = [
            token
            for sentence in sent_tokenize(normalized)
            for token in word_tokenize(sentence)
        ]

        # Re-join and squeeze any whitespace that survived inside tokens.
        cleaned_docs.append(re.sub(r'\s+', ' ', ' '.join(tokens)))

    return cleaned_docs
        
# NOTE: this rebinds X to its cleaned version, so re-running the cell feeds
# already-cleaned text back through clean_text. Re-run the dataset loading
# cell first if this cell has to be executed again.
X = clean_text(X)

In [7]:
# Hold out 30% of the documents for evaluation; the fixed random_state keeps
# the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 72,
)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
## max_df = ignore terms that appear too frequently,
##          ie: .50 -> ignore terms that appear in more than 50% of documents
##          ie: 50 -> ignore terms that appear in more than 50 documents

## min_df = ignore terms that appear too infrequently,
##          ie: .50 -> ignore terms that appear in fewer than 50% of the documents
##          ie: 50 -> ignore terms that appear in fewer than 50 documents

stop_words = set(stopwords.words('english'))
vectorizer = CountVectorizer(
    # binary = True records presence/absence of a term instead of its count.
    binary = True,
    # Passed as a list: recent scikit-learn releases validate this parameter
    # and reject a set. Sorting only makes the argument deterministic.
    stop_words = sorted(stop_words),
    lowercase = True,
    min_df = 5,
    max_df = 0.80,
    max_features = 8000
)

# Learn the vocabulary from the training split only and encode it; the test
# split is encoded later with this same fitted vectorizer.
x_train_onehot = vectorizer.fit_transform(X_train)

# The column count of the document-term matrix is the vocabulary size. This
# avoids get_feature_names(), which was removed in scikit-learn 1.2, and does
# not materialize the list of feature names just to count them.
number_of_features = x_train_onehot.shape[1]

In [10]:
import keras

from keras.models import Sequential
from keras.layers import Dense

print(keras.__version__)

Using TensorFlow backend.


2.2.5


In [11]:
# Discard Keras state left over from a previous run of this cell.
keras.backend.clear_session()

# Single-hidden-layer classifier: bag-of-words input -> 200 ReLU units ->
# softmax over the newsgroup classes.
nn = Sequential([
    Dense(units = 200, activation = 'relu', input_dim = number_of_features),
    Dense(units = number_of_targets, activation = 'softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
nn.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

nn.summary()

W1105 07:41:53.734017 4532311488 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:107: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.

W1105 07:41:53.747394 4532311488 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:111: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W1105 07:41:53.750072 4532311488 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1105 07:41:53.751450 4532311488 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1105 07:41:53.753255 4

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               1600200   
_________________________________________________________________
dense_2 (Dense)              (None, 20)                4020      
Total params: 1,604,220
Trainable params: 1,604,220
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train for 5 passes over the training matrix in mini-batches of 100 documents.
nn.fit(
    x = x_train_onehot,
    y = y_train,
    batch_size = 100,
    epochs = 5,
)

W1105 07:41:53.954663 4532311488 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x122adc780>

In [13]:
# Encode the held-out documents with the vocabulary fitted on the training split.
x_test_onehot = vectorizer.transform(X_test)

# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics = ['accuracy'].
scores = nn.evaluate(x_test_onehot, y_test)

print('accuracy:', scores[1])

accuracy: 0.7228276877761414
