In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import nltk
import random
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
import os
import jsonlines
import json
from pandas.io.json import json_normalize

## 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [2]:
with open('categorized-comments.jsonl') as f:
    lines = f.read().splitlines()
    
df_inter = pd.DataFrame(lines)
df_inter.columns = ['json_element']

df_inter['json_element'].apply(json.loads)

df = pd.json_normalize(df_inter['json_element'].apply(json.loads))

#create random sample with n = to 5000
df = df.sample(n=5000, random_state=1)

df.head()

Unnamed: 0,cat,txt
133147,video_games,My friend got his about week 5. Got mine the w...
363746,video_games,This post has been removed.\n\n&gt;All individ...
71307,video_games,I'm average not in Legends
261637,sports,[deleted]
399053,video_games,"Don't give up hope, that would be awesome!!!"


In [3]:
#convert column ot string
df['txt'] = df['txt'].astype(str)

#A. Convert all text to lowercase letters.
df['txt']= df['txt'].str.lower()

#B. Remove all punctuation from the text with regex
df['txt'] = df['txt'].str.replace(r'[^\w\s]+', '')

#C. Remove stop words.
from nltk.corpus import stopwords
stop = stopwords.words('english')

df['txt'] = df['txt'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [4]:
#D. Apply NLTK’s PorterStemmer.
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# set stemmer function to variable to variable 
stemmer = PorterStemmer()

# function to stems words of dataframe
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

# run function on datfrtame column -- takes awhile
df['txt']  = df['txt'].apply(stem_sentences)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the tf-idf feature matrix
tfidf = TfidfVectorizer()

feature_matrix = tfidf.fit_transform(df['txt'])

#shar matrix as dense
texts = feature_matrix.toarray()

In [6]:
y = df['cat']

from sklearn.preprocessing import LabelBinarizer

y = LabelBinarizer().fit_transform(y)

In [7]:
X = texts

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [9]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000)
mlp.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000)

In [10]:
predictions = mlp.predict(X_test)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1)))
print(classification_report(y_test,predictions))

[[  7   3  19]
 [  5 129  94]
 [ 19  83 641]]
              precision    recall  f1-score   support

           0       0.33      0.17      0.23        29
           1       0.60      0.57      0.58       228
           2       0.85      0.89      0.87       743

   micro avg       0.79      0.80      0.79      1000
   macro avg       0.59      0.54      0.56      1000
weighted avg       0.78      0.80      0.78      1000
 samples avg       0.78      0.80      0.79      1000



  _warn_prf(average, modifier, msg_start, len(result))


## 2. Neural Network Classifier with Keras

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [12]:
from keras.preprocessing.text import Tokenizer

X = df['txt']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Using TensorFlow backend.


In [13]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [14]:
from keras.preprocessing.sequence import pad_sequences

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [15]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")
for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [16]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(3, activation='sigmoid'))

In [17]:
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [18]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          874000    
_________________________________________________________________
flatten (Flatten)            (None, 10000)             0         
_________________________________________________________________
dense (Dense)                (None, 3)                 30003     
Total params: 904,003
Trainable params: 30,003
Non-trainable params: 874,000
_________________________________________________________________
None


In [19]:
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [20]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)

In [21]:
print(loss, accuracy, f1_score, precision, recall)

0.7365497350692749 0.7300000190734863 0.7370343804359436 0.7153397798538208 0.7607421875


## 3. Classifying Images

In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.

In [22]:
import numpy as np
from tensorflow.keras.datasets import mnist 
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPooling2D
from keras.utils import np_utils
from tensorflow.keras import backend as K

#Set that the color channel value will be first
K.set_image_data_format("channels_last")

# Set seed
np.random.seed(0)

# Set image information
channels = 1
height = 28 
width = 28

# Load data and target from MNIST 
(data_train, target_train), (data_test, target_test) = mnist.load_data()

# Reshape training image data into features
data_train = data_train.reshape(data_train.shape[0], height, width, channels)

# Reshape test image data into features
data_test = data_test.reshape(data_test.shape[0], height, width, channels)

# Rescale pixel intensity to between 0 and 1
features_train = data_train / 255
features_test = data_test / 255

# One-hot encode target
target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]

# Start neural network
network = Sequential()

# Add convolutional layer with 64 filters, a 5x5 window, and ReLU activation function
network.add(Conv2D(filters=64,                   
            kernel_size=(5, 5),                   
            input_shape=(width, height, channels),                   
            activation='relu', padding='same'))

# Add max pooling layer with a 2x2 window
network.add(MaxPooling2D(pool_size=(2, 2)))

# Add dropout layer
network.add(Dropout(0.5))

# Add layer to flatten input
network.add(Flatten())

# Add fully connected layer of 128 units with a ReLU activation function
network.add(Dense(128, activation="relu"))

# Add dropout layer
network.add(Dropout(0.5))

# Add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))

# Add dropout layer
network.add(Dropout(0.5))

# Add fully connected layer with a softmax activation function
network.add(Dense(number_of_classes, activation="softmax"))

# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy                
            optimizer="rmsprop", # Root Mean Square Propagation                
            metrics=["accuracy"]) # Accuracy performance metric

# Train neural network
history = network.fit(features_train, # Features            
            target_train, # Target            
            epochs=2, # Number of epochs            
            verbose=0, # Don't print description after each epoch            
            batch_size=1000, # Number of observations per batch            
            validation_data=(features_test, target_test)) 


In [23]:
# report on accuracy of model
history.history

{'loss': [2.053312063217163, 1.9260090589523315],
 'accuracy': [0.2892666757106781, 0.3621666729450226],
 'val_loss': [1.9073302745819092, 1.8060158491134644],
 'val_accuracy': [0.583899974822998, 0.6521999835968018]}