In [23]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [93]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

# 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the
Applied Text Analysis with Python
book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [2]:
json_path = 'C:\\Users\\Dan Siegel\\Desktop\\Classes\\550\\data\\reddit\\categorized-comments.jsonl'
data = []
with open(json_path) as f:
    for line in f:
        data.append(json.loads(line))        
df = pd.DataFrame.from_dict(json_normalize(data), orient='columns')
df = df.sample(frac=0.02)

In [3]:
df.head()

Unnamed: 0,cat,txt
1789214,science_and_technology,&gt; You make maintaining a really short how-t...
2299605,video_games,I'm a 760(? Went broke shortly after emerald n...
1093467,video_games,yes ppl think its more fun if they can play wh...
672471,video_games,USE YOUR REINFORCEMENTS FFS
1361454,sports,wonder whos big spoon


In [5]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(df.txt)

In [28]:
df['categorical_cat'] = df['cat'].astype('category').cat.codes

In [39]:
X_train, X_test, y_train, y_test = train_test_split(features, df.categorical_cat, test_size=0.33, random_state=42)

In [40]:
clf = MLPClassifier(hidden_layer_sizes=(500,150), verbose=True)

In [41]:
clf.fit(X_train, y_train)

Iteration 1, loss = 0.90891358
Iteration 2, loss = 0.48719700
Iteration 3, loss = 0.31482848
Iteration 4, loss = 0.23548926
Iteration 5, loss = 0.18821232
Iteration 6, loss = 0.15467965
Iteration 7, loss = 0.13621932
Iteration 8, loss = 0.12646041
Iteration 9, loss = 0.12243083
Iteration 10, loss = 0.12034677
Iteration 11, loss = 0.11832514
Iteration 12, loss = 0.11715692
Iteration 13, loss = 0.11586256
Iteration 14, loss = 0.11531876
Iteration 15, loss = 0.11447640
Iteration 16, loss = 0.11393003
Iteration 17, loss = 0.11421544
Iteration 18, loss = 0.11438200
Iteration 19, loss = 0.11380901
Iteration 20, loss = 0.11283838
Iteration 21, loss = 0.11304803
Iteration 22, loss = 0.11236755
Iteration 23, loss = 0.11198006
Iteration 24, loss = 0.11189459
Iteration 25, loss = 0.11186551
Iteration 26, loss = 0.11182081
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500, 150), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [42]:
y_pred = clf.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [50]:
def scoring_func(predictions, test_scores):
    accu_score = accuracy_score(test_scores, predictions)
    precision = precision_score(test_scores, predictions, average='macro')
    recall = recall_score(test_scores, predictions, average='macro')
    f1 = f1_score(test_scores, predictions, average='macro')
    conf=confusion_matrix(test_scores, predictions)
    print ('accuracy: ',accu_score)
    print('precision:', precision)
    print ('Recall:', recall)
    print('F1:', f1)
    print('confusion matrix:', conf)      

In [51]:
scoring_func(y_pred, y_test)

accuracy:  0.6655479540467277
precision: 0.5951752808178006
Recall: 0.5705080922531287
F1: 0.5787725055491136
confusion matrix: [[1565  147  499  471]
 [ 324  276  157  303]
 [ 359   76 3606 1069]
 [ 407  184 1186 4865]]


# 2. Neural Network Classifier with Keras

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the
Applied Text Analysis with Python
book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [88]:
def build_network(): 
    """ 
    Create a function that returns a compiled neural network 
    """ 
    nn = Sequential() 
    nn.add(Dense(500, activation = 'relu', input_shape =(46770,))) 
    nn.add(Dense(150, activation = 'relu')) 
    nn.add(Dense(4, activation = 'softmax')) 
    nn.compile(loss ='categorical_crossentropy', 
               optimizer ='adam', 
               metrics =['accuracy']) 
    return nn

In [89]:
KC = KerasClassifier(build_fn=build_network, nb_epoch=15)

In [90]:
KC.fit(X_train, y_train)

Epoch 1/1


<keras.callbacks.History at 0x2bf5186b4e0>

In [91]:
kc_ypred = KC.predict(X_test)

In [92]:
scoring_func(kc_ypred, y_test)

accuracy:  0.6969794759261649
precision: 0.6476370498590679
Recall: 0.6108644877271785
F1: 0.6236304925419752
confusion matrix: [[1410  278  576  418]
 [ 158  411  155  336]
 [ 182   68 3950  910]
 [ 188  162 1264 5028]]


# 3. Classifying Images

In chapter 20 of the
Machine Learning with Python Cookbook
, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.

In [113]:
K.set_image_data_format('channels_first')
np.random.seed(0)

In [114]:
def reshape_data(_):
    return _.reshape(_.shape[0], 1, 28, 28)

In [115]:
(data_train, target_train), (data_test, target_test) = mnist.load_data()

In [116]:
data_train = reshape_data(data_train)
data_test = reshape_data(data_test)

In [117]:
features_train = data_train / 255 
features_test = data_test / 255
target_train = np_utils.to_categorical(target_train) 
target_test = np_utils.to_categorical(target_test) 
number_of_classes = target_test.shape[1]
network = Sequential()

In [119]:
network.add(Conv2D(filters = 64, kernel_size=(5,5), input_shape=(1, 28, 28), activation ='relu'))
network.add(MaxPooling2D(pool_size =(2, 2)))
network.add(Dropout(0.5))
network.add(Flatten())
network.add(Dense(128, activation ="relu")) 
network.add(Dropout(0.5))
network.add(Dense(number_of_classes, activation ="softmax"))
network.compile(loss ="categorical_crossentropy", optimizer ="rmsprop", metrics =["accuracy"])
network.fit(features_train, target_train, epochs = 2, batch_size = 1000) 

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2bf53d11278>

In [120]:
conv_network = network.predict(features_test)

In [129]:
accuracy_score(target_test, conv_network.round())

0.9643