In [23]:
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [93]:
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.datasets import mnist
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

# 1. Neural Network Classifier with Scikit

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the
Applied Text Analysis with Python
book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [139]:
json_path = 'C:\\Users\\Dan Siegel\\Desktop\\Classes\\550\\data\\reddit\\categorized-comments.jsonl'
data = []
with open(json_path) as f:
    for line in f:
        data.append(json.loads(line))        
df = pd.DataFrame.from_dict(json_normalize(data), orient='columns')
#df = df.sample(frac=0.02)

In [3]:
df.head()

Unnamed: 0,cat,txt
1789214,science_and_technology,&gt; You make maintaining a really short how-t...
2299605,video_games,I'm a 760(? Went broke shortly after emerald n...
1093467,video_games,yes ppl think its more fun if they can play wh...
672471,video_games,USE YOUR REINFORCEMENTS FFS
1361454,sports,wonder whos big spoon


In [142]:
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(df.txt)

In [143]:
df['categorical_cat'] = df['cat'].astype('category').cat.codes

In [144]:
X_train, X_test, y_train, y_test = train_test_split(features, df.categorical_cat, test_size=0.33, random_state=42)

In [145]:
clf = MLPClassifier(hidden_layer_sizes=(500,150), verbose=True)

In [None]:
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

In [50]:
def scoring_func(predictions, test_scores):
    accu_score = accuracy_score(test_scores, predictions)
    precision = precision_score(test_scores, predictions, average='macro')
    recall = recall_score(test_scores, predictions, average='macro')
    f1 = f1_score(test_scores, predictions, average='macro')
    conf=confusion_matrix(test_scores, predictions)
    print ('accuracy: ',accu_score)
    print('precision:', precision)
    print ('Recall:', recall)
    print('F1:', f1)
    print('confusion matrix:', conf)      

In [None]:
scoring_func(y_pred, y_test)

# 2. Neural Network Classifier with Keras

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the
Applied Text Analysis with Python
book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [138]:
X_train.shape[1]

(31456, 46770)

In [None]:
def build_network(): 
    """ 
    Create a function that returns a compiled neural network 
    """ 
    nn = Sequential() 
    nn.add(Dense(500, activation = 'relu', input_shape =(X_train.shape[1],))) 
    nn.add(Dense(150, activation = 'relu')) 
    nn.add(Dense(4, activation = 'softmax')) 
    nn.compile(loss ='categorical_crossentropy', 
               optimizer ='adam', 
               metrics =['accuracy']) 
    return nn

In [None]:
KC = KerasClassifier(build_fn=build_network)

In [None]:
KC.fit(X_train, y_train)

In [None]:
kc_ypred = KC.predict(X_test)

In [None]:
scoring_func(kc_ypred, y_test)

# 3. Classifying Images

In chapter 20 of the
Machine Learning with Python Cookbook
, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.

In [113]:
K.set_image_data_format('channels_first')
np.random.seed(0)

In [114]:
def reshape_data(_):
    return _.reshape(_.shape[0], 1, 28, 28)

In [115]:
(data_train, target_train), (data_test, target_test) = mnist.load_data()

In [116]:
data_train = reshape_data(data_train)
data_test = reshape_data(data_test)

In [117]:
features_train = data_train / 255 
features_test = data_test / 255
target_train = np_utils.to_categorical(target_train) 
target_test = np_utils.to_categorical(target_test) 
number_of_classes = target_test.shape[1]
network = Sequential()

In [119]:
network.add(Conv2D(filters = 64, kernel_size=(5,5), input_shape=(1, 28, 28), activation ='relu'))
network.add(MaxPooling2D(pool_size =(2, 2)))
network.add(Dropout(0.5))
network.add(Flatten())
network.add(Dense(128, activation ="relu")) 
network.add(Dropout(0.5))
network.add(Dense(number_of_classes, activation ="softmax"))
network.compile(loss ="categorical_crossentropy", optimizer ="rmsprop", metrics =["accuracy"])
network.fit(features_train, target_train, epochs = 2, batch_size = 1000) 

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2bf53d11278>

In [120]:
conv_network = network.predict(features_test)

In [129]:
accuracy_score(target_test, conv_network.round())

0.9643