### Neural Network

In [21]:
# General libraries for most tasks.
import unicodedata
import pandas as pd
import numpy as np
import nltk as nk
from nltk.tokenize import word_tokenize

# sklearn libraries.
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split


# keras libraries.
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers


### Sklearn

In [4]:
# Path to postpreprocessed, part-of-speech tagged review corpus
# {"cat": "sports", "txt": "Barely better than Gabbert?"}
jsonFile = pd.read_json("categorized-comments.jsonl", lines =  True)

stopwords = set(nk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
    

In [5]:
lemmatizeList = []

for index in range(0, len(jsonFile)):
    processed_tokens = word_tokenize(jsonFile["txt"][index])
    processed_tokens = [w.lower() for w in processed_tokens]
    processed_tokens = [w for w in processed_tokens if w not in stopwords]
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    lemmatizeList.append(processed_tokens)       

jsonFile["lemmatize"] = lemmatizeList
print("complete")

complete


In [6]:
jsonFile.head()

Unnamed: 0,cat,txt,lemmatize
0,sports,Barely better than Gabbert? He was significant...,"[barely, better, gabbert, ?, significantly, be..."
1,sports,Fuck the ducks and the Angels! But welcome to ...,"[fuck, duck, angel, !, welcome, new, niner, fa..."
2,sports,Should have drafted more WRs.\n\n- Matt Millen...,"[drafted, wrs, ., -, matt, millen, probably]"
3,sports,[Done](https://i.imgur.com/2YZ90pm.jpg),"[[, done, ], (, http, :, //i.imgur.com/2yz90pm..."
4,sports,No!! NOO!!!!!,"[!, !, noo, !, !, !, !, !]"


In [22]:
train_test_split(X, y, stratify=y, random_state=1)

NameError: name 'X' is not defined

In [57]:
#!/usr/bin/env python3

import nltk
import pickle
import sqlite3

from nltk.corpus.reader.api import CorpusReader

PKL_PATTERN = r'(?!\.)[\w\s\d\-]+\.pickle'

class SqliteCorpusReader(object):

    def __init__(self, path):
        self._cur = sqlite3.connect(path).cursor()

    def scores(self):
        """
        Returns the review score
        """
        self._cur.execute("SELECT score FROM reviews")
        scores = self._cur.fetchall()
        for score in scores:
            yield score

    def texts(self):
        """
        Returns the full review texts
        """
        self._cur.execute("SELECT content FROM content")
        texts = self._cur.fetchall()
        for text in texts:
            yield text

    def ids(self):
        """
        Returns the review ids
        """
        self._cur.execute("SELECT reviewid FROM content")
        ids = self._cur.fetchall()
        for idx in ids:
            yield idx

    def ids_and_texts(self):
        """
        Returns the review ids
        """
        self._cur.execute("SELECT * FROM content")
        results = self._cur.fetchall()
        for idx,text in results:
            yield idx,text

    def scores_albums_artists_texts(self):
        """
        Returns a generator with each review represented as a
        (score, album name, artist name, review text) tuple
        """
        sql = """
              SELECT S.score, L.label, A.artist, R.content
              FROM [reviews] S
              JOIN labels L ON S.reviewid=L.reviewid
              JOIN artists A on L.reviewid=A.reviewid
              JOIN content R ON A.reviewid=R.reviewid
              """
        self._cur.execute(sql)
        results = self._cur.fetchall()
        for score,album,band,text in results:
            yield (score,album,band,text)

    def albums(self):
        """
        Returns the names of albums being reviewed
        """
        self._cur.execute("SELECT * FROM labels")
        albums = self._cur.fetchall()
        for idx,album in albums:
            yield idx,album

    def artists(self):
        """
        Returns the name of the artist being reviewed
        """
        self._cur.execute("SELECT * FROM artists")
        artists = self._cur.fetchall()
        for idx,artist in artists:
            yield idx,artist

    def genres(self):
        """
        Returns the music genre of each review
        """
        self._cur.execute("SELECT * FROM genres")
        genres = self._cur.fetchall()
        for idx,genre in genres:
            yield idx,genre

    def years(self):
        """
        Returns the publication year of each review
        Note: There are many missing values
        """
        self._cur.execute("SELECT * FROM years")
        years = self._cur.fetchall()
        for idx,year in years:
            yield idx,year

    def paras(self):
        """
        Returns a generator of paragraphs.
        """
        for text in self.texts():
            for paragraph in text:
                yield paragraph

    def sents(self):
        """
        Returns a generator of sentences.
        """
        for para in self.paras():
            for sentence in nltk.sent_tokenize(para):
                yield sentence

    def words(self):
        """
        Returns a generator of words.
        """
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield word

    def tagged_tokens(self):
        for sent in self.sents():
            for word in nltk.wordpunct_tokenize(sent):
                yield nltk.pos_tag(word)



class PickledReviewsReader(CorpusReader):
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader
        """
        CorpusReader.__init__(self, root, fileids, **kwargs)

    def texts_scores(self, fileids=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the SqliteCorpusReader, this uses a generator
        to achieve memory safe iteration.
        """
        # Create a generator, loading one document into memory at a time.
        for path, enc, fileid in self.abspaths(fileids, True, True):
            with open(path, 'rb') as f:
                yield pickle.load(f)

    def reviews(self, fileids=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for text,score in self.texts_scores(fileids):
            yield text

    def scores(self, fileids=None):
        """
        Return the scores
        """
        for text,score in self.texts_scores(fileids):
            yield score

    def paras(self, fileids=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for review in self.reviews(fileids):
            for paragraph in review:
                yield paragraph

    def sents(self, fileids=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for paragraph in self.paras(fileids):
            for sentence in paragraph:
                yield sentence

    def tagged(self, fileids=None):
        for sent in self.sents(fileids):
            for token in sent:
                yield token

    def words(self, fileids=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for token in self.tagged(fileids):
            yield token[0]

In [None]:
import unicodedata
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords


class TextNormalizer(BaseEstimator, TransformerMixin):
    def __init__(self, language='english'):
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
    def is_punct(self, token):
        return all(
            unicodedata.category(char).startswith('P') for char in token)
    def is_stopword(self, token):
        return token.lower() in self.stopword
    
    def normalize(self, document):
        return [
            self.lemmatize(token, tag).lower()
            for paragraph in document
            for sentence in paragraph
            for (token, tag) in sentence
            if not self.is_punct(token) and not self.is_stopword(token)]
    
    def lemmatize(self, token, pos_tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
             }.get(pos_tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, documents):
        for document in documents:
            yield self.normalize(document)


import joblib
from sklearn.model_selection import cross_val_score

def documents(corpus):
    return list(corpus.reviews())

def continuous(corpus):
    return list(corpus.scores())

def make_categorical(corpus):
    return np.digitize(continuous(corpus), [0.0, 3.0, 5.0, 7.0, 10.1])

def binarize(corpus):
    return np.digitize(continuous(corpus), [0.0, 3.0, 5.1])

def train_model(path, model, Continuous=True, saveto=None, cv=12):
    """
    Trains model from corpus at specified path; constructing cross-validation
    scores using the cv parameter, then fitting the model on the full data.
    Returns the scores.
    """
    # Load the corpus data and labels for classification
    comments = pd.read_json(r'C:\Users\danie\Documents\School\DSC-550\Week2\controversial-comments.jsonl', lines=True)
    
    corpus = PickledReviewsReader(comments.to_csv(sep='\t'))
    X = documents(corpus)
    
    print(corpus)
    
    if Continuous:
        y = continuous(corpus)
        scoring = 'r2'
        
    else:
        y = make_categorical(corpus)
        scoring = 'f1'
    
    # Compute cross-validation scores
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    
    # Write to disk if specified
    if saveto:
        joblib.dump(model, saveto)
    # Fit the model on entire dataset
    model.fit(X, y)
    # Return scores
    return score

cpath = 'categorized-comments.txt'

regressor = Pipeline([
 ('norm', TextNormalizer()),
 ('tfidf', TfidfVectorizer()),
 ('ann', MLPRegressor(hidden_layer_sizes=[500,150], verbose=True))
 ])

regression_scores = train_model(cpath, regressor, Continuous=True)

classifier = Pipeline([
('norm', TextNormalizer()),
('tfidf', TfidfVectorizer()),
('ann', MLPClassifier(hidden_layer_sizes=[500,150], verbose=True))
])

classifer_scores = train_model(cpath, classifier, Continuous=False)

In [78]:
regression_scores

NameError: name 'regression_scores' is not defined

In [79]:
classifer_scores

NameError: name 'classifer_scores' is not defined

### Keras

In [30]:
# Set random seed
np.random.seed(0)
# Set the number of features we want
number_of_features = 5000
# Load feature and target data
data = reuters.load_data(num_words=number_of_features)
(data_train, target_vector_train), (data_test, target_vector_test) = data
# Convert feature data to a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")
# One-hot encode target vector to create a target matrix
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)
# Start neural network
network = models.Sequential()
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100,
 activation="relu",
input_shape=(number_of_features,)))
# Add fully connected layer with a ReLU activation function
network.add(layers.Dense(units=100, activation="relu"))
# Add fully connected layer with a softmax activation function
network.add(layers.Dense(units=46, activation="softmax"))
# Compile neural network
network.compile(loss="categorical_crossentropy", # Cross-entropy
 optimizer="rmsprop", # Root Mean Square Propagation
 metrics=["accuracy"]) # Accuracy performance metric
# Train neural network
history = network.fit(features_train, # Features
 target_train, # Target
epochs=3, # Three epochs
 verbose=0, # No output
 batch_size=100, # Number of observations per batch
 validation_data=(features_test, target_test)) # Test data

<tensorflow.python.keras.callbacks.History at 0x1c96f079fa0>