In [3]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [5]:
from sklearn import decomposition, ensemble

In [6]:
import numpy, string

In [7]:
import pandas as pd

In [8]:
from keras.preprocessing import text, sequence

Using TensorFlow backend.


In [9]:
from keras.preprocessing.text import Tokenizer

In [10]:
from keras import layers, models, optimizers

In [11]:
#1. LOADING TEXT DATASET

In [12]:
data = open('corpus', encoding="utf8").read()

In [13]:
all_labels = []
for line in data.split('\n'):
    label = line.split()[0]
    all_labels.append(label)
dist_labels = list(set(all_labels))

In [14]:
dist_labels

['__label__2', '__label__1']

In [15]:
label, text = [],[]
for i, line in enumerate(data.split('\n')):
    label.append(line.split()[0])
    text.append(' '.join(line.split()[1:]))

In [16]:
label[:10], text[:3]

(['__label__2',
  '__label__2',
  '__label__2',
  '__label__2',
  '__label__2',
  '__label__2',
  '__label__1',
  '__label__2',
  '__label__2',
  '__label__2'],
 ['Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^',
  "The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you 

In [17]:
import pandas as pd

In [18]:
trainDF = pd.DataFrame()

In [19]:
trainDF['labels'] = label

In [20]:
trainDF['texts'] = text

In [21]:
trainDF.head(5)

Unnamed: 0,labels,texts
0,__label__2,Stuning even for the non-gamer: This sound tra...
1,__label__2,The best soundtrack ever to anything.: I'm rea...
2,__label__2,Amazing!: This soundtrack is my favorite music...
3,__label__2,Excellent Soundtrack: I truly like this soundt...
4,__label__2,"Remember, Pull Your Jaw Off The Floor After He..."


In [22]:
#2. PREPARING TRAIN TEST SPLIT

In [23]:
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(trainDF['texts'], trainDF['labels'])

In [24]:
#Label encode the target variable

In [25]:
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [26]:
encoder.classes_

array(['__label__1', '__label__2'], dtype=object)

In [27]:
#3. FEATURE ENGINEERING

In [28]:
#3.1. count vectorizer from sklearn feature_extraction

In [29]:
import spacy

In [30]:
from html import unescape

In [31]:
def my_preprocessor(doc):
    return(unescape(doc).lower())

In [32]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), stop_words='english')

In [33]:
count_vect.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [34]:
x_train_count = count_vect.transform(x_train)

In [35]:
x_valid_count = count_vect.transform(x_valid)

In [36]:
features = count_vect.get_feature_names()

In [37]:
def wm2df(wmx, feat_names):
    
    # create an index for each row
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wmx)]
    df = pd.DataFrame(data=wmx.toarray(), index=doc_names,
                      columns=feat_names)
    return(df)

In [39]:
wm2df(x_train_count, features).head(5)

Unnamed: 0,0,0 05,0 10,0 20gb,0 3,0 394,0 4,0 5,0 5ml,0 7,...,étai fidèle,était,était pas,étre,étre publié,éviter,última,última parte,única,única opción
Doc0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
#TFIDF vectors from sklearn feature_extraction 

In [41]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern = r'\w{1,}', max_features=5000)

In [42]:
tfidf_vect.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [43]:
x_train_tfidf = tfidf_vect.transform(x_train)
x_valid_tfidf = tfidf_vect.transform(x_valid)

In [44]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)

In [45]:
tfidf_vect_ngram.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(2, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [46]:
x_train_tfidf_ngram = tfidf_vect_ngram.transform(x_train)

In [47]:
x_valid_tfidf_ngram = tfidf_vect_ngram.transform(x_valid)

In [48]:
tfidf_vect_char = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features = 5000)

In [49]:
tfidf_vect_char.fit(x_train)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(2, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [50]:
x_train_tfidf_char = tfidf_vect_char.transform(x_train)
x_valid_tfidf_char = tfidf_vect_char.transform(x_valid)

In [51]:
#WORD EMBEDDING

In [52]:
#create a tokenizer

In [53]:
token = Tokenizer()

In [54]:
token.fit_on_texts(trainDF['texts'])

In [55]:
word_index = token.word_index

In [56]:
word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'a': 4,
 'to': 5,
 'of': 6,
 'it': 7,
 'this': 8,
 'is': 9,
 'in': 10,
 'for': 11,
 'that': 12,
 'was': 13,
 'book': 14,
 'you': 15,
 'not': 16,
 'but': 17,
 'with': 18,
 'on': 19,
 'my': 20,
 'have': 21,
 'as': 22,
 'are': 23,
 'one': 24,
 'be': 25,
 'so': 26,
 'all': 27,
 'if': 28,
 'very': 29,
 'like': 30,
 'read': 31,
 'good': 32,
 'great': 33,
 'at': 34,
 'movie': 35,
 'they': 36,
 'just': 37,
 'about': 38,
 'from': 39,
 'or': 40,
 'would': 41,
 'an': 42,
 'me': 43,
 'out': 44,
 'what': 45,
 'has': 46,
 'more': 47,
 'by': 48,
 'time': 49,
 'had': 50,
 'when': 51,
 'get': 52,
 'will': 53,
 "it's": 54,
 'up': 55,
 'there': 56,
 'no': 57,
 'only': 58,
 'your': 59,
 'can': 60,
 "don't": 61,
 'his': 62,
 'really': 63,
 'who': 64,
 'some': 65,
 'he': 66,
 'well': 67,
 'first': 68,
 'her': 69,
 'much': 70,
 'than': 71,
 'even': 72,
 'do': 73,
 'story': 74,
 'because': 75,
 'them': 76,
 'other': 77,
 'after': 78,
 'buy': 79,
 'we': 80,
 'were': 81,
 'too': 

In [57]:
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(x_train), maxlen=70)

In [58]:
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(x_valid), maxlen=70)

In [59]:
#pre-trained word embedding vector

In [60]:
embeddings_index = {}

In [61]:
pre_trained_vectors = open('wiki-news-300d-1M.vec', 'r', encoding="utf8")

In [62]:
for i, line in enumerate(pre_trained_vectors):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [63]:
#pre_trained_vectors = open('wiki-news-300d-1M.vec', 'r').readlines()

In [64]:
print(list(embeddings_index.items())[35])

('not', array([ 1.570e-02, -7.480e-02, -8.620e-02,  5.040e-02, -3.800e-03,
        4.300e-02, -3.580e-02, -3.290e-02,  4.210e-02,  5.400e-03,
        2.210e-02,  6.220e-02,  5.860e-02,  4.820e-02, -7.620e-02,
       -6.700e-03,  5.320e-02,  1.316e-01,  1.638e-01, -5.680e-02,
       -3.210e-02, -1.290e-02, -6.000e-04,  1.519e-01, -6.130e-02,
        5.080e-02,  3.810e-02,  2.320e-02, -3.780e-02,  2.808e-01,
        5.980e-02, -1.570e-02,  1.495e-01,  7.000e-04, -3.200e-03,
        2.530e-02,  6.230e-02,  2.170e-02,  5.250e-02,  9.500e-03,
       -5.700e-02,  4.360e-02,  4.400e-02,  2.740e-02, -2.390e-02,
       -3.900e-02, -5.570e-02,  2.120e-02,  2.000e-02,  7.500e-03,
       -1.580e-02,  3.930e-02, -6.234e-01, -3.500e-02,  8.600e-03,
        1.540e-02,  5.230e-02, -6.900e-02,  1.258e-01,  2.350e-02,
        5.100e-03,  2.200e-02, -4.300e-02, -9.200e-03,  7.980e-02,
        1.110e-02,  5.500e-03,  4.560e-02, -2.220e-02,  3.640e-02,
        3.560e-02, -1.050e-02, -3.780e-02,  1.604e-01,

In [65]:
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))

In [66]:
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [67]:
#EMBEDDING MATRIX represents the vector for each word in the entire text indexed by word_index

In [68]:
embedding_matrix[3]

array([-3.11600000e-01,  8.56000036e-02, -6.89999992e-03, -1.79700002e-01,
        8.54000002e-02, -6.08999990e-02,  6.25000000e-02,  1.22800000e-01,
        1.80700004e-01,  7.63999969e-02, -2.49000005e-02, -5.24999984e-02,
       -9.99999975e-05, -1.99300006e-01, -1.59700006e-01, -8.07999969e-02,
       -7.31000006e-02, -4.50000018e-02, -1.23300001e-01, -3.97000015e-02,
        1.49999997e-02, -1.38099998e-01, -4.82000001e-02,  5.62000014e-02,
       -9.74999964e-02, -1.27399996e-01,  3.95000018e-02,  2.75599986e-01,
        6.80000037e-02, -1.41800001e-01,  2.72399992e-01, -8.89999978e-03,
        9.21000019e-02,  1.52799994e-01,  2.19999999e-01,  1.84000004e-02,
       -5.13999984e-02,  1.32499993e-01,  1.40200004e-01, -1.00000005e-03,
        2.67999992e-02,  5.15000001e-02, -7.75000006e-02, -5.22000007e-02,
       -1.18000004e-02,  3.89999989e-03, -2.14900002e-01,  1.01700000e-01,
        1.93200007e-01, -6.59999996e-02, -6.58999979e-02,  5.46999983e-02,
       -6.58299983e-01, -

In [69]:
#Topic Modelling Feature using LDA

In [70]:
# train a LDA Model

In [71]:
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)

In [72]:
X_topics = lda_model.fit_transform(x_train_count)

In [73]:
topic_word = lda_model.components_ 

In [74]:
terms = count_vect.get_feature_names()

In [75]:
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(terms)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))


In [76]:
topic_summaries

['concert unbelievable meant toys emotion area twists birthday leads diary',
 'charger apple 3d letter hawthorne 00 scarlet scarlet letter cornwell scarpetta',
 'century version tool standard versions cat originally shower smell extreme',
 'printer hp paper stargate print color poor quality photo quality poor',
 'recipes cooking concept coast ingredients portrayal lee thrash dishes worlds',
 'hot rice bands customer service delightful passion costume customer taste whatsoever',
 'catholic unusual combination church freud dragon castle dickens funk arguments',
 'card battery product blu amazon blu ray ray boots camera player',
 '1 3 size 2 5 max 4 waist product internet',
 'great cd glass cake ice 18 troma guest guns barbie string',
 'la bar y en spanish el que stopped working blocks album',
 'album cd music songs albums band sex tracks listen rock',
 'scanner driver solution great price magazine mrs europe japanese scan xp',
 'book s t good like great read just time movie',
 'movie vid

In [77]:
doc_topic = pd.DataFrame(data=X_topics).head(10)

In [80]:
doc_topic

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.000556,0.000556,0.000556,0.000556,0.000556,0.000556,0.000556,0.767915,0.011667,0.000556,0.000556,0.017236,0.000556,0.194293,0.000556,0.000556,0.000556,0.000556,0.000556,0.000556
1,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083,0.363745,0.002083,0.002083,0.002083,0.002083,0.598755,0.002083,0.002083,0.002083,0.002083,0.002083,0.002083
2,0.000532,0.000532,0.000532,0.000532,0.000532,0.000532,0.000532,0.802449,0.000532,0.000532,0.000532,0.000532,0.000532,0.187976,0.000532,0.000532,0.000532,0.000532,0.000532,0.000532
3,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.475098,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125,0.468652,0.003125,0.003125,0.003125,0.003125,0.003125,0.003125
4,0.001786,0.001786,0.001786,0.001786,0.001786,0.001786,0.001786,0.001786,0.041536,0.001786,0.001786,0.001786,0.5375,0.390607,0.001786,0.001786,0.001786,0.001786,0.001786,0.001786
5,0.000263,0.000263,0.000263,0.000263,0.000263,0.000263,0.000263,0.000263,0.000263,0.000263,0.592696,0.000263,0.000263,0.402567,0.000263,0.000263,0.000263,0.000263,0.000263,0.000263
6,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.000781,0.334098,0.651839,0.000781,0.000781,0.000781,0.000781,0.000781
7,0.001136,0.001136,0.001136,0.001136,0.001136,0.001136,0.001136,0.462842,0.001136,0.001136,0.001136,0.001136,0.001136,0.516703,0.001136,0.001136,0.001136,0.001136,0.001136,0.001136
8,0.001316,0.001316,0.001316,0.001316,0.027632,0.001316,0.001316,0.001316,0.027632,0.001316,0.001316,0.001316,0.001316,0.328454,0.001316,0.59523,0.001316,0.001316,0.001316,0.001316
9,0.001563,0.590389,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563,0.381486,0.001563,0.001563,0.001563,0.001563,0.001563,0.001563


In [84]:
doc_topic.idxmax(axis=1)

0     7
1    13
2     7
3     6
4    12
5    10
6    14
7    13
8    15
9     1
dtype: int64

In [85]:
#Classification 

In [87]:
def train_model(classifier, x_train_features, x_valid_features, is_neural_net=False):
    classifier.fit(x_train_features, y_train)
    predictions = classifier.predict(x_valid_features)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, y_valid)

In [88]:
#Naive Bayes

In [94]:
accuracy_count = train_model(naive_bayes.MultinomialNB(), x_train_count, x_valid_count)

In [95]:
print('NB, count vectors: {}'.format(accuracy_count))

NB, count vectors: 0.8548


In [99]:
accuracy_tfidf = train_model(naive_bayes.MultinomialNB(), x_train_tfidf, x_valid_tfidf)

In [100]:
print('NB, tfidf word: {}'.format(accuracy_tfidf))

NB, tfidf word: 0.8536


In [102]:
accuracy_ngram = train_model(naive_bayes.MultinomialNB(), x_train_tfidf_ngram, x_valid_tfidf_ngram)

In [103]:
print('NB, ngram word: {}'.format(accuracy_ngram))

NB, ngram word: 0.8388


In [105]:
accuracy_ngram_char = train_model(naive_bayes.MultinomialNB(), x_train_tfidf_char, x_valid_tfidf_char)

In [106]:
print('NB, ngram char: {}'.format(accuracy_ngram_char))

NB, ngram char: 0.8256


In [111]:
bayes_classifier = naive_bayes.MultinomialNB()

In [112]:
bayes_classifier.fit(x_train_tfidf_char, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [114]:
predictions = bayes_classifier.predict(x_valid_tfidf_char)

In [119]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [120]:
y_valid

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,

In [121]:
predictions

array([0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,

In [122]:
#Linear model: logistic regression

In [123]:
accuracy_count_LR = train_model(linear_model.LogisticRegression(), x_train_count, x_valid_count)



In [125]:
print('LR, Count Vector: {}'.format(accuracy_count_LR))

LR, Count Vector: 0.85


In [126]:
accuracy_tfidf_svm = train_model(svm.SVC(), x_train_tfidf_ngram, x_valid_tfidf_ngram)



In [127]:
print('SVM, TFIDF: {}'.format(accuracy_tfidf_svm))

SVM, TFIDF: 0.5268


In [128]:
accuracy_tfidf_RF = train_model(ensemble.RandomForestClassifier(), x_train_tfidf_ngram, x_valid_tfidf_ngram)



In [129]:
print('RF, TFIDF: {}'.format(accuracy_tfidf_RF))

RF, TFIDF: 0.742


In [134]:
x_train.shape

(7500,)

In [136]:
len(terms)

241330

In [137]:
x_train_tfidf.shape[1]

5000

In [130]:
#SHALLOW NEURAL NETWORK 

In [154]:
def create_model_architecture(input_size):
    #input layer
    input_layer = layers.Input((input_size, ), sparse=True)
    
    #hidden layer
    hidden_layer = layers.Dense(100, activation='relu')(input_layer)
    
    #output_layer
    output_layer = layers.Dense(1, activation='softmax')(hidden_layer)
    
    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(lr=0.01 ), loss='binary_crossentropy')
    
    return classifier
    
classifier = create_model_architecture(x_train_tfidf_ngram.shape[1])
accuracy = train_model(classifier,x_train_tfidf_ngram, x_valid_tfidf_ngram, is_neural_net=True )
print('NN, tfidf-ngram: {}'.format(accuracy))

Epoch 1/1
NN, tfidf-ngram: 0.5268


In [155]:
#Convolutional Neural Networks

In [162]:
def create_cnn():
    
    # Add an Input Layer
    input_layer = layers.Input((70, ))
    
    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)
    
    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
    
    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)
    
    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, valid_seq_x, is_neural_net=True)
print ("CNN, Word Embeddings",  accuracy)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/1
CNN, Word Embeddings 0.5268


In [163]:
#RNN

In [166]:
def create_rnn_lstm():
    # Add an Input Layer
    input_layer = layers.Input((70, ))
    
    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)
    
    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)
    
    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model 

classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, valid_seq_x, is_neural_net=True)
print("RNN-LSTM, Word Embeddings",  accuracy)

Epoch 1/1
RNN-LSTM, Word Embeddings 0.5268


In [167]:
#Recurrent Convolutional Neural Network

In [168]:
def create_rcnn():
    input_layer = layers.Input((70,))
    
    embedding_layer = layers.Embedding(len(word_index)+1, 300, weights=[embedding_matrix], trainable=True)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    recurrent_layer = layers.Bidirectional(layers.LSTM(100, return_sequences=True))(embedding_layer)
    
    conv_layer = layers.Convolution1D(100,3,activation='relu')(recurrent_layer)
    
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
    
    output_layer1 = layers.Dense(50, activation='relu')(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation = 'sigmoid')(output_layer1)
    
    model = models.Model(inputs = input_layer, outputs = output_layer2)
    model.compile(optimizer = optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_rcnn()
accuracy = train_model(classifier, train_seq_x, valid_seq_x, is_neural_net=True)
print("RCNN, Word Embeddings",  accuracy)

Epoch 1/1
RCNN, Word Embeddings 0.5268
