In [None]:
import numpy
import tensorflow as tf
import os.path
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.utils import np_utils
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, average_precision_score
nltk.download('punkt')

# Set random seed for reproducibility
seed = 7
numpy.random.seed(seed)



### Load the data, divide in train/dev/test, create vocabulary
vocab = []
X_total = []
y_total = []

with open("/steam_dataset.txt", "r") as dataset:
    for line in dataset:
        X_total.append(line)
        tokenize_word = word_tokenize(line)
        for word in tokenize_word:
            vocab.append(word)
        
X_train = X_total[:25000]
X_dev = X_total[25000:25900]
X_test = X_total[25900:]
        
with open("/steam_ratings.txt", "r") as ratings:
    for line in ratings:
        y_total.append([int(line)])

y_train = y_total[:25000]
y_dev = y_total[25000:25900]
y_test = y_total[25900:]

unique_words = set(vocab)
vocab = unique_words
vocab_length = len(vocab) + int(len(vocab) / 10) # Added len for unknw words

print("### !!! CONTROL LINE !!! ###")
print('Length of the sets: train', len(X_train), len(y_train))
print('Length of the sets: dev', len(X_dev), len(y_dev))
print('Length of the sets: test', len(X_test), len(y_test))
print('Length of Vocabulary:', len(vocab))


### Review Embedding
X_train = [text.one_hot(rev, vocab_length) for rev in X_train]
X_dev = [text.one_hot(rev, vocab_length) for rev in X_dev]
X_test = [text.one_hot(rev, vocab_length) for rev in X_test]

print("### !!! CONTROL LINE !!! ###")
print('Length of the Embedded sets and Example: train,', len(X_train), X_train[0])
print('Length of the Embedded sets: dev', len(X_dev), X_dev[0])
print('Length of the Embedded sets: test', len(X_test), X_test[0])

# Length check and padding
X_total = [text.one_hot(rev, vocab_length) for rev in X_total]
word_count = lambda review: len(review)
longest = max(X_total, key=word_count)
longest_length = len(longest)

X_train = sequence.pad_sequences(X_train, longest_length, padding='post')
X_dev = sequence.pad_sequences(X_dev, longest_length, padding='post')
X_test = sequence.pad_sequences(X_test, longest_length, padding='post')


X_train = numpy.array(X_train)
X_dev = numpy.array(X_dev)
X_test = numpy.array(X_test)

y_train = np_utils.to_categorical(y_train)
y_dev = np_utils.to_categorical(y_dev)
y_test = np_utils.to_categorical(y_test)
y_train = numpy.array(y_train)
y_dev = numpy.array(y_dev)
y_test = numpy.array(y_test)

print("### !!! CONTROL LINE !!! ###")
print('Length of the Pad-Embedded sets and Example: train,', len(X_train), X_train[0])
print('Length of the Pad-Embedded sets: dev', len(X_dev), X_dev[0])
print('Length of the Pad-Embedded sets: test', len(X_test), X_test[0])
print('Length of the y: train,', len(y_train), y_train[0])
print('Length of the y: dev', len(y_dev), y_dev[0])
print('Length of the y', len(y_test), y_test[0])

### Build the model

print('Building the CNN model...')
model = Sequential()
model.add(Embedding(input_dim=vocab_length, output_dim=300, input_length=longest_length))
model.add(Conv1D(activation="relu", padding="valid", filters=300, kernel_size=7))
model.add(GlobalMaxPooling1D())
model.add(Dense(600, activation='relu'))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

#checkpoint
metric = 'val_accuracy'
filepath="/weights-multi-best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor=metric, verbose=1, save_best_only=True,
mode= 'max' )
callbacks_list = [checkpoint]

model.fit(X_train, y_train, batch_size=200, epochs=5, validation_data=(X_dev, y_dev), callbacks=callbacks_list)

#load weights and recompile
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# evaluate the model
print("-" * 10)
print(model.layers)
print("-" * 10)
print("evaluation on dev")
scores = model.evaluate(X_dev, y_dev)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
y_pred = model.predict(X_dev)
y_pred = y_pred.argmax(1)
y_dev = y_dev.argmax(1)
print("Accuracy Rate by 'accuracy_score' is: %f" % accuracy_score(y_dev, y_pred))
print("Accuracy Rate by 'f1_score macro' is: %f" % f1_score(y_dev, y_pred, average='macro'))

print("evaluation on test")
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
y_test_pred = model.predict(X_test)
y_test_pred = y_test_pred.argmax(1)
y_test = y_test.argmax(1)
print(confusion_matrix(y_test, y_test_pred))
print("Accuracy Rate by 'accuracy_score' is: %f" % accuracy_score(y_test, y_test_pred))
print("Accuracy Rate by 'f1_score macro' is: %f" % f1_score(y_test, y_test_pred, average='macro'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
### !!! CONTROL LINE !!! ###
Length of the sets: train 25000 25000
Length of the sets: dev 900 900
Length of the sets: test 2081 2081
Length of Vocabulary: 49828
### !!! CONTROL LINE !!! ###
Length of the Embedded sets and Example: train, 25000 [49480, 25236, 30733, 33162, 49917, 5445, 17570, 47335, 29711, 39153, 39135, 33162, 29477, 35995, 52757, 49654, 49917, 26792, 50919, 5184, 40863, 5115, 33592, 33151, 7415, 42015, 29378, 51339, 49917, 13391, 27412, 3343, 36543, 33592, 3343, 7213, 21172, 26403, 17570, 35923, 50099, 14951, 49654, 54225, 44251, 51339, 53369, 35872, 22218, 39153, 19819, 4301, 3353, 48202, 19216, 24544, 15767, 30090, 35923, 4756, 23125, 44251, 36068, 38393, 6045, 7415, 48202, 16604, 51339, 18720, 33162, 5817, 161]
Length of the Embedded sets: dev 900 [50679, 27661, 36186, 7213, 24276, 37877, 51339, 600, 48202, 40233, 33162, 2274, 37800, 42406, 24575]
Length o

In [3]:
import numpy
import tensorflow as tf
import os.path
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.utils import np_utils
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, average_precision_score
nltk.download('punkt')

# Set random seed for reproducibility
seed = 7
numpy.random.seed(seed)

### Load the data, divide in train/dev/test, create vocabulary
vocab = []
X_total_S = []
X_total_T = []
y_total_S = []
y_total_T = []

with open("/steam_dataset.txt", "r") as dataset:
    for line in dataset:
        X_total_S.append(line)
        tokenize_word = word_tokenize(line)
        for word in tokenize_word:
            vocab.append(word)
with open("/twitter_dataset.txt", "r") as dataset:
    for line in dataset:
        X_total_T.append(line)
        tokenize_word = word_tokenize(line)
        for word in tokenize_word:
            vocab.append(word)
        
X_train = X_total_S[:24600] + X_total_T[:11900]
X_dev = X_total_S[24600:25500] + X_total_T[11900:12300]
#set for other test -> X_test = X_total_S[25500:] #set for Steam test 
X_test = X_total_T[23900:]
        
with open("/steam_ratings.txt", "r") as ratings:
    for line in ratings:
        y_total_S.append([int(line)])

with open("/twitter_ratings.txt", "r") as ratings:
    for line in ratings:
        y_total_T.append([int(line)])

y_train = y_total_S[:24600] + y_total_T[:11900]
y_dev = y_total_S[24600:25500] + y_total_T[11900:12300]
#set for other test -> y_test = y_total_S[25500:] #set for Steam test
y_test = y_total_T[23900:26051]

unique_words = set(vocab)
vocab = unique_words
vocab_length = len(vocab) + int(len(vocab) / 10) # Added len for unknw words

print("### !!! CONTROL LINE !!! ###")
print('Length of the sets: train', len(X_train), len(y_train))
print('Length of the sets: dev', len(X_dev), len(y_dev))
print('Length of the sets: test', len(X_test), len(y_test))
print('Length of Vocabulary:', len(vocab))


### Review Embedding
X_train = [text.one_hot(rev, vocab_length) for rev in X_train]
X_dev = [text.one_hot(rev, vocab_length) for rev in X_dev]
X_test = [text.one_hot(rev, vocab_length) for rev in X_test]

print("### !!! CONTROL LINE !!! ###")
print('Length of the Embedded sets and Example: train,', len(X_train), X_train[0])
print('Length of the Embedded sets: dev', len(X_dev), X_dev[0])
print('Length of the Embedded sets: test', len(X_test), X_test[0])

# Length check and padding
X_total = [text.one_hot(rev, vocab_length) for rev in X_total_S]
word_count = lambda review: len(review)
longest = max(X_total, key=word_count)
longest_length = len(longest)

X_train = sequence.pad_sequences(X_train, longest_length, padding='post')
X_dev = sequence.pad_sequences(X_dev, longest_length, padding='post')
X_test = sequence.pad_sequences(X_test, longest_length, padding='post')


X_train = numpy.array(X_train)
X_dev = numpy.array(X_dev)
X_test = numpy.array(X_test)

y_train = np_utils.to_categorical(y_train)
y_dev = np_utils.to_categorical(y_dev)
y_test = np_utils.to_categorical(y_test)
y_train = numpy.array(y_train)
y_dev = numpy.array(y_dev)
y_test = numpy.array(y_test)

print("### !!! CONTROL LINE !!! ###")
print('Length of the Pad-Embedded sets and Example: train,', len(X_train), X_train[0])
print('Length of the Pad-Embedded sets: dev', len(X_dev), X_dev[0])
print('Length of the Pad-Embedded sets: test', len(X_test), X_test[0])
print('Length of the y: train,', len(y_train), y_train[0])
print('Length of the y: dev', len(y_dev), y_dev[0])
print('Length of the y', len(y_test), y_test[0])

### Build the model

print('Building the CNN model...')
model = Sequential()
model.add(Embedding(input_dim=vocab_length, output_dim=300, input_length=longest_length))
model.add(Conv1D(activation="relu", padding="valid", filters=300, kernel_size=7))
model.add(GlobalMaxPooling1D())
model.add(Dense(600, activation='relu'))
model.add(Dense(2, activation='softmax'))

'''
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

#checkpoint
metric = 'val_accuracy'
filepath="/St-weights-multi-best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor=metric, verbose=1, save_best_only=True,
mode= 'max' )
callbacks_list = [checkpoint]

model.fit(X_train, y_train, batch_size=200, epochs=5, validation_data=(X_dev, y_dev), callbacks=callbacks_list)
'''
filepath="/St-weights-multi-best.hdf5"

#load weights and recompile
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# evaluate the model
print("-" * 10)
print(model.layers)
print("-" * 10)
print("evaluation on dev")
scores = model.evaluate(X_dev, y_dev)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
y_pred = model.predict(X_dev)
y_pred = y_pred.argmax(1)
y_dev = y_dev.argmax(1)
print("Accuracy Rate by 'accuracy_score' is: %f" % accuracy_score(y_dev, y_pred))
print("Accuracy Rate by 'f1_score macro' is: %f" % f1_score(y_dev, y_pred, average='macro'))

print("evaluation on test")
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
y_test_pred = model.predict(X_test)
y_test_pred = y_test_pred.argmax(1)
y_test = y_test.argmax(1)
print(confusion_matrix(y_test, y_test_pred))
print("Accuracy Rate by 'accuracy_score' is: %f" % accuracy_score(y_test, y_test_pred))
print("Accuracy Rate by 'f1_score macro' is: %f" % f1_score(y_test, y_test_pred, average='macro'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
### !!! CONTROL LINE !!! ###
Length of the sets: train 36500 36500
Length of the sets: dev 1300 1300
Length of the sets: test 2151 2151
Length of Vocabulary: 79317
### !!! CONTROL LINE !!! ###
Length of the Embedded sets and Example: train, 36500 [29445, 38094, 21918, 25974, 59796, 4240, 45653, 59891, 64825, 23748, 40069, 25974, 63994, 39398, 85135, 3869, 59796, 75928, 80202, 16197, 24949, 41724, 84906, 80529, 7253, 15085, 24006, 6129, 59796, 39934, 30384, 73130, 55648, 84906, 73130, 83055, 14105, 592, 45653, 48437, 9177, 84698, 3869, 25646, 33181, 6129, 24041, 22250, 44900, 23748, 66594, 22828, 33814, 74833, 10595, 33780, 45414, 50953, 48437, 41810, 86587, 33181, 69789, 9808, 26393, 7253, 74833, 57030, 6129, 38852, 25974, 37659, 3657]
Length of the Embedded sets: dev 1300 [38951, 78224, 73996, 33814, 31132, 74574, 25974, 41267, 38210, 83055, 12040, 82869, 68837, 38066, 53412,

In [2]:
import numpy
import tensorflow as tf
import os.path
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.utils import np_utils
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, average_precision_score
nltk.download('punkt')

# Set random seed for reproducibility
seed = 7
numpy.random.seed(seed)

### Load the data, divide in train/dev/test, create vocabulary
vocab = []
X_total_S = []
X_total_T = []
y_total_S = []
y_total_T = []

with open("/steam_dataset.txt", "r") as dataset:
    for line in dataset:
        X_total_S.append(line)
        tokenize_word = word_tokenize(line)
        for word in tokenize_word:
            vocab.append(word)
with open("/twitter_dataset.txt", "r") as dataset:
    for line in dataset:
        X_total_T.append(line)
        tokenize_word = word_tokenize(line)
        for word in tokenize_word:
            vocab.append(word)
        
X_train = X_total_S[:23100] + X_total_T[:23100]
X_dev = X_total_S[23100:23900] + X_total_T[23100:23900]
#set for other test -> X_test = X_total_S[23900:] #set for Steam test
X_test = X_total_T[23900:] #set for test on Tweets
        
with open("/steam_ratings.txt", "r") as ratings:
    for line in ratings:
        y_total_S.append([int(line)])

with open("/twitter_ratings.txt", "r") as ratings:
    for line in ratings:
        y_total_T.append([int(line)])

y_train = y_total_S[:23100] + y_total_T[:23100]
y_dev = y_total_S[23100:23900] + y_total_T[23100:23900]
#set for other test -> y_test = y_total_S[23900:] #set for Steam test 
y_test = y_total_T[23900:26051] #set for test on Tweets

unique_words = set(vocab)
vocab = unique_words
vocab_length = len(vocab) + int(len(vocab) / 10) # Added len for unknw words

print("### !!! CONTROL LINE !!! ###")
print('Length of the sets: train', len(X_train), len(y_train))
print('Length of the sets: dev', len(X_dev), len(y_dev))
print('Length of the sets: test', len(X_test), len(y_test))
print('Length of Vocabulary:', len(vocab))


### Review Embedding
X_train = [text.one_hot(rev, vocab_length) for rev in X_train]
X_dev = [text.one_hot(rev, vocab_length) for rev in X_dev]
X_test = [text.one_hot(rev, vocab_length) for rev in X_test]

print("### !!! CONTROL LINE !!! ###")
print('Length of the Embedded sets and Example: train,', len(X_train), X_train[0])
print('Length of the Embedded sets: dev', len(X_dev), X_dev[0])
print('Length of the Embedded sets: test', len(X_test), X_test[0])

# Length check and padding
X_total = [text.one_hot(rev, vocab_length) for rev in X_total_S]
word_count = lambda review: len(review)
longest = max(X_total, key=word_count)
longest_length = len(longest)

X_train = sequence.pad_sequences(X_train, longest_length, padding='post')
X_dev = sequence.pad_sequences(X_dev, longest_length, padding='post')
X_test = sequence.pad_sequences(X_test, longest_length, padding='post')


X_train = numpy.array(X_train)
X_dev = numpy.array(X_dev)
X_test = numpy.array(X_test)

y_train = np_utils.to_categorical(y_train)
y_dev = np_utils.to_categorical(y_dev)
y_test = np_utils.to_categorical(y_test)
y_train = numpy.array(y_train)
y_dev = numpy.array(y_dev)
y_test = numpy.array(y_test)

print("### !!! CONTROL LINE !!! ###")
print('Length of the Pad-Embedded sets and Example: train,', len(X_train), X_train[0])
print('Length of the Pad-Embedded sets: dev', len(X_dev), X_dev[0])
print('Length of the Pad-Embedded sets: test', len(X_test), X_test[0])
print('Length of the y: train,', len(y_train), y_train[0])
print('Length of the y: dev', len(y_dev), y_dev[0])
print('Length of the y', len(y_test), y_test[0])

### Build the model

print('Building the CNN model...')
model = Sequential()
model.add(Embedding(input_dim=vocab_length, output_dim=300, input_length=longest_length))
model.add(Conv1D(activation="relu", padding="valid", filters=300, kernel_size=7))
model.add(GlobalMaxPooling1D())
model.add(Dense(600, activation='relu'))
model.add(Dense(2, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())
'''
#checkpoint
metric = 'val_accuracy'
filepath="/ST-weights-multi-best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor=metric, verbose=1, save_best_only=True,
mode= 'max' )
callbacks_list = [checkpoint]

# Since 5 epochs are not enough for this model, up to 8
model.fit(X_train, y_train, batch_size=200, epochs=8, validation_data=(X_dev, y_dev), callbacks=callbacks_list)
'''
filepath="/ST-weights-multi-best.hdf5"

#load weights and recompile
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# evaluate the model
print("-" * 10)
print(model.layers)
print("-" * 10)
print("evaluation on dev")
scores = model.evaluate(X_dev, y_dev)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
y_pred = model.predict(X_dev)
y_pred = y_pred.argmax(1)
y_dev = y_dev.argmax(1)
print("Accuracy Rate by 'accuracy_score' is: %f" % accuracy_score(y_dev, y_pred))
print("Accuracy Rate by 'f1_score macro' is: %f" % f1_score(y_dev, y_pred, average='macro'))

print("evaluation on test")
scores = model.evaluate(X_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
y_test_pred = model.predict(X_test)
y_test_pred = y_test_pred.argmax(1)
y_test = y_test.argmax(1)
print(confusion_matrix(y_test, y_test_pred))
print("Accuracy Rate by 'accuracy_score' is: %f" % accuracy_score(y_test, y_test_pred))
print("Accuracy Rate by 'f1_score macro' is: %f" % f1_score(y_test, y_test_pred, average='macro'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
### !!! CONTROL LINE !!! ###
Length of the sets: train 46200 46200
Length of the sets: dev 1600 1600
Length of the sets: test 2151 2151
Length of Vocabulary: 79317
### !!! CONTROL LINE !!! ###
Length of the Embedded sets and Example: train, 46200 [29445, 38094, 21918, 25974, 59796, 4240, 45653, 59891, 64825, 23748, 40069, 25974, 63994, 39398, 85135, 3869, 59796, 75928, 80202, 16197, 24949, 41724, 84906, 80529, 7253, 15085, 24006, 6129, 59796, 39934, 30384, 73130, 55648, 84906, 73130, 83055, 14105, 592, 45653, 48437, 9177, 84698, 3869, 25646, 33181, 6129, 24041, 22250, 44900, 23748, 66594, 22828, 33814, 74833, 10595, 33780, 45414, 50953, 48437, 41810, 86587, 33181, 69789, 9808, 26393, 7253, 74833, 57030, 6129, 38852, 25974, 37659, 3657]
Length of the Embedded sets: dev 1600 [48437, 3009, 2643, 10827, 77474, 31704, 50111, 79212]
Length of the Embedded sets: test 2151 [69924, 462