In [1]:
import pandas
import numpy
from sklearn import model_selection, preprocessing, metrics
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras import layers

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend presumably
# keeps its own random state, so training results may still vary between runs
# -- TODO confirm and seed the backend as well if exact repeatability is needed.
numpy.random.seed(7)

Using TensorFlow backend.


In [2]:
# import dataset
# give the proper csv file path
dataset = pandas.read_csv("BCC_Sports.csv", encoding="latin-1")
# preview only the first rows instead of rendering the whole frame
dataset.head(10)

Unnamed: 0,text,label
0,Henman overcomes rival RusedskiTim Henman save...,tennis
1,Safin slumps to shock Dubai lossMarat Safin su...,tennis
2,Ferrero eyes return to top formFormer world nu...,tennis
3,Roddick into San Jose finalAndy Roddick will p...,tennis
4,Federer claims Dubai crownWorld number one Rog...,tennis
5,Young debut cut short by GinepriFifteen-year-o...,tennis
6,Melzer shocks AgassiSecond seed Andre Agassi s...,tennis
7,Federer forced to dig deepTop seed Roger Feder...,tennis
8,Nadal marches on in MexicoRafael Nadal continu...,tennis
9,Hantuchova in Dubai last eightDaniela Hantucho...,tennis


In [3]:
# split the dataset into training and test set
# 50% training data and 50% testing data
# shuffle the dataset
x_train, x_test, y_train, y_test = model_selection.train_test_split(dataset['text'], dataset['label'], test_size=0.50, shuffle=True)

# label encode the target variable
# The encoder is fitted on the training labels only and the SAME fitted mapping
# is applied to the test labels. Re-fitting on the test labels would build a
# second, independent label->integer mapping that is not guaranteed to agree
# with the one used for training.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# datasets details
print('Number of Training Samples: {}'.format(len(x_train)))
print('Number of Test Samples: {}'.format(len(x_test)))

# one class per distinct label seen while fitting the encoder
num_classes = len(encoder.classes_)
print('Number of Classes: {}'.format(num_classes))
print(x_train[0:5])
print(y_train[0:5])

Number of Training Samples: 368
Number of Test Samples: 369
Number of Classes: 5
408    Everton's Weir cools Euro hopesEverton defende...
147    O'Driscoll/Gregan lead Aid starsIreland's Bria...
14     Mirza shocks KuznetsovaSania Mirza continued h...
174    Saint-Andre anger at absent starsSale Sharks d...
86     Big guns ease through in San JoseTop-seeded Am...
Name: text, dtype: object
[2 3 4 3 4]


In [4]:
# preprocess dataset

# build the word -> integer vocabulary from the text column of the dataset
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dataset['text'])
word_index = tokenizer.word_index

# turn each split into integer sequences and pad them so that every
# document is represented by a vector of the same length
max_length = 50
newx_train, newx_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(docs), maxlen=max_length)
    for docs in (x_train, x_test)
)

In [5]:
# the pre-trained word-embedding is used

def load_word_embedding(filename):
    """Load a whitespace-separated word-embedding text file into a dict.

    Every non-blank line is split on whitespace; the first field is used as
    the word and the remaining fields are converted to a vector.

    Parameters
    ----------
    filename : str
        Path of the embedding file. It is read as UTF-8 text.

    Returns
    -------
    dict
        Maps each word (str) to a ``float32`` numpy array built from the
        remaining fields of its line. If a word occurs more than once the
        last occurrence wins.

    Raises
    ------
    ValueError
        If a vector field cannot be converted to a float.
    """
    embedding = dict()
    # The context manager closes the file even when a line fails to parse, and
    # iterating the handle streams the file instead of loading it all at once.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            # a blank line has no word field; skip it rather than fail on parts[0]
            if not parts:
                continue
            # key is string word, value is numpy array for vector
            embedding[parts[0]] = numpy.asarray(parts[1:], dtype='float32')
    return embedding

# load embedding from file
# give proper path of the word_embedding file
# NOTE(review): the file name suggests 50-dimensional GloVe vectors -- confirm
# the file actually used matches the embedding width assumed further down.
word_embedding = load_word_embedding("glove.6B.50d.txt")
# word_embedding is a dict mapping each word to its float32 vector

In [6]:
# a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, embedding_dim=None):
    """Build the initial weight matrix for an Embedding layer.

    Parameters
    ----------
    embedding : dict
        Maps a word to its vector (any sequence of numbers of equal length).
    vocab : dict
        Maps a word to the integer row index it should occupy.
    embedding_dim : int, optional
        Number of columns of the matrix. When omitted it is taken from the
        length of the first vector in ``embedding``; if ``embedding`` is
        empty the previous fixed width of 50 is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, embedding_dim)``. Row ``i`` holds
        the vector of the word whose index in ``vocab`` is ``i``; rows for
        words that are missing from ``embedding`` stay all zeros.
    """
    if embedding_dim is None:
        # infer the width from the data instead of assuming 50-d vectors
        first_vector = next(iter(embedding.values()), None)
        embedding_dim = 50 if first_vector is None else len(first_vector)
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = numpy.zeros((vocab_size, embedding_dim))
    # step vocab, store vectors using the vocab's integer mapping
    for word, i in vocab.items():
        # single lookup; words without a pre-trained vector keep their zero row
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

# get vectors in the right order: row i of word_vectors is the pre-trained
# vector of the word whose tokenizer index is i (all zeros when the word has
# no entry in word_embedding)
word_vectors = get_weight_matrix(word_embedding, tokenizer.word_index)

In [7]:
# vector representation
# show the weight-matrix row stored at index 10, i.e. the vector assigned to
# the word whose tokenizer index is 10
print(word_vectors[10])

[ 1.18910000e-01  1.52549997e-01 -8.20730031e-02 -7.41439998e-01
  7.59169996e-01 -4.83280003e-01 -3.10090005e-01  5.14760017e-01
 -9.87079978e-01  6.17570011e-04 -1.50429994e-01  8.37700009e-01
 -1.07969999e+00 -5.14599979e-01  1.31879997e+00  6.20069981e-01
  1.37789994e-01  4.71080005e-01 -7.28740022e-02 -7.26750016e-01
 -7.41159976e-01  7.52629995e-01  8.81799996e-01  2.95610011e-01
  1.35479999e+00 -2.57010007e+00 -1.35230005e+00  4.58799988e-01
  1.00680006e+00 -1.18560004e+00  3.47370005e+00  7.78980017e-01
 -7.29290009e-01  2.51020014e-01 -2.61559993e-01 -3.46839994e-01
  5.58409989e-01  7.50980020e-01  4.98299986e-01 -2.68229991e-01
 -2.74430006e-03 -1.82980001e-02 -2.80959994e-01  5.53179979e-01
  3.77059989e-02  1.85550004e-01 -1.50250003e-01 -5.75119972e-01
 -2.66710013e-01  9.21209991e-01]


In [8]:
# Basic check: what fraction of the weight-matrix rows contain at least one
# non-zero value, i.e. actually received a pre-trained vector

vocab_size = 1 + len(word_index)
nonzero_elements = numpy.count_nonzero(word_vectors.any(axis=1))
nonzero_elements / vocab_size

0.8575317001925248

In [9]:
# build the model for the classification

# Sizes are derived from the objects they must agree with rather than repeated
# as literals, so changing the padding length or the embedding file cannot
# leave the model out of sync with its inputs.
vocab_size = len(word_index) + 1
maxlen = max_length
embedding_dim = word_vectors.shape[1]

# define model: frozen pre-trained embedding -> 1D convolution -> global max
# pooling -> softmax over the classes
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, weights=[word_vectors], input_length=maxlen, trainable=False))
model.add(layers.Conv1D(64, 3, activation='relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(num_classes, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            753150    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 64)            9664      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 763,139
Trainable params: 9,989
Non-trainable params: 753,150
_________________________________________________________________
None


In [10]:
from keras.utils import to_categorical

# One-hot encode the integer labels. num_classes is passed explicitly so both
# matrices always have as many columns as the model's output layer, even if
# one of the splits happens not to contain the highest class index.
newy_train = to_categorical(y_train, num_classes=num_classes)
newy_test = to_categorical(y_test, num_classes=num_classes)

In [11]:
# compile model for multi-class classification with one-hot targets
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# training hyper-parameters
epochs = 3
batch_size = 32

# train with 10% of the training data held out for validation
history = model.fit(
    newx_train,
    newy_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# evaluate() returns [loss, accuracy] for the compiled loss and metric
score = model.evaluate(newx_train, newy_train, verbose=1, batch_size=batch_size)
scores = model.evaluate(newx_test, newy_test, verbose=1, batch_size=batch_size)

# report training-set performance
print('Train loss:{}'.format(score[0]))
print("Train Accuracy: %.2f%%" % (score[1]*100))

# report test-set performance
print('Test loss:{}'.format(scores[0]))
print("Accuracy: %.2f%%" % (scores[1]*100))

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 331 samples, validate on 37 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Train loss:0.975599045338838
Train Accuracy: 71.20%
Test loss:1.1070873423966612
Accuracy: 61.25%
