### Import the necessary libraries

In [34]:
from numpy import array, asarray, zeros
from keras.preprocessing.text import one_hot, Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

### Define the labels

In [35]:
import pandas as pd
df = pd.read_csv('all-data.csv',encoding='latin1',header=None)              #load the csv
df.head(3)

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...


In [36]:
df.columns = ['label','content']                          #assign names
df.head(3)

Unnamed: 0,label,content
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...


In [37]:
label_mapping = {'neutral': 0, 'negative': -1, 'positive': 1}
df['label'] = df['label'].map(label_mapping)                    #convert to label
df.head(3)

Unnamed: 0,label,content
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,-1,The international electronic industry company ...


In [38]:
labels = array(df['label'].tolist())                   #list then array

labels

array([ 0,  0, -1, ..., -1, -1, -1])

In [39]:
len(labels)==len(df)                 #make sure all labels are transferred 

True

### Define the corpus

In [40]:
docs=df['content'].tolist()                   #list

docs

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .',
 'The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .',
 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .',
 "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
 "FINANCING OF ASPOCO

In [41]:
len(docs)==len(df)    #make sure all contents are transferred 

True

### Tokenizer

In [42]:
t = Tokenizer()
t.fit_on_texts(docs)

In [43]:
vocab_size = len(t.word_index) + 1
vocab_size

10123

### Integer encode the documents

In [44]:
# integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

[[94, 5, 3498, 1, 11, 16, 250, 336, 5, 655, 124, 88, 5, 150, 2796, 29, 10, 424, 1, 11, 10, 747], [840, 336, 5, 876, 3, 1841, 39, 193, 2, 250, 1102, 100, 292, 63, 574, 1842, 3, 126, 5, 3499, 96, 748, 3, 1186, 575, 4, 749, 1, 929, 19], [1, 293, 656, 142, 11, 337, 16, 1284, 285, 3500, 2, 167, 13, 15, 972, 279, 5011, 5, 147, 1502, 1, 11, 2352, 1, 5012, 2, 15, 267, 973, 1, 877, 5013, 251], [17, 1, 48, 88, 101, 1, 11, 135, 115, 15, 268, 5, 1503, 1, 154, 115, 3, 478, 4, 135, 657, 1, 211, 2, 687, 526, 4, 1641, 115, 1, 88, 1187], [94, 5, 1, 11, 9, 2353, 349, 7, 1, 260, 41, 1188, 303, 1103, 6, 218, 272, 30, 27, 197, 3, 1, 373, 2, 151, 275, 17, 39, 42, 31, 508, 2, 84, 151, 2, 30, 27], [878, 2, 554, 9, 197, 554, 10, 5014, 5015, 15, 197, 349, 18, 2354, 1504, 14, 3501, 116, 1843, 5016, 1505, 1189, 1844, 2797], [7, 1, 155, 51, 2, 53, 634, 9, 30, 27, 2070, 5, 5017, 13, 5018, 7, 1, 164, 50, 6, 32, 147, 180, 26, 2355, 5, 6, 5019, 799, 841, 31, 13, 6, 799, 841, 75, 2, 2798], [3, 1, 157, 51, 2, 53, 30, 27

### Padding documents to a desired max_length

In [45]:
longest_length = len(max(docs, key=lambda x: len(x)))

print(longest_length)

315


In [46]:
max_length = longest_length
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

### ①glove.6B.100d.txt utilization

#### A. Load the GloVe Embedding into memory

In [47]:
embeddings6B100d_index = dict()                    #initialize an empty dictionary 
f = open('glove.6B.100d.txt')

for line in f:                        #for each line
    values = line.split()             #split this line into a list of values
    word = values[0]                  #1st value in the line is actually the word, -->"word"
    # print(word)                        #too long, not print here.
    coefs = asarray(values[1:], dtype='float32')  #embedding vector,-->"coefs"
    embeddings6B100d_index[word] = coefs     #"word" and "coefs" as pair. too long not print here.

f.close()

In [48]:
print('Loaded %s word vectors.' % len(embeddings6B100d_index))

Loaded 400000 word vectors.


#### B. Create Embedding matrix for the custom dataset

In [49]:
# create a weight matrix for words in training docs
embedding6B100d_matrix = zeros((vocab_size, 100))

In [50]:
len(t.word_index), vocab_size

(10122, 10123)

In [51]:
print(t.word_index)



In [52]:
for word, i in t.word_index.items():
    embedding6B100d_vector = embeddings6B100d_index.get(word)  #embeddings_index,'word-coefs' pair. Looking for 10122 words.Everytime returns 100 vectors. 
    if embedding6B100d_vector is not None:
        #put these 100 vectors, the place where corresponds the word.
        embedding6B100d_matrix[i] = embedding6B100d_vector
        #only look for 10122 words, so 10122 rows will be filled. 
        #t.word_index starts from 1, so the row left will be #0. 

print(embedding6B100d_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.038194   -0.24487001  0.72812003 ... -0.1459      0.82779998
   0.27061999]
 [-0.1529     -0.24279     0.89837003 ... -0.59100002  1.00390005
   0.20664001]
 ...
 [ 0.46875    -0.12616     0.14973    ... -0.027374   -0.097867
  -0.092297  ]
 [ 0.37041     1.05400002  0.22189    ... -0.038925    0.28915
   1.27030003]
 [ 0.26183     0.59227002  0.76067001 ... -0.016644    0.29356
  -0.67343998]]


In [53]:
embedding6B100d_matrix.shape    #check

(10123, 100)

#### C. Define the Embedding layer

In [54]:
e = Embedding(vocab_size, 100, weights=[embedding6B100d_matrix], input_length=longest_length, trainable=False)

#### D. Define the model

In [55]:
model6B100d = Sequential()
model6B100d.add(e)
model6B100d.add(Flatten())
model6B100d.add(Dense(1, activation='sigmoid'))

#### E. Compiling the model

In [56]:
model6B100d.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

#### F. Summarize the model

In [57]:
print(model6B100d.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 315, 100)          1012300   
                                                                 
 flatten_2 (Flatten)         (None, 31500)             0         
                                                                 
 dense_2 (Dense)             (None, 1)                 31501     
                                                                 
Total params: 1,043,801
Trainable params: 31,501
Non-trainable params: 1,012,300
_________________________________________________________________
None


#### G. Build the model

In [58]:
model6B100d.fit(padded_docs, labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x2db8da55ed0>

#### H. Evaluating the model

In [59]:
loss, accuracy = model6B100d.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 67.973584


### ②glove.6B.300d.txt utilization

In [60]:
embeddings6B300d_index = dict()                    #initialize an empty dictionary 
f = open('glove.6B.300d.txt')

for line in f:                        #for each line
    values = line.split()             #split this line into a list of values
    word = values[0]                  #1st value in the line is actually the word, -->"word"
    coefs = asarray(values[1:], dtype='float32')  #embedding vector,-->"coefs"
    embeddings6B300d_index[word] = coefs     #"word" and "coefs" as pair. too long not print here.

f.close()

print('Loaded %s word vectors.' % len(embeddings6B300d_index))

Loaded 400000 word vectors.


In [61]:
# create a weight matrix for words in training docs
embedding6B300d_matrix = zeros((vocab_size, 300))

for word, i in t.word_index.items():
    embedding6B300d_vector = embeddings6B300d_index.get(word) 
    if embedding6B300d_vector is not None:
        embedding6B300d_matrix[i] = embedding6B300d_vector

print(embedding6B300d_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.04656     0.21318001 -0.0074364  ...  0.0090611  -0.20988999
   0.053913  ]
 [-0.076947   -0.021211    0.21270999 ...  0.18351001 -0.29183
  -0.046533  ]
 ...
 [ 0.085362    0.54032999 -0.72474003 ... -0.68325001  0.14291
  -0.40482   ]
 [-0.34834999 -0.1523      0.028034   ... -0.22039001 -0.12592
   0.1101    ]
 [-0.30125001  0.33511999  0.74861002 ... -0.44067001  0.40008
   0.31669   ]]


In [62]:
e = Embedding(vocab_size, 300, weights=[embedding6B300d_matrix], input_length=max_length, trainable=False)

In [63]:
model6B300d = Sequential()
model6B300d.add(e)
model6B300d.add(Flatten())
model6B300d.add(Dense(1, activation='sigmoid'))

In [64]:
model6B300d.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [65]:
print(model6B300d.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 315, 300)          3036900   
                                                                 
 flatten_3 (Flatten)         (None, 94500)             0         
                                                                 
 dense_3 (Dense)             (None, 1)                 94501     
                                                                 
Total params: 3,131,401
Trainable params: 94,501
Non-trainable params: 3,036,900
_________________________________________________________________
None


In [66]:
model6B300d.fit(padded_docs, labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x2dbabc8cd30>

In [67]:
loss, accuracy = model6B300d.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 70.759392


### ③glove.840B.300d.txt utilization

In [68]:
embeddings840B300d_index = {}
f = open('glove.840B.300d.txt')
try:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], 'float32')
        embeddings840B300d_index[word] = coefs
except:
    f.__next__()
f.close()

print('Found %s word vectors.' % len(embeddings840B300d_index))


Found 52343 word vectors.


In [69]:
# create a weight matrix for words in training docs
embedding840B300d_matrix = zeros((vocab_size, 300))

for word, i in t.word_index.items():
    embedding840B300d_vector = embeddings840B300d_index.get(word) 
    if embedding840B300d_vector is not None:
        embedding840B300d_matrix[i] = embedding840B300d_vector

print(embedding840B300d_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.27204001 -0.06203    -0.1884     ...  0.13015001 -0.18317001
   0.1323    ]
 [ 0.060216    0.21799    -0.04249    ...  0.11709    -0.16692001
  -0.094085  ]
 ...
 [-0.25397     0.47027999  0.53327    ... -0.10561     0.053038
   0.091461  ]
 [-0.1052     -0.23491     0.11679    ... -0.23215     0.031999
   0.56875002]
 [ 0.45069     0.55756003  0.44751999 ... -0.10301    -0.23932999
  -0.086815  ]]


In [70]:
e = Embedding(vocab_size, 300, weights=[embedding840B300d_matrix], input_length=max_length, trainable=False)

In [71]:
model840B300d = Sequential()
model840B300d.add(e)
model840B300d.add(Flatten())
model840B300d.add(Dense(1, activation='sigmoid'))

In [72]:
model840B300d.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [73]:
print(model840B300d.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 315, 300)          3036900   
                                                                 
 flatten_4 (Flatten)         (None, 94500)             0         
                                                                 
 dense_4 (Dense)             (None, 1)                 94501     
                                                                 
Total params: 3,131,401
Trainable params: 94,501
Non-trainable params: 3,036,900
_________________________________________________________________
None


In [74]:
model840B300d.fit(padded_docs, labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x2dbaf124b20>

In [75]:
loss, accuracy = model840B300d.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 72.410238


| embedding vectors | accuracies  |    
| ----------------- | ----------- |
| glove.6B.100d     |   67.97%    | 
| glove.6B.300d     |   70.76%    |
| glove.840B.300d   |   72.41%    |



With such neural netwrok, glove.840B.300d performs the best.

In [77]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [82]:
import numpy as np
class GloveVectorizer:
  def __init__(self):
    # load in pre-trained word vectors
    print('Loading word vectors from Glove...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open('glove.6B.100d.txt') as f:
      # is just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        word = values[0]
        vec = asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    # save for later
    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {v:k for k,v in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    pass

  def transform(self, data):
    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.lower().split()
      vecs = []
      for word in tokens:
        if word in self.word2vec:
          vec = self.word2vec[word]
          vecs.append(vec)
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

In [83]:
glove=GloveVectorizer()

Loading word vectors from Glove...
Found 400000 word vectors.


In [88]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(df.content,df.label,
                                                random_state=42)

In [89]:
xtrain = glove.fit_transform(Xtrain)
xtest = glove.transform(Xtest)

Numer of samples with no words found: 0 / 3634
Numer of samples with no words found: 0 / 1212


In [90]:
svcglove=SVC(kernel='rbf', class_weight='balanced')
svcglove.fit(xtrain,ytrain)

svcglove.score(xtest,ytest)

0.6666666666666666

In [91]:
knnglove = KNeighborsClassifier()
knnglove.fit(xtrain,ytrain)

knnglove.score(xtest,ytest)

0.6658415841584159

In [92]:
nbglove = GaussianNB()
nbglove.fit(xtrain,ytrain)

nbglove.score(xtest,ytest)

0.5750825082508251

| algorithms | accuracies  |    
| -----------| ----------- |
| SVM        |   66.67%    | 
| KNN        |   66.58%    |
| GaussianNB |   57.51%    |

Previously glove.6B.100d performs the worst. This time I try to use it with algorithms, rather than neural network though a simple one. These algorithms all perform worse than that simple neural network. 