In [None]:
import keras
import gc
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout, Flatten
from keras.utils import np_utils
from keras import optimizers
from keras import regularizers
from keras.utils.training_utils import multi_gpu_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding

# <span style="color:red"> Before this line are method predefined</span>

In [2]:
def clean_str(string):  
    """ 
    Tokenization/string cleaning for dataset 
    Every dataset is lower cased except 
    """  
    sens = word_tokenize(string.lower())
    sens = [word for word in sens if not word in english_stopwords]
    sens = [word for word in sens if not word in english_punctuations]
    sens = [lemmatizer.lemmatize(word) for word in sens]
    sens = [word for word in sens if word.isalpha()]
    sens = ' '.join(sens)
    return sens

In [3]:
def plot_2d(X, label):
    # only for this case!
    plt.figure()
    # plt.scatter(aa[:,0],aa[:,1])
    point_1 = []
    point_0 = []
    point_2 = []
    for i in range(len(label)):
        if label[i]== '1' or label[i]== 1:
            point_1.append(X[i])
        elif label[i]== '0' or label[i]== 0:
            point_0.append(X[i])
        else:
            point_2.append(X[i])
    point_1 = np.asarray(point_1)
    point_0 = np.asarray(point_0)
    point_2 = np.asarray(point_2)
    plt.scatter(point_1[:,0],point_1[:,1],color='red')
    plt.scatter(point_0[:,0],point_0[:,1],color='g')
    plt.scatter(point_2[:,0],point_2[:,1],color='b')
    plt.show()

In [4]:
def vis_tsne(X, label):
    ts = TSNE()
    X_lower = ts.fit_transform(X.reshape(X.shape[0], X.shape[2]),)
    plot_2d(X_lower, label)
    return X
    

# <span style="color:red"> Now we do some preprocessing</span>

In [5]:
df = pd.read_csv('../data/Tweets.csv')

In [6]:
LEARNING_RATE = 0.02
MAX_FEATURES = 2000
BATCH_SIZE = 8
EPOCHS = 100
DECAY = 2e-4  # about half each epoch

In [7]:
X = df['text']
y = df['airline_sentiment']
y.replace({'neutral':'2', 'positive':'1', 'negative':'0'}, inplace = True)

In [8]:
y.value_counts()

0    9178
2    3099
1    2363
Name: airline_sentiment, dtype: int64

In [9]:
# preprocssing, stopwords and rare words, tokenization and vectorizing
lemmatizer=WordNetLemmatizer()
english_stopwords = stopwords.words('english')
english_punctuations = [',', '.','\'s', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
X = X.apply(clean_str)

# vectorizing using tfidf
vectorizer = CountVectorizer(ngram_range = (1,2), max_df = 0.95,min_df = 0.001, max_features = MAX_FEATURES)
X = vectorizer.fit_transform(X)

#  <span style="color:red"> For our first model, bidirectional LSTM with fine-tuning </span> 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 517)
X_train = X_train.toarray()
X_train = np.reshape(X_train,(X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.toarray()
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

y_train = np_utils.to_categorical(y_train, num_classes=3)
y_test = np_utils.to_categorical(y_test, num_classes=3)

X_train.shape

(11712, 1, 1618)

In [11]:
bi_lstm = Sequential()
#bi_lstm.add(Dense(512, activation = 'relu', input_shape = (1, X_train.shape[2])))
#bi_lstm.add(Dropout(0.3))
bi_lstm.add(Bidirectional(LSTM(128, recurrent_dropout = 0.3, return_sequences=False),input_shape = (1, X_train.shape[2])))
#bi_lstm.add(Dense(64, activation = 'relu'))
bi_lstm.add(Dense(3,activation = 'softmax'))
bi_lstm = multi_gpu_model(bi_lstm)
bi_lstm.summary()

earlystopping = keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 15, verbose = 1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
bidirectional_1_input (InputLay (None, 1, 1618)      0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 1, 1618)      0           bidirectional_1_input[0][0]      
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 1, 1618)      0           bidirectional_1_input[0][0]      
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 3)            1789699     lambda_1[0][0]                   
                                                                 lambda_2[0][0]                   
__________

In [12]:
bi_lstm.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr = LEARNING_RATE), metrics=['accuracy'])
bi_lstm.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks = [earlystopping], verbose=2)

Train on 9369 samples, validate on 2343 samples
Epoch 1/100
 - 26s - loss: 0.8912 - acc: 0.6314 - val_loss: 0.8736 - val_acc: 0.6116
Epoch 2/100
 - 20s - loss: 0.8199 - acc: 0.6348 - val_loss: 0.8076 - val_acc: 0.6287
Epoch 3/100
 - 20s - loss: 0.7561 - acc: 0.6651 - val_loss: 0.7534 - val_acc: 0.6633
Epoch 4/100
 - 20s - loss: 0.7033 - acc: 0.7058 - val_loss: 0.7097 - val_acc: 0.6957
Epoch 5/100
 - 20s - loss: 0.6612 - acc: 0.7315 - val_loss: 0.6810 - val_acc: 0.7089
Epoch 6/100
 - 20s - loss: 0.6287 - acc: 0.7436 - val_loss: 0.6547 - val_acc: 0.7243
Epoch 7/100
 - 20s - loss: 0.6024 - acc: 0.7503 - val_loss: 0.6367 - val_acc: 0.7294
Epoch 8/100
 - 20s - loss: 0.5813 - acc: 0.7595 - val_loss: 0.6245 - val_acc: 0.7367
Epoch 9/100
 - 20s - loss: 0.5624 - acc: 0.7654 - val_loss: 0.6128 - val_acc: 0.7439
Epoch 10/100
 - 20s - loss: 0.5461 - acc: 0.7733 - val_loss: 0.6059 - val_acc: 0.7486
Epoch 11/100
 - 20s - loss: 0.5319 - acc: 0.7814 - val_loss: 0.6004 - val_acc: 0.7482
Epoch 12/100
 -

<keras.callbacks.History at 0x7f36dc0e5320>

In [13]:
bi_lstm.evaluate(X_test, y_test)



[0.5990455647309622, 0.7667349726775956]

#  <span style="color:red">SVM with Tf-Idf!</span>
###  Use train and test from above. We are doing based on One-hot embedding method. We will try to do GloVe later.

In [142]:
# Use train and test from above. We are doing based on TFIDF embedding method. We will try to do GloVe later.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 517)

In [143]:
from sklearn import svm, grid_search

# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10,100], 'gamma':[0.1,1,10]}
# svr = svm.SVC(class_weight = 'balanced', verbose = True)
# clf = grid_search.GridSearchCV(svr, parameters)
# clf.fit(X_train, y_train)



[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1, 10, 100], 'gamma': [0.1, 1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [144]:
clf.best_estimator_

SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

In [145]:
clf.score(X_test, y_test)

0.764344262295082

In [147]:
clf = svm.SVC(kernel = 'rbf',gamma = 0.1, C = 10,class_weight = 'balanced', verbose = True)

# <span style="color:red">Using GloVe pretrained word vector to do embedding!<span>

In [134]:
LEARNING_RATE = 0.02
MAX_SEQUENCE_LENGTH = 120
EMBEDDING_DIM = 100
BATCH_SIZE = 64
EPOCHS = 20
DECAY = 5e-5  # about half each epoch

In [135]:
X = df['text']
y = df['airline_sentiment']
y.replace({'neutral':'2', 'positive':'1', 'negative':'0'}, inplace = True)
print("X shape: {},  y shape:{}".format(X.shape, y.shape))

X shape: (14640,),  y shape:(14640,)


In [136]:
# preprocssing, stopwords and rare words, tokenization and vectorizing
lemmatizer=WordNetLemmatizer()
english_stopwords = stopwords.words('english')
english_punctuations = [',', '.','\'s', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
X = X.apply(clean_str)
print("X shape: {},  y shape:{}".format(X.shape, y.shape))

X shape: (14640,),  y shape:(14640,)


In [137]:
# After getting rid of stopwords
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)

Found 10411 unique tokens.
Shape of data tensor: (14640, 120)


In [138]:
# OK, let's do embedding
embedding_index = {}
with open ('../data/glove.twitter.27B.100d.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embedding_index[word] = coefs
        
print('Found %s word vectors' % len(embedding_index))

Found 1193514 word vectors


In [139]:
# Then embedding!
embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [140]:
X_train, X_test, y_train, y_test = train_test_split(data,y,test_size = 0.2, random_state = 517)

y_train = np_utils.to_categorical(y_train, num_classes=3)
y_test = np_utils.to_categorical(y_test, num_classes=3)

X_train.shape

(11712, 120)

In [141]:
bi_lstm_glo = Sequential()
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,
                            name='embedding_layer')
bi_lstm_glo.add(embedding_layer)
bi_lstm_glo.add(Dropout(0.2))
bi_lstm_glo.add(Bidirectional(LSTM(128, return_sequences=False)))
#bi_lstm.add(Dense(64, activation = 'relu'))
bi_lstm_glo.add(Dense(3,activation = 'softmax'))

bi_lstm_glo = multi_gpu_model(bi_lstm_glo)
bi_lstm_glo.summary()

earlystopping = keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 5, verbose = 1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_layer_input (InputLay (None, 120)          0                                            
__________________________________________________________________________________________________
lambda_29 (Lambda)              (None, 120)          0           embedding_layer_input[0][0]      
__________________________________________________________________________________________________
lambda_30 (Lambda)              (None, 120)          0           embedding_layer_input[0][0]      
__________________________________________________________________________________________________
sequential_19 (Sequential)      (None, 3)            1276467     lambda_29[0][0]                  
                                                                 lambda_30[0][0]                  
__________

In [142]:
gc.collect()

bi_lstm_glo.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr = LEARNING_RATE), metrics=['accuracy'])
bi_lstm_glo.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks = [earlystopping], verbose=1)

Train on 9369 samples, validate on 2343 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 00017: early stopping


<keras.callbacks.History at 0x7f3550e55cf8>

In [143]:
bi_lstm_glo.evaluate(X_test, y_test)



[0.6512906798899499, 0.742827868852459]

# <span style="color:red">Using GloVe pretrained word vector to make a Feedforward neural net!<span>

In [144]:
mlp = Sequential()
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,
                            name='embedding_layer')
mlp.add(embedding_layer)
mlp.add(Dropout(0.3))
mlp.add(Flatten())
mlp.add(Dense(128))
mlp.add(Dropout(0.3))
mlp.add(Dense(16))
mlp.add(Dense(3,activation = 'softmax'))

mlp = multi_gpu_model(bi_lstm_glo)
mlp.summary()

earlystopping = keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 15, verbose = 1)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
embedding_layer_input (InputLay (None, 120)          0                                            
__________________________________________________________________________________________________
lambda_31 (Lambda)              (None, 120)          0           embedding_layer_input[0][0]      
__________________________________________________________________________________________________
lambda_32 (Lambda)              (None, 120)          0           embedding_layer_input[0][0]      
__________________________________________________________________________________________________
model_15 (Model)                (None, 3)            1276467     lambda_31[0][0]                  
                                                                 lambda_32[0][0]                  
__________

In [145]:
gc.collect()
mlp.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr = LEARNING_RATE), metrics=['accuracy'])
mlp.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.2, callbacks = [earlystopping], verbose=1)

Train on 9369 samples, validate on 2343 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f354985c3c8>

In [146]:
mlp.evaluate(X_test, y_test)



[0.573737268565131, 0.7684426229508197]