In [1]:
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding



Using TensorFlow backend.


In [0]:
# data_folder = './twitter-datasets/'
data_folder = './'


# Reading the data
positive_path = os.path.join(data_folder,'train_pos.txt')
negative_path = os.path.join(data_folder,'train_neg.txt')

In [0]:
lines_positive = [line.rstrip('\n') for line in open(positive_path)]
lines_negative = [line.rstrip('\n') for line in open(negative_path)]

# Data preparation

## Create DF

In [4]:
# Create dataFrame from positive tweets and give them value 1 as a sentiment
data_pos = pd.DataFrame({"tweets": lines_positive,
                      "sentiment":np.ones(len(lines_positive))
                      })

# Create dataFrame from negative tweets and give them value 0 as a sentiment
data_neg = pd.DataFrame({"tweets": lines_negative,
                      "sentiment":np.zeros(len(lines_negative))
                      })
# Concat both of them
data = pd.concat([data_pos,data_neg],axis=0).reset_index().drop(columns=['index'])

# Shuffle everything so that we don't have all the positives in one cluster and all the negatives in another
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,sentiment,tweets
0,1.0,i have 2 real bestfriends & 1 of them is my bl...
1,0.0,david's backgammon 6.1 - entertain yourself at...
2,1.0,wondering why i'm still awake ! :/ tired but n...
3,1.0,<user> be careful ! have a safe flight ! x
4,1.0,can't wait till that drake concert


## Clean up the text

In [5]:
# Remove <anything> from tweets.
data['tweets'].replace(regex=True,inplace=True,to_replace=r'<.*?>',value=r'')
data.head()

Unnamed: 0,sentiment,tweets
0,1.0,i have 2 real bestfriends & 1 of them is my bl...
1,0.0,david's backgammon 6.1 - entertain yourself at...
2,1.0,wondering why i'm still awake ! :/ tired but n...
3,1.0,be careful ! have a safe flight ! x
4,1.0,can't wait till that drake concert


In [0]:
X = data['tweets'].tolist()
y =  data['sentiment'].tolist()

## Use tokenizer
### From words to numbers

In [0]:
# map words to numbers
vocab_size = 100000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)

In [61]:
num_words = len(tokenizer.word_index)
num_words

103085

In [0]:
data['length'] = data['tweets'].str.split().apply(len)
max_tokens = data['length'].max()
max_tokens = int(max_tokens/2)

X = np.array(sequence.pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_tokens, padding='post'))

# Split train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### From numbers to words

In [0]:
def numbers_to_string(number_array, tokenizer):
    
    ''' 
    Transforms tokens to words
    :param number_array: The numbers array to transform
    :param tokenizer: The tokenizer used

    :return: the original text
    :rtype: String
    '''
        
    indices = tokenizer.word_index
    # Create a dict that mapes numbers to their respective words
    inverse_map = dict(zip(indices.values(), indices.keys()))
        
    # Maps the numbers back to words.
    words = []
    for number in number_array:
        if number != 0: # !=0 is to remove the padding
            words.append(inverse_map[number])
    
    # Concatenate all words.
    text = " ".join(words)

    return text

In [64]:
numbers_to_string(X_train[0],tokenizer)

"bored and can't sleep"

# Neural Network

In [66]:
model = Sequential()

model.add(Embedding(num_words, 32, input_length=max_tokens))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=6, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, batch_size=128, verbose=1, validation_split=0.2, epochs=3)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 31, 32)            3298720   
_________________________________________________________________
conv1d_37 (Conv1D)           (None, 31, 128)           20608     
_________________________________________________________________
max_pooling1d_37 (MaxPooling (None, 15, 128)           0         
_________________________________________________________________
dropout_37 (Dropout)         (None, 15, 128)           0         
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 15, 64)            49216     
_________________________________________________________________
max_pooling1d_38 (MaxPooling (None, 7, 64)             0         
_________________________________________________________________
dropout_38 (Dropout)         (None, 7, 64)             0         
__________

In [67]:
result = model.evaluate(X_test, y_test)



In [68]:
print("Accuracy: {0:.2%}".format(result[1]))

Accuracy: 81.75%
