In [1]:
import pandas as pd 
filename= "tweet.csv"
all_data = pd.read_csv(filename, encoding = "ISO-8859-1", header = None)
all_data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
import numpy as np
x = all_data.loc[:, [5]].values
y = all_data.loc[:, [0]].values

x = [str(data) for data in list(x)]
y = [int(data) for data in list(y)]

number_of_data_in_each_class = 70000

x_neg = x[:number_of_data_in_each_class]
y_neg = y[:number_of_data_in_each_class]
x_pos = x[850000:850000+number_of_data_in_each_class]
y_pos = y[850000:850000+number_of_data_in_each_class]
x = x_neg + x_pos
y = y_neg + y_pos


print("All Data X Shape: ",len(x))
print("All Data Y Shape: ",len(y))

All Data X Shape:  140000
All Data Y Shape:  140000


In [3]:
print("Text: ,",type(x),len(x))
print(type(x[0]))
print(type(y),len(y))
print(type(y[0]))

Text: , <class 'list'> 140000
<class 'str'>
<class 'list'> 140000
<class 'int'>


## Tokenization for Puncutuations with NLTK module

In the lexical analysis, tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. It helps to reveal semantic information of words. Also, it makes text cleaning easier.

In [4]:
from nltk.tokenize import WordPunctTokenizer

def listToString(s):  
    str1 = " " 
    return (str1.join(s)) 

print("Before tokenization: ",x[15])
x = [listToString(WordPunctTokenizer().tokenize(sentence)) for sentence in x]
print("After tokenization: ",x[15])

Before tokenization:  ['@iamjazzyfizzle I wish I got to watch it with you!! I miss you and @iamlilnicki  how was the premiere?!']
After tokenization:  ['@ iamjazzyfizzle I wish I got to watch it with you !! I miss you and @ iamlilnicki how was the premiere ?!']


## Tokenization for Words with NLTK module

In [5]:
"""import nltk
#nltk.download('punkt')

def listToString(s):  
    str1 = " " 
    return (str1.join(s)) 

print("Before tokenization: ",x[15])
x = [listToString(nltk.word_tokenize(sentence)) for sentence in x]
print("After tokenization: ",x[15])"""

'import nltk\n#nltk.download(\'punkt\')\n\ndef listToString(s):  \n    str1 = " " \n    return (str1.join(s)) \n\nprint("Before tokenization: ",x[15])\nx = [listToString(nltk.word_tokenize(sentence)) for sentence in x]\nprint("After tokenization: ",x[15])'

## Text Cleaning using regular expressions module

In [6]:
import re
from string import punctuation, digits

def preprocess_tweet(tweet):
    #Preprocess the text in a single tweet
    #arguments: tweet = a single tweet in form of string 
    #convert the tweet to lower case
    tweet = tweet.lower()
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #convert all @username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER', tweet)
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    #convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    converter = str.maketrans('', '', punctuation)
    tweet = tweet.translate(converter)
    
    converter = str.maketrans('', '', digits)
    tweet = tweet.translate(converter)
    
    return tweet

print("Before cleaning: \n",x[15])
x = [preprocess_tweet(sentence) for sentence in x]
print("After cleaning: \n",x[15])


Before cleaning: 
 ['@ iamjazzyfizzle I wish I got to watch it with you !! I miss you and @ iamlilnicki how was the premiere ?!']
After cleaning: 
  iamjazzyfizzle i wish i got to watch it with you  i miss you and  iamlilnicki how was the premiere 


In [7]:
print("Input: ,",type(x))
print(type(x[0]))
print("Output: ",type(y),len(y))
print(type(y[0]))

Input: , <class 'list'>
<class 'str'>
Output:  <class 'list'> 140000
<class 'int'>


### Create Multi-label Outputs

In [8]:
#the polarity of the tweet (0 = negative,  4 = positive)
num_neg = []
num_pos = []
for i in range(len(y)):
 
    if y[i]==0: #negative
        y[i] = [1,0]
        num_neg.append((i,"negative"))
    elif y[i]==4: #positive
        y[i] = [0,1]
        num_pos.append((i,"positive"))
    else:
        print("Hata")

x = [sentence.split() for sentence in x] # list in list structure because of Word2vec

In [9]:
print("Input: ,",type(x),len(x))
print(type(x[0]))
print("Output: ",type(y),len(y))
print(type(y[0]))


Input: , <class 'list'> 140000
<class 'list'>
Output:  <class 'list'> 140000
<class 'list'>


### Parameters

In [10]:
#Word2vec
min_count = 5
window = 2
output_size = 120
feature_epoch = 70
model_name = "1"

#train-test split
split_ratio = 0.2



### Train Word2vec Model

In [11]:
from tqdm import tqdm

def Word2vecFeatureExtraction(input, y, min_count,window, output_size, epoch, model_name):
    from gensim.models import Word2Vec
    import multiprocessing
    from time import time  # To time our operations
    
    #create model
    model = Word2Vec(min_count = min_count,window=window,size=output_size,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=20,workers=4,compute_loss=True)
    t = time()
    model.build_vocab(input, progress_per=10000)
    vocab = list(model.wv.vocab)
    #print("Vocabularies:\n**********************************************\n",vocab)
    print("Vocab. length: ",len(vocab))
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
    

    # remove out-of-vocabulary words
    t = time()
    all_data = []
    y_last = []
    
    
    for i, sentence in enumerate(input):
        new_sentence = []
        for word in sentence:
            if word in vocab:
                new_sentence.append(word)
        if len(new_sentence)!=0:
            all_data.append(new_sentence)
            y_last.append(y[i])
            
            
            
            
    print("Removing is done. {} mins".format(round((time() - t) / 60, 2)))
    
    
    
    #train the Word2vec model
    t = time()
    model.train(all_data, total_examples = model.corpus_count, epochs = epoch, report_delay=1)
    model.save("epoch_" + str(epoch) + "_output_size_"+str(output_size)+"_"+ model_name + ".bin")
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    
    
    tokens_lengths = [len(model.wv[value]) for value in all_data]
    print("Maximum token length in each sample: ", max(tokens_lengths))
    
    features = [model.wv[value] for value in all_data]
    #model = Word2Vec.load(model_name)
    
    return features,y_last,max(tokens_lengths), model


x_features,y,max_tokens_lengths, model = Word2vecFeatureExtraction(x, y, min_count, window, output_size, feature_epoch, model_name)

Vocab. length:  14017
Time to build vocab: 0.01 mins
Removing is done. 0.99 mins
Time to train the model: 1.34 mins
Maximum token length in each sample:  60


In [12]:
print(len(x_features), len(y))

139569 139569


In [13]:
print("Before Word2vec, length of sentence is: {} \n {} ".format(len(x[15]),x[15]))
print("After Word2vec, frekansı az olan kelimeler çıkarıldı:")
print("Word embeddings for first sentence:",np.array(x_features[15]).shape)

Before Word2vec, length of sentence is: 19 
 ['iamjazzyfizzle', 'i', 'wish', 'i', 'got', 'to', 'watch', 'it', 'with', 'you', 'i', 'miss', 'you', 'and', 'iamlilnicki', 'how', 'was', 'the', 'premiere'] 
After Word2vec, frekansı az olan kelimeler çıkarıldı:
Word embeddings for first sentence: (17, 120)


In [14]:
word1 = "car"
word2 = "home"

print(np.array(model.wv.most_similar(positive=word1)))
print("**********************************************")
print(np.array(model.wv.most_similar(positive=word2)))
print("**********************************************")
print("Similarity between {} and {} is : {}".format(word1,word2,model.wv.similarity(word1,word2)))


[['motorcycle' '0.4432039260864258']
 ['bike' '0.43241509795188904']
 ['phone' '0.4299317002296448']
 ['room' '0.4166398048400879']
 ['convertible' '0.41147440671920776']
 ['house' '0.40856924653053284']
 ['apt' '0.407231867313385']
 ['truck' '0.40702593326568604']
 ['driveway' '0.40367114543914795']
 ['roof' '0.3988969326019287']]
**********************************************
[['back' '0.728092610836029']
 ['bed' '0.6311469674110413']
 ['work' '0.5888208150863647']
 ['today' '0.5094830989837646']
 ['school' '0.4937937259674072']
 ['now' '0.48729169368743896']
 ['at' '0.4853334426879883']
 ['finally' '0.4852288067340851']
 ['off' '0.48489850759506226']
 ['tomorrow' '0.4825339615345001']]
**********************************************
Similarity between car and home is : 0.27421873807907104


In [15]:
def paddedFeatures(features, target_dimension, position):
    import numpy as np
    padded_features = []
    
    if position == "right":
        for sample in features:
            zeros = np.zeros((target_dimension-sample.shape[0],sample.shape[1]))
            padded_sample = np.concatenate((sample,zeros),axis=0)
            padded_features.append(padded_sample)
             
    if position == "left":
        for sample in features:
            zeros = np.zeros((target_dimension-sample.shape[0],sample.shape[1]))
            padded_sample = np.concatenate((zeros,sample),axis=0)
            padded_features.append(padded_sample)
            
    print("Padding is done.")
            
    return padded_features

x_features = paddedFeatures(x_features, target_dimension= max_tokens_lengths, position = "right")

Padding is done.


In [16]:
print("Word embeddings for a sentence after zero-padding:",np.array(x_features[0]).shape)

Word embeddings for a sentence after zero-padding: (60, 120)


In [17]:
from sklearn.model_selection import train_test_split
#Data Split Train and Test
x_train, x_test, y_train, y_test = train_test_split(x_features, y, test_size= split_ratio, random_state=42)

In [18]:
import numpy as np
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

In [19]:
print(x_train.shape,y_train.shape)
print("(Number of sentence),(Number of tokens in each sample), (embedding size for each token)")

(111655, 60, 120) (111655, 2)
(Number of sentence),(Number of tokens in each sample), (embedding size for each token)


In [20]:
from keras.models import Sequential
from keras.utils.np_utils import to_categorical
import re

model = Sequential()
model.add(LSTM(units = 100, dropout=0.2, recurrent_dropout=0.2,return_sequences=False))
#model.add(LSTM(units = 100, dropout=0.2, recurrent_dropout=0.2,return_sequences=False))
model.add(Dense(100, activation='relu'))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])


Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


In [21]:
batch_size = 32
model.fit(x_train, y_train, epochs = 70, batch_size=batch_size, verbose = 2)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/70
 - 178s - loss: 0.6720 - acc: 0.5622
Epoch 2/70
 - 165s - loss: 0.6428 - acc: 0.6416
Epoch 3/70
 - 169s - loss: 0.6641 - acc: 0.5720
Epoch 4/70
 - 167s - loss: 0.6609 - acc: 0.5831
Epoch 5/70
 - 167s - loss: 0.6681 - acc: 0.5654
Epoch 6/70
 - 168s - loss: 0.6596 - acc: 0.5903
Epoch 7/70
 - 173s - loss: 0.6622 - acc: 0.5902
Epoch 8/70
 - 174s - loss: 0.6590 - acc: 0.6064
Epoch 9/70
 - 170s - loss: 0.6594 - acc: 0.6008
Epoch 10/70
 - 169s - loss: 0.6634 - acc: 0.5955
Epoch 11/70
 - 173s - loss: 0.6510 - acc: 0.6246
Epoch 12/70
 - 171s - loss: 0.5354 - acc: 0.7355
Epoch 13/70
 - 166s - loss: 0.4837 - acc: 0.7678
Epoch 14/70
 - 171s - loss: 0.4621 - acc: 0.7797
Epoch 15/70
 - 166s - loss: 0.4457 - acc: 0.7900
Epoch 16/70
 - 162s - loss: 0.4345 - acc: 0.7962
Epoch 17/70
 - 163s - loss: 0.4258 - acc: 0.8008
Epoch 18/70
 

<keras.callbacks.History at 0x1bd6bbf6708>

In [22]:
score,acc = model.evaluate(x_test, y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.43
acc: 0.81


'Epoch 1/50\n - 54s - loss: 0.6461 - acc: 0.6331\nEpoch 2/50\n - 53s - loss: 0.6677 - acc: 0.6044\nEpoch 3/50\n - 56s - loss: 0.6652 - acc: 0.5912\nEpoch 4/50\n - 58s - loss: 0.6648 - acc: 0.5905\nEpoch 5/50\n - 58s - loss: 0.6646 - acc: 0.5882\nEpoch 6/50'