In [1]:
import pandas as pd 
filename= "tweet.csv"
all_data = pd.read_csv(filename, encoding = "ISO-8859-1", header = None)
all_data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
import numpy as np
x = all_data.loc[:, [5]].values
y = all_data.loc[:, [0]].values

x = [str(data) for data in list(x)]
y = [int(data) for data in list(y)]

number_of_data_in_each_class = 15000

x_neg = x[:number_of_data_in_each_class]
y_neg = y[:number_of_data_in_each_class]
x_pos = x[850000:850000+number_of_data_in_each_class]
y_pos = y[850000:850000+number_of_data_in_each_class]
x = x_neg + x_pos
y = y_neg + y_pos


print("All Data X Shape: ",len(x))
print("All Data Y Shape: ",len(y))

All Data X Shape:  30000
All Data Y Shape:  30000


In [3]:
print("Text: ,",type(x),len(x))
print(type(x[0]))
print(type(y),len(y))
print(type(y[0]))

Text: , <class 'list'> 30000
<class 'str'>
<class 'list'> 30000
<class 'int'>


## Tokenization for Puncutuations with NLTK module

In the lexical analysis, tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. It helps to reveal semantic information of words. Also, it makes text cleaning easier.

In [4]:
from nltk.tokenize import WordPunctTokenizer

def listToString(s):  
    str1 = " " 
    return (str1.join(s)) 

print("Before tokenization: ",x[15])
x = [listToString(WordPunctTokenizer().tokenize(sentence)) for sentence in x]
print("After tokenization: ",x[15])

Before tokenization:  ['@iamjazzyfizzle I wish I got to watch it with you!! I miss you and @iamlilnicki  how was the premiere?!']
After tokenization:  ['@ iamjazzyfizzle I wish I got to watch it with you !! I miss you and @ iamlilnicki how was the premiere ?!']


## Tokenization for Words with NLTK module

In [5]:
import nltk
#nltk.download('punkt')

def listToString(s):  
    str1 = " " 
    return (str1.join(s)) 

print("Before tokenization: ",x[15])
x = [listToString(nltk.word_tokenize(sentence)) for sentence in x]
print("After tokenization: ",x[15])

Before tokenization:  ['@ iamjazzyfizzle I wish I got to watch it with you !! I miss you and @ iamlilnicki how was the premiere ?!']
After tokenization:  [ ' @ iamjazzyfizzle I wish I got to watch it with you ! ! I miss you and @ iamlilnicki how was the premiere ? ! ' ]


## Text Cleaning using regular expressions module

In [6]:
import re
from string import punctuation, digits

def preprocess_tweet(tweet):
    #Preprocess the text in a single tweet
    #arguments: tweet = a single tweet in form of string 
    #convert the tweet to lower case
    tweet = tweet.lower()
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #convert all @username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER', tweet)
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    #convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    converter = str.maketrans('', '', punctuation)
    tweet = tweet.translate(converter)
    
    converter = str.maketrans('', '', digits)
    tweet = tweet.translate(converter)
    
    return tweet

print("Before cleaning: \n",x[15])
x = [preprocess_tweet(sentence) for sentence in x]
print("After cleaning: \n",x[15])


Before cleaning: 
 [ ' @ iamjazzyfizzle I wish I got to watch it with you ! ! I miss you and @ iamlilnicki how was the premiere ? ! ' ]
After cleaning: 
    iamjazzyfizzle i wish i got to watch it with you   i miss you and  iamlilnicki how was the premiere    


In [7]:
print("Input: ,",type(x))
print(type(x[0]))
print("Output: ",type(y),len(y))
print(type(y[0]))

Input: , <class 'list'>
<class 'str'>
Output:  <class 'list'> 30000
<class 'int'>


### Create Multi-label Outputs

In [8]:
#the polarity of the tweet (0 = negative,  4 = positive)
num_neg = []
num_pos = []
for i in range(len(y)):
 
    if y[i]==0: #negative
        y[i] = [1,0]
        num_neg.append((i,"negative"))
    elif y[i]==4: #positive
        y[i] = [0,1]
        num_pos.append((i,"positive"))
    else:
        print("Hata")

x = [sentence.split() for sentence in x] # list in list structure because of Word2vec

In [9]:
print("Input: ,",type(x),len(x))
print(type(x[0]))
print("Output: ",type(y),len(y))
print(type(y[0]))


Input: , <class 'list'> 30000
<class 'list'>
Output:  <class 'list'> 30000
<class 'list'>


### Parameters

In [10]:
#Word2vec
min_count = 10
window = 5
output_size = 40
feature_epoch = 70
model_name = "1"

#padding max token_size
target_dimension = 33

#train-test split
split_ratio = 0.2



### Train Word2vec Model

In [11]:
from tqdm import tqdm

def Word2vecFeatureExtraction(input, y, min_count,window, output_size, epoch, model_name):
    from gensim.models import Word2Vec
    import multiprocessing
    from time import time  # To time our operations
    
    #create model
    model = Word2Vec(min_count = min_count,window=window,size=output_size,sample=6e-5,alpha=0.03,min_alpha=0.0007,negative=20,workers=4,compute_loss=True)
    t = time()
    model.build_vocab(input, progress_per=10000)
    vocab = list(model.wv.vocab)
    print("Vocabularies:\n**********************************************\n",vocab)
    print("Vocab. length: ",len(vocab))
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
    

    # remove out-of-vocabulary words
    t = time()
    all_data = []
    idx = 0
    for sentence in tqdm(input):
        new_sentence = []
        for word in sentence:
            if word in vocab:
                new_sentence.append(word)
        if len(new_sentence)==0:
            y.pop(idx)
            idx = idx + 1
        else:
            all_data.append(new_sentence)
            idx = idx + 1
            
    print("Removing is done. {} mins".format(round((time() - t) / 60, 2)))
    
    
    
    #train the Word2vec model
    t = time()
    model.train(all_data, total_examples = model.corpus_count, epochs = epoch, report_delay=1)
    model.save("epoch_" + str(epoch) + "_output_size_"+str(output_size)+"_"+ model_name + ".bin")
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    
    
    tokens_lengths = [len(model.wv[value]) for value in all_data]
    print("Maximum token length in each sample: ", max(tokens_lengths))
    
    features = [model.wv[value] for value in all_data]
    #model = Word2Vec.load(model_name)
    
    return features,y,max(tokens_lengths), model


x_features,y,max_tokens_lengths, model = Word2vecFeatureExtraction(x, y, min_count, window, output_size, feature_epoch, model_name)

Vocabularies:
**********************************************
 ['http', 'twitpic', 'com', 'awww', 'that', 's', 'a', 'bummer', 'you', 'got', 'david', 'of', 'third', 'day', 'to', 'do', 'it', 'd', 'is', 'upset', 'he', 'can', 't', 'update', 'his', 'facebook', 'by', 'and', 'might', 'cry', 'as', 'result', 'school', 'today', 'also', 'blah', 'i', 'many', 'times', 'for', 'the', 'ball', 'managed', 'save', 'rest', 'go', 'out', 'my', 'whole', 'body', 'feels', 'itchy', 'like', 'its', 'on', 'fire', 'no', 'not', 'at', 'all', 'm', 'mad', 'why', 'am', 'here', 'because', 'see', 'over', 'there', 'crew', 'need', 'hug', 'hey', 'long', 'time', 'yes', 'bit', 'only', 'lol', 'fine', 'thanks', 'how', 'nope', 'they', 'didn', 'have', 'me', 'spring', 'break', 'in', 'plain', 'city', 'snowing', 'just', 're', 'ears', 'couldn', 'bear', 'watch', 'thought', 'loss', 'was', 'idk', 'did', 'either', 'never', 'talk', 'anymore', 'would', 've', 'been', 'first', 'but', 'gun', 'really', 'though', 'wish', 'with', 'miss', 'premiere

100%|██████████████████████████████████████████████████████████████████████████| 30000/30000 [00:05<00:00, 5420.11it/s]


Removing is done. 0.09 mins
Time to train the model: 0.25 mins
Maximum token length in each sample:  50


In [12]:
print("Before Word2vec, length of sentence is: {} \n {} ".format(len(x[15]),x[15]))
print("After Word2vec, frekansı az olan kelimeler çıkarıldı:")
print("Word embeddings for first sentence:",np.array(x_features[15]).shape)

Before Word2vec, length of sentence is: 19 
 ['iamjazzyfizzle', 'i', 'wish', 'i', 'got', 'to', 'watch', 'it', 'with', 'you', 'i', 'miss', 'you', 'and', 'iamlilnicki', 'how', 'was', 'the', 'premiere'] 
After Word2vec, frekansı az olan kelimeler çıkarıldı:
Word embeddings for first sentence: (17, 40)


In [13]:
word1 = "car"
word2 = "home"

print(np.array(model.wv.most_similar(positive=word1)))
print("**********************************************")
print(np.array(model.wv.most_similar(positive=word2)))
print("**********************************************")
print("Similarity between {} and {} is : {}".format(word1,word2,model.wv.similarity(word1,word2)))


[['accident' '0.5716012120246887']
 ['machine' '0.5589753985404968']
 ['dropped' '0.5571492910385132']
 ['shops' '0.5541484355926514']
 ['van' '0.5377576351165771']
 ['into' '0.5316316485404968']
 ['bathroom' '0.5268046855926514']
 ['garage' '0.5234856605529785']
 ['ran' '0.5125377178192139']
 ['opened' '0.5107675790786743']]
**********************************************
[['back' '0.6442150473594666']
 ['lunch' '0.6284584403038025']
 ['work' '0.6224349737167358']
 ['leaving' '0.6159682273864746']
 ['tired' '0.6112692952156067']
 ['babysitting' '0.6034722924232483']
 ['today' '0.6004750728607178']
 ['sisters' '0.6003066301345825']
 ['house' '0.5973416566848755']
 ['breakfast' '0.5951350331306458']]
**********************************************
Similarity between car and home is : 0.36809277534484863


In [14]:
def paddedFeatures(features, target_dimension, position):
    import numpy as np
    padded_features = []
    
    if position == "right":
        for sample in features:
            zeros = np.zeros((target_dimension-sample.shape[0],sample.shape[1]))
            padded_sample = np.concatenate((sample,zeros),axis=0)
            padded_features.append(padded_sample)
             
    if position == "left":
        for sample in features:
            zeros = np.zeros((target_dimension-sample.shape[0],sample.shape[1]))
            padded_sample = np.concatenate((zeros,sample),axis=0)
            padded_features.append(padded_sample)
            
    print("Padding is done.")
            
    return padded_features

x_features = paddedFeatures(x_features, target_dimension= max_tokens_lengths, position = "right")

Padding is done.


In [15]:
print("Word embeddings for a sentence after zero-padding:",np.array(x_features[0]).shape)

Word embeddings for a sentence after zero-padding: (50, 40)


In [16]:
from sklearn.model_selection import train_test_split
#Data Split Train and Test
x_train, x_test, y_train, y_test = train_test_split(x_features, y, test_size= split_ratio, random_state=42)

In [17]:
import torch.nn as nn
import torch
x_train = torch.tensor(x_train).float()
y_train = torch.tensor(y_train).float()

In [18]:
print(x_train.size(),y_train.size())
print("(Number of sentence),(Number of tokens in each sample), (embedding size for each token)")

torch.Size([23875, 50, 40]) torch.Size([23875, 2])
(Number of sentence),(Number of tokens in each sample), (embedding size for each token)


In [19]:
import torch.nn as nn
from time import time

class LSTM(nn.Module):
    def __init__(self, input_size=40, hidden_layer_size=5, output_size=2):
        super().__init__()
        
        self.batch = 1
        self.num_layers = 1
        self.hidden_layer_size = 5

        self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first = True) #add dropout = 0.2

        self.linear = nn.Linear(hidden_layer_size, hidden_layer_size)
        
        self.linear2 = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(self.num_layers,self.batch,self.hidden_layer_size),
                            torch.zeros(self.num_layers,self.batch,self.hidden_layer_size))

    def forward(self, input_seq):
        
        #print("Input Shape: ", input_seq.shape)
    
        input_seq = input_seq.view(1,max_tokens_lengths,output_size)
        #print("Expanded LSTM Input Shape: ", input_seq.shape)
        
        lstm_out, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
        #print("LSTM output shape: ",lstm_out.shape)
        
        #print("Linear Layer Input Shape: ",lstm_out.view(-1,self.hidden_layer_size).shape)
        predictions = self.linear(lstm_out.view(-1,self.hidden_layer_size))
        
        predictions = self.linear2(predictions)
        
        #print("Predictions shape: ",predictions.shape)
        #print("Returned values for loss", predictions[-1].shape)
        y = torch.nn.functional.softmax(predictions[-1], dim=0)
        
        return y
    

model = LSTM()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
print(model)

epochs = 150


average_loss = []
t = time()
for i in range(epochs):
    print("Epoch:", i)
    loss_list = []
    
    for k in range(y_train.shape[0]):
        if k%1000==0:
            print("Trained sample: ",k)
        optimizer.zero_grad()
        
       
        model.hidden_cell = (torch.zeros(model.num_layers, model.batch, model.hidden_layer_size),
                        torch.zeros(model.num_layers, model.batch, model.hidden_layer_size))

        y_pred = model(x_train[k])
        
        
        
        
        single_loss = loss_function(y_pred, y_train[k])
        single_loss.backward()
        optimizer.step()
        loss_list.append(single_loss.item())

        
        """if k%100==0:
            print("Data: ",k)
            print("Toplam geçen süre:",round((time() - t) / 60, 2)," dakika")
            print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')
            print(y_pred)
            print(y_train[k])
            print("*************************************************")"""
        
    average_loss.append(sum(loss_list)/len(loss_list))
    checkpoint = {'model': LSTM(),'state_dict': model.state_dict(),'optimizer' : optimizer.state_dict()}
    print("Average loss: ",sum(loss_list)/y_train.shape[0])
    torch.save(checkpoint, 'Epoch {} Checkpoint.pth'.format(i))
    print("\nModel kaydedildi.\n")

LSTM(
  (lstm): LSTM(40, 5, batch_first=True)
  (linear): Linear(in_features=5, out_features=5, bias=True)
  (linear2): Linear(in_features=5, out_features=2, bias=True)
)
Epoch: 0
Trained sample:  0
Trained sample:  1000
Trained sample:  2000
Trained sample:  3000
Trained sample:  4000
Trained sample:  5000
Trained sample:  6000
Trained sample:  7000
Trained sample:  8000
Trained sample:  9000
Trained sample:  10000
Trained sample:  11000
Trained sample:  12000
Trained sample:  13000
Trained sample:  14000
Trained sample:  15000
Trained sample:  16000
Trained sample:  17000
Trained sample:  18000
Trained sample:  19000
Trained sample:  20000
Trained sample:  21000
Trained sample:  22000
Trained sample:  23000
Average loss:  0.25035995835471525


  "type " + obj.__name__ + ". It won't be checked "



Model kaydedildi.

Epoch: 1
Trained sample:  0
Trained sample:  1000
Trained sample:  2000
Trained sample:  3000
Trained sample:  4000
Trained sample:  5000
Trained sample:  6000
Trained sample:  7000
Trained sample:  8000
Trained sample:  9000
Trained sample:  10000
Trained sample:  11000
Trained sample:  12000
Trained sample:  13000
Trained sample:  14000
Trained sample:  15000
Trained sample:  16000
Trained sample:  17000
Trained sample:  18000
Trained sample:  19000
Trained sample:  20000
Trained sample:  21000
Trained sample:  22000
Trained sample:  23000
Average loss:  0.25024279656834625

Model kaydedildi.

Epoch: 2
Trained sample:  0
Trained sample:  1000
Trained sample:  2000
Trained sample:  3000
Trained sample:  4000
Trained sample:  5000
Trained sample:  6000
Trained sample:  7000
Trained sample:  8000
Trained sample:  9000
Trained sample:  10000
Trained sample:  11000
Trained sample:  12000
Trained sample:  13000
Trained sample:  14000
Trained sample:  15000
Trained sampl

KeyboardInterrupt: 