Import packages and functions

In [1]:
import torch 
import nltk 
from nltk.corpus import stopwords
from tweets_utils import tweet_to_tensor, get_vocabulary, process_tweet, split_tweets
from data_generators import data_generator,train_generator ,val_generator ,test_generator
from model import classifier
from train_eval import train_model, evaluate_model, inference
%load_ext autoreload
%autoreload 2


Download tweets and stopwords

In [2]:
# Directory that receives the NLTK corpora used by this notebook.
# NOTE(review): absolute, machine-specific path -- adjust it when running on another machine.
NLTK_DATA_DIR = '/home/faris.almalik/Desktop/NLPCourse'

# nltk.download(download_dir=...) only writes the files to disk; the corpus readers
# look them up through nltk.data.path, so the custom directory must be registered
# there or stopwords.words() can fail with a LookupError on a fresh environment.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

nltk.download('twitter_samples', download_dir=NLTK_DATA_DIR)
nltk.download('stopwords', download_dir=NLTK_DATA_DIR)

# English stopword list; print its size and the first ten entries as a sanity check.
stopwords_english = stopwords.words('english')
print(f'Stopwords length is: {len(stopwords_english)}   -->    Samples {stopwords_english[:10]}')


Stopwords length is: 179   -->    Samples ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/faris.almalik/Desktop/NLPCourse...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faris.almalik/Desktop/NLPCourse...
[nltk_data]   Package stopwords is already up-to-date!


Split tweets into training/validation 

In [3]:
# Split the tweets into training and validation subsets. split_tweets() returns ten
# values in a fixed order, so the order of the names below must not be changed.
(
    tweets_train,
    tweets_val,
    tweets_train_labels,
    tweets_val_labels,
    positive_train,
    negative_train,
    positive_val,
    negative_val,
    positive_tweets,
    negative_tweets,
) = split_tweets()

Number of positive tweets = 5000 	 Number of negative tweets = 5000
Training tweets: 4000 Positive tweets and 4000 Negative tweets
Validation tweets: 1000 Positive tweets and 1000 Negative tweets
Number of training tweets: 8000
Number of validation tweets: 2000


Preprocess the tweets and prepare them 

In [4]:
# Show one training tweet in its raw form and after process_tweet() has been applied.
sample_tweet = tweets_train[800]
print(f'tweet before precessing  -> \t {sample_tweet}')
print(f'tweet after precessing ->  \t {process_tweet(sample_tweet)}')

tweet before precessing  -> 	 @MsKristinKreuk Hugs ang Kisses from the philippines :)
tweet after precessing ->  	 ['hug', 'ang', 'kiss', 'philippin', ':)']


Create vocabulary from the training samples

In [5]:
# Build the vocabulary from the training tweets only, then report how many entries it holds.
vocab = get_vocabulary(train_tweets=tweets_train)
print(f'The size of the vocabulary is {len(vocab)}')

The size of the vocabulary is 9089


Now, each tweet should be represented by an array of numbers. To do so: 

In [6]:
# Walk one tweet through the pipeline with tweet_to_tensor:
# raw text -> processed tokens -> list of numbers looked up in the vocabulary.
example_tweet = tweets_train[1000]
print(f'tweet in strings -> \t {example_tweet}')
print(f'tweet in strings after preprocessing -> \t {process_tweet(example_tweet)}')
print(f'tweet in numbers -> \t {tweet_to_tensor(example_tweet, vocabulary=vocab)}')


tweet in strings -> 	 @ArianeBeeston Communal knowledge! :)
tweet in strings after preprocessing -> 	 ['commun', 'knowledg', ':)']
tweet in numbers -> 	 [7, 2100, 9]


To train the model, we need batches of tweets. The batches are obtained as follows:

In [7]:
# Pull a single batch from the generator to sanity-check the tensor dimensions.
batch_size = 64
shuffle = True
loop = True

batch_stream = data_generator(
    data_pos=positive_tweets,
    data_neg=negative_tweets,
    batch_size=batch_size,
    shuffle=shuffle,
    loop=loop,
    vocab_dict=vocab,
)
# Each item yielded by the generator unpacks into (inputs, targets, example_weights).
inputs, targets, example_weights = next(batch_stream)

print(f'Inputs shape: {inputs.shape}')
print(f'Targets shape: {targets.shape}')

Inputs shape: torch.Size([64, 27])
Targets shape: torch.Size([64])


Create the model, define loaders, and define optimizer/loss

In [8]:
# Seed torch's global RNG before the model is built so that weight initialisation
# is repeatable across "Restart Kernel -> Run All".
# NOTE(review): this seeds torch only; if the data generators shuffle with another
# RNG (e.g. `random` or numpy), that one must be seeded too -- confirm in data_generators.
SEED = 42
torch.manual_seed(SEED)

# Instantiate the classifier, sized to the vocabulary built from the training tweets.
model = classifier(vocab_size=len(vocab))

# Training hyper-parameters.
batch_size = 64
lr = 0.0001
epochs = 10

# Optimizer over all model parameters, and the classification loss.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

# Train / validation / test loaders.
# NOTE: test_loader is built from the same validation tweets as val_loader
# (positive_val / negative_val); only the `loop` flag differs, so the "test"
# metrics reported later are measured on the validation split.
train_loader = train_generator(
    batch_size=batch_size,
    train_pos=positive_train,
    train_neg=negative_train,
    vocab_dict=vocab,
    loop=True,
)
val_loader = val_generator(
    batch_size=batch_size,
    val_pos=positive_val,
    val_neg=negative_val,
    vocab_dict=vocab,
    loop=True,
)
test_loader = test_generator(
    batch_size=batch_size,
    val_pos=positive_val,
    val_neg=negative_val,
    vocab_dict=vocab,
    loop=False,
)


Training

In [9]:
# Fit the classifier on the training batches; the returned model is the one used
# for evaluation and inference in the following cells.
trained_model = train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    epochs=epochs,
    batch_size=batch_size,
)

epochs : 1
Steps  = 125 	 loss = 0.6442 	 Acc = 0.7790
epochs : 2
Steps  = 125 	 loss = 0.4772 	 Acc = 0.9380
epochs : 3
Steps  = 125 	 loss = 0.3080 	 Acc = 0.9566
epochs : 4
Steps  = 125 	 loss = 0.2025 	 Acc = 0.9674
epochs : 5
Steps  = 125 	 loss = 0.1436 	 Acc = 0.9764
epochs : 6
Steps  = 125 	 loss = 0.1082 	 Acc = 0.9831
epochs : 7
Steps  = 125 	 loss = 0.0849 	 Acc = 0.9871
epochs : 8
Steps  = 125 	 loss = 0.0691 	 Acc = 0.9885
epochs : 9
Steps  = 125 	 loss = 0.0580 	 Acc = 0.9892
epochs : 10
Steps  = 125 	 loss = 0.0499 	 Acc = 0.9900


Evaluate the trained model's performance on the test set (here, the held-out validation tweets that `test_loader` is built from)

In [10]:
# Score the trained model on the batches produced by test_loader, using the same loss as in training.
evaluate_model(
    model=trained_model,
    test_loader=test_loader,
    criterion=criterion,
)

loss = 0.0571 	 Acc = 0.9884


## Now, try it! Enter any tweet and the model will tell whether it's positive or negative

In [11]:
# Read a tweet typed by the user and let the trained model classify its sentiment.
# This cell blocks until the user presses enter.
my_tweeet = input('Write your tweet and press enter \n')
inference(
    tweet=my_tweeet,
    vocab=vocab,
    model=trained_model,
)

Negative Sentiment
