In [1]:
def find_sentiment(sentence, pos, neg):
    """
    This function returns the sentiment of a sentence.
    :param sentence: sentence, a string
    :param pos: set of positive words
    :param neg: set of negative words
    :return: returns positive, negative or neutral sentiment
    """

    # Split sentence by all whitespaces
    sentence = sentence.split()

    # Convert the list of words into a set
    sentence = set(sentence)

    # Count the number of common words with the positive set
    num_common_pos = len(sentence.intersection(pos))

    # Count the number of common words with the negative set
    num_common_neg = len(sentence.intersection(neg))

    # Determine sentiment by comparing counts and return the result
    if num_common_pos > num_common_neg:
        return "positive"
    if num_common_pos < num_common_neg:
        return "negative"
    return "neutral"


In [2]:
from nltk.tokenize import word_tokenize
sentence = "hi, how are you?"
sentence.split()
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# create a corpus of sentences
corpus = [
 "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]
# initialize CountVectorizer
ctv = CountVectorizer()
# fit the vectorizer on corpus
ctv.fit(corpus)
corpus_transformed = ctv.transform(corpus)

In [4]:
print(corpus_transformed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [5]:
print(ctv.vocabulary_)

{'hello': 9, 'how': 11, 'are': 2, 'you': 22, 'im': 13, 'getting': 8, 'bored': 4, 'at': 3, 'home': 10, 'and': 1, 'what': 19, 'do': 7, 'think': 17, 'did': 6, 'know': 14, 'about': 0, 'counts': 5, 'let': 15, 'see': 16, 'if': 12, 'this': 18, 'works': 20, 'yes': 21}


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
# create a corpus of sentences
corpus = [
 "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]
# initialize CountVectorizer with word_tokenize from nltk
# as the tokenizer
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on corpus
ctv.fit(corpus)
corpus_transformed = ctv.transform(corpus)
print(ctv.vocabulary_)

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


In [7]:
# Import necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model, metrics, model_selection
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    # Read the training data
    df = pd.read_csv("IMDB Dataset.csv")
    
    # Map 'positive' to 1 and 'negative' to 0
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    
    # Create a new column called 'kfold' and fill it with -1
    df["kfold"] = -1
    
    # Randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    
    # Fetch labels
    y = df.sentiment.values
    
    # Initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # Fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f
    
    # Iterate over the folds created
    for fold_ in range(5):
        # Temporary dataframes for train and test
        train_df = df[df.kfold != fold_].reset_index(drop=True)
        test_df = df[df.kfold == fold_].reset_index(drop=True)
        
        # Initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
        count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
        
        # Fit count_vec on training data reviews
        count_vec.fit(train_df.review)
        
        # Transform training and validation data reviews
        xtrain = count_vec.transform(train_df.review)
        xtest = count_vec.transform(test_df.review)
        
        # Initialize logistic regression model
        model = linear_model.LogisticRegression()
        
        # Fit the model on training data reviews and sentiment
        model.fit(xtrain, train_df.sentiment)
        
        # Make predictions on test data
        preds = model.predict(xtest)
        
        # Calculate accuracy
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)
        
        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold: 0
Accuracy = 0.8939



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold: 1
Accuracy = 0.8906



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold: 2
Accuracy = 0.895



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold: 3
Accuracy = 0.8914

Fold: 4
Accuracy = 0.8948



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# import what we need
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import naive_bayes
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
if __name__ == "__main__":
    # Read the training data
    df = pd.read_csv("IMDB Dataset.csv")
    
    # Map 'positive' to 1 and 'negative' to 0
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    
    # Create a new column called 'kfold' and fill it with -1
    df["kfold"] = -1
    
    # Randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    
    # Fetch labels
    y = df.sentiment.values
    
    # Initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # Fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f
    
    # Iterate over the folds created
    for fold_ in range(5):
        # Temporary dataframes for train and test
        train_df = df[df.kfold != fold_].reset_index(drop=True)
        test_df = df[df.kfold == fold_].reset_index(drop=True)
        
        # Initialize CountVectorizer with NLTK's word_tokenize function as tokenizer
        count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
        
        # Fit count_vec on training data reviews
        count_vec.fit(train_df.review)
        
        # Transform training and validation data reviews
        xtrain = count_vec.transform(train_df.review)
        xtest = count_vec.transform(test_df.review)
        
        # initialize naive bayes model
        model = naive_bayes.MultinomialNB()
        
        # Fit the model on training data reviews and sentiment
        model.fit(xtrain, train_df.sentiment)
        
        # Make predictions on test data
        preds = model.predict(xtest)
        
        # Calculate accuracy
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)
        
        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")

Fold: 0
Accuracy = 0.8431

Fold: 1
Accuracy = 0.8494

Fold: 2
Accuracy = 0.8413

Fold: 3
Accuracy = 0.8321

Fold: 4
Accuracy = 0.8532



In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
# create a corpus of sentences
corpus = [
    "hello, how are you?",
 "im getting bored at home. And you? What do you think?",
 "did you know about counts",
 "let's see if this works!",
 "YES!!!!"
]
# initialize TfidfVectorizer with word_tokenize from nltk
# as the tokenizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# fit the vectorizer on corpus
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)
print(corpus_transformed)

  (0, 27)	0.2965698850220162
  (0, 16)	0.4428321995085722
  (0, 14)	0.4428321995085722
  (0, 7)	0.4428321995085722
  (0, 4)	0.35727423026525224
  (0, 2)	0.4428321995085722
  (1, 27)	0.35299699146792735
  (1, 24)	0.2635440111190765
  (1, 22)	0.2635440111190765
  (1, 18)	0.2635440111190765
  (1, 15)	0.2635440111190765
  (1, 13)	0.2635440111190765
  (1, 12)	0.2635440111190765
  (1, 9)	0.2635440111190765
  (1, 8)	0.2635440111190765
  (1, 6)	0.2635440111190765
  (1, 4)	0.42525129752567803
  (1, 3)	0.2635440111190765
  (2, 27)	0.31752680284846835
  (2, 19)	0.4741246485558491
  (2, 11)	0.4741246485558491
  (2, 10)	0.4741246485558491
  (2, 5)	0.4741246485558491
  (3, 25)	0.38775666010579296
  (3, 23)	0.38775666010579296
  (3, 21)	0.38775666010579296
  (3, 20)	0.38775666010579296
  (3, 17)	0.38775666010579296
  (3, 1)	0.38775666010579296
  (3, 0)	0.3128396318588854
  (4, 26)	0.2959842226518677
  (4, 0)	0.9551928286692534


In [10]:
# Import what we need
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model, metrics, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

if __name__ == "__main__":
    # Read the training data
    df = pd.read_csv("IMDB Dataset.csv")
    
    # Map 'positive' to 1 and 'negative' to 0
    df.sentiment = df.sentiment.apply(lambda x: 1 if x == "positive" else 0)
    
    # Create a new column called 'kfold' and fill it with -1
    df["kfold"] = -1
    
    # Randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    
    # Fetch labels
    y = df.sentiment.values
    
    # Initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)
    
    # Fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f
    
    # Iterate over the folds created
    for fold_ in range(5):
        # Temporary dataframes for train and test
        train_df = df[df.kfold != fold_].reset_index(drop=True)
        test_df = df[df.kfold == fold_].reset_index(drop=True)
        
        # Initialize TfidfVectorizer with NLTK's word_tokenize function as tokenizer
        tfidf_vec = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
        
        # Fit tfidf_vec on training data reviews
        tfidf_vec.fit(train_df.review)
        
        # Transform training and validation data reviews
        xtrain = tfidf_vec.transform(train_df.review)
        xtest = tfidf_vec.transform(test_df.review)
        
        # Initialize logistic regression model
        model = linear_model.LogisticRegression()
        
        # Fit the model on training data reviews and sentiment
        model.fit(xtrain, train_df.sentiment)
        
        # Make predictions on test data
        preds = model.predict(xtest)
        
        # Calculate accuracy
        accuracy = metrics.accuracy_score(test_df.sentiment, preds)
        
        print(f"Fold: {fold_}")
        print(f"Accuracy = {accuracy}")
        print("")


Fold: 0
Accuracy = 0.8987

Fold: 1
Accuracy = 0.8973

Fold: 2
Accuracy = 0.8925

Fold: 3
Accuracy = 0.8963

Fold: 4
Accuracy = 0.899



In [11]:
from nltk import ngrams
from nltk.tokenize import word_tokenize
# let's see 3 grams
N = 3
# input sentence
sentence = "hi, how are you?"
# tokenized sentence
tokenized_sentence = word_tokenize(sentence)
# generate n_grams
n_grams = list(ngrams(tokenized_sentence, N))
print(n_grams)

[('hi', ',', 'how'), (',', 'how', 'are'), ('how', 'are', 'you'), ('are', 'you', '?')]


In [12]:
tfidf_vec = TfidfVectorizer(
 tokenizer=word_tokenize,
 token_pattern=None,
 ngram_range=(1, 3)
 )

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
# initialize lemmatizer
lemmatizer = WordNetLemmatizer()
# initialize stemmer
stemmer = SnowballStemmer("english")
words = ["fishing", "fishes", "fished"]
for word in words:
 print(f"word={word}")
 print(f"stemmed_word={stemmer.stem(word)}")
 print(f"lemma={lemmatizer.lemmatize(word)}")
 print("")

word=fishing
stemmed_word=fish
lemma=fishing

word=fishes
stemmed_word=fish
lemma=fish

word=fished
stemmed_word=fish
lemma=fished



In [14]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Willi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a corpus of sentences
# Reading only 10k samples from training data for this example
corpus = pd.read_csv("IMDB Dataset.csv", nrows=10000)
corpus = corpus.review.values

# Initialize TfidfVectorizer with word_tokenize from nltk as the tokenizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

# Fit the vectorizer on the corpus
tfv.fit(corpus)

# Transform the corpus using tfidf
corpus_transformed = tfv.transform(corpus)

# Initialize SVD with 10 components
svd = decomposition.TruncatedSVD(n_components=10)

# Fit SVD
corpus_svd = svd.fit(corpus_transformed)

# Choose the first sample and create a dictionary
# of feature names and their scores from SVD
# You can change the sample_index variable to
# get dictionary for any other sample
sample_index = 0
feature_scores = dict(
    zip(
        tfv.get_feature_names_out(),
        corpus_svd.components_[sample_index]
    )
)

# Once we have the dictionary, we can now
# sort it in decreasing order and get the
# top N topics
N = 5
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])


['the', ',', '.', 'a', 'and']


In [16]:
N = 5
for sample_index in range(5):
    feature_scores = dict(
        zip(
            tfv.get_feature_names_out(),
            corpus_svd.components_[sample_index]
        )
    )
    print(
        sorted(
            feature_scores,
            key=feature_scores.get,
            reverse=True
        )[:N]
    )


['the', ',', '.', 'a', 'and']
['br', '<', '>', '/', '-']
['i', 'movie', '!', 'it', 'was']
[',', '!', "''", '``', 'you']
['!', 'the', "''", '``', '...']


In [17]:
import re
import string
def clean_text(s):
 """
 This function cleans the text a bit
 :param s: string
  :return: cleaned string
 """
 # split by all whitespaces
 s = s.split()

 # join tokens by single space
 # why we do this?
 # this will remove all kinds of weird space
 # "hi. how are you" becomes
 # "hi. how are you"
 s = " ".join(s)

 # remove all punctuations using regex and string module
 s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)

 # you can add more cleaning here if you want
 # and then return the cleaned string
 return s

In [18]:
import numpy as np

def sentence_to_vec(s, embedding_dict, stop_words, tokenizer):
    """
    Given a sentence and other information,
    this function returns embedding for the whole sentence
    :param s: sentence, string
    :param embedding_dict: dictionary word:vector
    :param stop_words: list of stop words, if any
    :param tokenizer: a tokenization function
    """
    # convert sentence to string and lowercase it
    words = str(s).lower()

    # tokenize the sentence
    words = tokenizer(words)

    # remove stop word tokens
    words = [w for w in words if not w in stop_words]

    # keep only alpha-numeric tokens
    words = [w for w in words if w.isalpha()]

    # initialize empty list to store embeddings
    M = []
    for w in words:
        # for every word, fetch the embedding from
        # the dictionary and append to list of
        # embeddings
        if w in embedding_dict:
            M.append(embedding_dict[w])

    # if we don't have any vectors, return zeros
    if len(M) == 0:
        return np.zeros(300)

    # convert list of embeddings to array
    M = np.array(M)

    # calculate sum over axis=0
    v = M.sum(axis=0)

    # return normalized vector
    return v / np.sqrt((v ** 2).sum())


In [3]:
# fasttext.py
import io
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

def load_vectors(fname):
    # taken from: https://fasttext.cc/docs/en/english-vectors.html
    fin = io.open(
        fname,
        'r',
        encoding='utf-8',
        newline='\n',
        errors='ignore'
    )
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data

def sentence_to_vec(s, embedding_dict, stop_words, tokenizer):
    """
    Given a sentence and other information,
    this function returns embedding for the whole sentence
    :param s: sentence, string
    :param embedding_dict: dictionary word:vector
    :param stop_words: list of stop words, if any
    :param tokenizer: a tokenization function
    """
    # convert sentence to string and lowercase it
    words = str(s).lower()

    # tokenize the sentence
    words = tokenizer(words)

    # remove stop word tokens
    words = [w for w in words if not w in stop_words]

    # keep only alpha-numeric tokens
    words = [w for w in words if w.isalpha()]

    # initialize empty list to store embeddings
    M = []
    for w in words:
        # for every word, fetch the embedding from
        # the dictionary and append to list of
        # embeddings
        if w in embedding_dict:
            M.append(embedding_dict[w])

    # if we don't have any vectors, return zeros
    if len(M) == 0:
        return np.zeros(300)

    # convert list of embeddings to array
    M = np.array(M)

    # calculate sum over axis=0
    v = M.sum(axis=0)

    # return normalized vector
    return v / np.sqrt((v ** 2).sum())

if __name__ == "__main__":
    # read the training data
    df = pd.read_csv("IMDB Dataset.csv")
    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    # load embeddings into memory
    print("Loading embeddings")
    embeddings = load_vectors("crawl-300d-2M.vec/crawl-300d-2M.vec")
    # create sentence embeddings
    print("Creating sentence vectors")
    vectors = []
    for review in df.review.values:
        vectors.append(
            sentence_to_vec(
                s=review,
                embedding_dict=embeddings,
                stop_words=[],
                tokenizer=word_tokenize
            )
        )

    vectors = np.array(vectors)
    # fetch labels
    y = df.sentiment.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    for fold_, (t_, v_) in enumerate(kf.split(X=vectors, y=y)):
        print(f"Training fold: {fold_}")
        # temporary dataframes for train and test
        xtrain = vectors[t_, :]
        ytrain = y[t_]
        xtest = vectors[v_, :]
        ytest = y[v_]
        # initialize logistic regression model
        model = linear_model.LogisticRegression()
        # fit the model on training data reviews and sentiment
        model.fit(xtrain, ytrain)
        # make predictions on test data
        # threshold for predictions is 0.5
        preds = model.predict(xtest)
        # calculate accuracy
        accuracy = metrics.accuracy_score(ytest, preds)
        print(f"Accuracy = {accuracy}")
        print("")


Loading embeddings
Creating sentence vectors
Training fold: 0
Accuracy = 0.8608

Training fold: 1
Accuracy = 0.8581

Training fold: 2
Accuracy = 0.8582

Training fold: 3
Accuracy = 0.8558

Training fold: 4
Accuracy = 0.8624



In [4]:
# create_folds.py
# import pandas and model_selection module of scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Read training data
    df = pd.read_csv("IMDB Dataset.csv")
    # map positive to 1 and negative to 0
    df.sentiment = df.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    # the next step is to randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)

    # fetch labels
    y = df.sentiment.values

    # initiate the kfold class from model_selection module
    kf = model_selection.StratifiedKFold(n_splits=5)

    # fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
        df.loc[v_, 'kfold'] = f

    # save the new csv with kfold column
    df.to_csv("imdb_folds.csv", index=False)


In [5]:
# dataset.py
import torch

class IMDBDataset:
    def __init__(self, reviews, targets):
        """
        :param reviews: this is a numpy array
        :param targets: a vector, numpy array
        """
        self.reviews = reviews
        self.target = targets

    def __len__(self):
        # returns length of the dataset
        return len(self.reviews)

    def __getitem__(self, item):
        # for any given item, which is an int,
        # return review and targets as torch tensor
        # item is the index of the item in concern
        review = self.reviews[item, :]
        target = self.target[item]
        return {
            "review": torch.tensor(review, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float)
        }


In [6]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        """
        :param embedding_matrix: numpy array with vectors for all words
        """
        super(LSTM, self).__init__()
        # number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]
        # dimension of embedding is num of columns in the matrix
        embed_dim = embedding_matrix.shape[1]
        # we define an input embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=num_words,
            embedding_dim=embed_dim
        )
        # embedding matrix is used as weights of
        # the embedding layer
        self.embedding.weight = nn.Parameter(
            torch.tensor(
                embedding_matrix,
                dtype=torch.float32
            )
        )
        # we don't want to train the pretrained embeddings
        self.embedding.weight.requires_grad = False
        # a simple bidirectional LSTM with
        # hidden size of 128
        self.lstm = nn.LSTM(
            embed_dim,
            128,
            bidirectional=True,
            batch_first=True,
        )
        # output layer which is a linear layer
        # we have only one output
        # input (512) = 128 + 128 for mean and same for max pooling
        self.out = nn.Linear(512, 1)

    def forward(self, x):
        # pass data through embedding layer
        # the input is just the tokens
        x = self.embedding(x)
        # move embedding output to lstm
        x, _ = self.lstm(x)
        # apply mean and max pooling on lstm output
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)

        # concatenate mean and max pooling
        # this is why size is 512
        # 128 for each direction = 256
        # avg_pool = 256 and max_pool = 256
        out = torch.cat((avg_pool, max_pool), 1)
        # pass through the output layer and return the output
        out = self.out(out)
        # return linear output
        return out


In [7]:
import torch
import torch.nn as nn

def train(data_loader, model, optimizer, device):
    """
    This is the main training function that trains model
    for one epoch
    :param data_loader: this is the torch dataloader
    :param model: model (lstm model)
    :param optimizer: torch optimizer, e.g. adam, sgd, etc.
    :param device: this can be "cuda" or "cpu"
    """
    # set model to training mode
    model.train()
    # go through batches of data in data loader
    for data in data_loader:
        # fetch review and target from the dict
        reviews = data["review"]
        targets = data["target"]
        # move the data to device that we want to use
        reviews = reviews.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        # clear the gradients
        optimizer.zero_grad()
        # make predictions from the model
        predictions = model(reviews)
        # calculate the loss
        loss = nn.BCEWithLogitsLoss()(
            predictions,
            targets.view(-1, 1)
        )
        # compute gradient of loss w.r.t.
        # all parameters of the model that are trainable
        loss.backward()
        # single optimization step
        optimizer.step()

def evaluate(data_loader, model, device):
    # initialize empty lists to store predictions
    # and targets
    final_predictions = []
    final_targets = []
    # put the model in eval mode
    model.eval()
    # disable gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"]
            targets = data["target"]
            reviews = reviews.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
            # make predictions
            predictions = model(reviews)
            # move predictions and targets to list
            # we need to move predictions and targets to cpu too
            predictions = predictions.cpu().numpy().tolist()
            targets = data["target"].cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)
    # return final predictions and targets
    return final_predictions, final_targets


In [None]:
import sys
sys.path.append('/path/to/directory/containing/engine/')
import engine


In [14]:
pip install lstm

Collecting lstm
  Downloading lstm-0.1.0-py3-none-any.whl.metadata (5.0 kB)
Downloading lstm-0.1.0-py3-none-any.whl (3.6 kB)
Installing collected packages: lstm
Successfully installed lstm-0.1.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
import numpy as np

def load_embeddings(word_index, embedding_file, vector_length=300):
    """
    Creates an embedding matrix from a pre-trained embeddings file.
    :param word_index: Dictionary mapping words to their index in the model.
    :param embedding_file: Path to the file containing pre-trained word vectors.
    :param vector_length: Length of each word vector.
    :return: A matrix of shape (number of words + 1, vector_length) with embedding vectors.
    """
    max_features = len(word_index) + 1  # +1 for the zero padding
    embeddings_index = {}

    # Load embeddings from file
    with open(embedding_file, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.strip().split()
            word = values[0]
            if word in word_index or word.capitalize() in word_index or word.upper() in word_index:
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

    # Create an embedding matrix
    embedding_matrix = np.zeros((max_features, vector_length))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        # Check various forms of the word
        embedding_vector = embeddings_index.get(word) \
                        or embeddings_index.get(word.capitalize()) \
                        or embeddings_index.get(word.upper())
        
        if embedding_vector is not None and len(embedding_vector) == vector_length:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
