In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fasttext-wikinews/wiki-news-300d-1M.vec
/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [35]:
import gc
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [8]:
# Input paths, relative to the notebook's working directory.
# CSV that is read below for its "review" and "sentiment" columns.
DATA = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
# Word-vector file in text form; parsed by ``load_vectors``.
FASTTEXT_EMBEDDINGS = "../input/fasttext-wikinews/wiki-news-300d-1M.vec"

In [9]:
def sentence_to_vec(sentence, embedding_dict=None, stop_words=None, tokenizer=None):
    """Embed a sentence as the L2-normalised sum of its word vectors.

    The sentence is lower-cased and tokenised; stop words, non-alphabetic
    tokens and tokens without an embedding are dropped; the remaining word
    vectors are summed and scaled to unit length.

    :param sentence: text to embed (converted with ``str``).
    :param embedding_dict: mapping ``word -> vector``; ``None`` means empty.
    :param stop_words: iterable of words to ignore; ``None`` means none.
    :param tokenizer: callable ``str -> list[str]``; defaults to ``str.split``
        when omitted (previously the ``None`` default raised ``TypeError``).
    :return: 1-D ``numpy`` array. An all-zero vector is returned when no token
        has an embedding; its length matches the embedding width, falling
        back to 300 when ``embedding_dict`` is empty.
    """
    # ``None`` sentinels instead of mutable ``{}`` / ``[]`` defaults.
    if embedding_dict is None:
        embedding_dict = {}
    # A set makes the per-token stop-word test O(1) instead of O(len(list)).
    stop_words = frozenset(stop_words) if stop_words else frozenset()
    if tokenizer is None:
        tokenizer = str.split

    tokens = tokenizer(str(sentence).lower())
    word_vectors = [
        embedding_dict[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embedding_dict
    ]

    if not word_vectors:
        # Size the zero vector like the embeddings so rows can be stacked.
        if embedding_dict:
            dim = len(next(iter(embedding_dict.values())))
        else:
            dim = 300
        return np.zeros(dim)

    v = np.array(word_vectors).sum(axis=0)
    length = np.sqrt((v ** 2).sum())
    if length == 0:
        # Word vectors cancelled out exactly; avoid 0/0 -> NaN.
        return v
    return v / length


In [10]:
import io
from sklearn import linear_model, metrics, model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

def load_vectors(fname):
    """Load word vectors from a fastText ``.vec`` text file.

    The first line is a header of two integers (vocabulary size and vector
    dimension); every following line is ``word v1 v2 ... vd`` separated by
    single spaces.

    :param fname: path of the ``.vec`` file.
    :return: dict mapping each word to its vector as a list of floats.
    :raises ValueError: if the header line is not two integers or a vector
        component is not a number.
    """
    # adapted from: https://fasttext.cc/docs/en/english-vectors.html
    data = {}
    # ``with`` guarantees the handle is closed even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header is parsed only to validate the format; the values are unused.
        _num_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            if len(tokens) < 2:
                # Blank or word-only line: no vector to store.
                continue
            data[tokens[0]] = [float(value) for value in tokens[1:]]

    return data

In [11]:
%%time
# Read only the first 10,000 rows to keep the experiment fast.
df = pd.read_csv(DATA, nrows=10000)
# Binary target: 1 for "positive", 0 for every other label.
df["sentiment"] = (df["sentiment"] == "positive").astype(int)
# Shuffle with a fixed seed so the row order (and every score computed from
# it further down) is the same after Restart & Run All.
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

CPU times: user 149 ms, sys: 29.3 ms, total: 178 ms
Wall time: 496 ms


In [12]:
%%time
# Parse the whole fastText file into an in-memory ``word -> vector`` dict.
print("Embeddings...")
embeddings = load_vectors(FASTTEXT_EMBEDDINGS)

Embeddings...


In [33]:
from numpy import dot
from numpy.linalg import norm

def _cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``."""
    return dot(u, v) / (norm(u) * norm(v))


# Pull the vectors of two antonym pairs out of the embedding table.
good, bad, worst, best = (
    np.array(embeddings.get(word)) for word in ("good", "bad", "worst", "best")
)

# Compare each antonym pair and show its similarity.
good_bad = _cosine_similarity(good, bad)
print(good_bad)
best_worst = _cosine_similarity(best, worst)
print(best_worst)

0.8331158890694513
0.7301328920606268


In [34]:
%%time
print("Sentence Vectors..")
# One vector per review, stacked into a 2-D array (rows follow ``df`` order).
vectors = np.array([
    sentence_to_vec(
        sentence=review,
        embedding_dict=embeddings,
        stop_words=[],
        tokenizer=word_tokenize,
    )
    for review in df["review"].values
])
y = df["sentiment"].values

# 3-fold stratified cross-validation of a logistic-regression baseline.
kf = model_selection.StratifiedKFold(n_splits=3)
for fold, (train_idx, val_idx) in enumerate(kf.split(X=vectors, y=y)):
    print("FOLD: ", fold)
    xtrain, ytrain = vectors[train_idx, :], y[train_idx]
    xval, yval = vectors[val_idx, :], y[val_idx]

    model = linear_model.LogisticRegression()
    model.fit(xtrain, ytrain)

    # Score the held-out fold.
    preds = model.predict(xval)
    acc = metrics.accuracy_score(yval, preds)
    print(f"Accuracy: {np.round(acc, 3)}")


Sentence Vectors..
FOLD:  0
Accuracy: 0.798
FOLD:  1
Accuracy: 0.824
FOLD:  2
Accuracy: 0.817
CPU times: user 2min 4s, sys: 832 ms, total: 2min 5s
Wall time: 2min 4s


In [37]:
%%time
#create_folds.py
def create_folds(df, num_folds=5, random_state=None, target_col="sentiment"):
    """Return a shuffled copy of ``df`` with a stratified ``"folds"`` column.

    The input frame is not modified: ``sample`` produces a new frame, which
    is re-indexed 0..n-1 and then labelled.

    :param df: DataFrame containing the ``target_col`` column.
    :param num_folds: number of stratified folds to create.
    :param random_state: seed forwarded to ``DataFrame.sample``; pass an int
        to get the same fold assignment on every run. ``None`` (default)
        keeps the previous unseeded behaviour.
    :param target_col: column whose class balance is preserved in each fold.
    :return: new DataFrame whose ``"folds"`` column holds, for every row, the
        index of the fold in which that row is used for validation.
    """
    df = df.sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    df["folds"] = -1
    kf = model_selection.StratifiedKFold(n_splits=num_folds)
    # Only the number of rows of X matters for splitting, so the whole frame
    # is passed instead of relying on a specific feature column.
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=df[target_col])):
        # ``val_idx`` is positional; after ``reset_index`` positions and
        # labels coincide, so ``.loc`` addresses the intended rows.
        df.loc[val_idx, "folds"] = fold

    return df

# Rebind ``df`` to the reshuffled frame that carries the "folds" column.
# NOTE(review): re-running this cell reshuffles and relabels ``df`` again.
df = create_folds(df)
df

CPU times: user 9.51 ms, sys: 31 µs, total: 9.54 ms
Wall time: 10.9 ms


Unnamed: 0,review,sentiment,folds
0,Sure it may not be a classic but it's one full...,1,0
1,This might have been an excellent flick. Howev...,0,0
2,This poor remake of the 1963 classic starts re...,0,0
3,At school I was taught how some shots were cal...,1,0
4,Farrah Fawcett gives an award nominated perfor...,1,0
...,...,...,...
9995,I mistakenly kept myself awake late last night...,0,4
9996,Please! Do not waste any money on this movie. ...,0,4
9997,I wanted to see the movie because of an articl...,0,4
9998,"If this movie is coming to a theater near you,...",0,4


In [39]:
#dataset.py
import torch

class IMDBDataset:
    """Map-style dataset that pairs each review with its target.

    Implements ``__len__`` and ``__getitem__`` so it can be consumed by
    ``torch.utils.data.DataLoader``.
    """

    def __init__(self, reviews, targets):
        """
        :param reviews: 2-D array indexed as ``reviews[item, :]``; each row is
            converted to a ``torch.long`` tensor.
        :param targets: sequence indexed as ``targets[item]``; each value is
            converted to a ``torch.float`` tensor.
        """
        self.reviews = reviews
        self.targets = targets

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return ``{"review": LongTensor, "target": FloatTensor}`` for ``item``."""
        review = self.reviews[item, :]
        target = self.targets[item]
        return {
            "review": torch.tensor(review, dtype=torch.long),
            # Fixed: this used to be built from ``review``, so the model was
            # handed the token ids as its label.
            "target": torch.tensor(target, dtype=torch.float),
        }


In [None]:
### lstm.py
import torch.nn as nn

class LSTM(nn.Module):
    """Bidirectional LSTM classifier on top of frozen pre-trained embeddings.

    The LSTM outputs are mean- and max-pooled over the sequence dimension,
    concatenated, and projected to a single logit per example.
    """

    def __init__(self, embedding_matrix, hidden_size=128):
        """
        :param embedding_matrix: 2-D array of shape (num_words, embedding_dim)
            holding the pre-trained word vectors; copied into the embedding
            layer and excluded from training.
        :param hidden_size: hidden units per LSTM direction (default 128, the
            value that was previously hard-coded).
        """
        super().__init__()
        num_words = embedding_matrix.shape[0]
        embedding_dim = embedding_matrix.shape[1]

        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=embedding_dim)
        # Replace the random initial weights with the pre-trained vectors and
        # freeze them.
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            requires_grad=False,
        )

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            bidirectional=True,
            batch_first=True,
        )
        # 2 directions * hidden_size features per step, doubled again because
        # the mean-pooled and max-pooled vectors are concatenated.
        self.out = nn.Linear(4 * hidden_size, 1)

    def forward(self, x):
        """
        :param x: LongTensor of token ids, shape (batch, seq_len).
        :return: FloatTensor of raw logits, shape (batch, 1).
        """
        x = self.embedding(x)
        x, _ = self.lstm(x)

        avg_pool = torch.mean(x, 1)
        # ``torch.max`` with a dim returns (values, indices); only the values
        # can be concatenated. Passing the pair to ``torch.cat`` raised.
        max_pool, _ = torch.max(x, 1)

        out = torch.cat((avg_pool, max_pool), 1)
        return self.out(out)
        
        

In [40]:
#engine.py

def train(data_loader, model, optimizer, device):
    """Run one training epoch with binary cross-entropy on logits.

    :param data_loader: iterable of batches; each batch is a dict with a
        ``"review"`` tensor and a ``"target"`` tensor.
    :param model: module mapping a LongTensor batch to logits of shape
        (batch, 1).
    :param optimizer: optimizer over the model's trainable parameters.
    :param device: device the batches are moved to before the forward pass.
    :return: ``None``; the model is updated in place.
    """
    # Make sure dropout / batch-norm layers are in training mode, e.g. after
    # a preceding call to ``evaluate`` switched the model to eval mode.
    model.train()
    # The loss is a module: instantiate it once, then call it on the tensors.
    # The previous code passed the tensors to the constructor, which yields a
    # module (not a loss value) and made ``loss.backward()`` fail.
    criterion = nn.BCEWithLogitsLoss()

    for data in data_loader:
        reviews = data["review"].to(device, dtype=torch.long)
        targets = data["target"].to(device, dtype=torch.float)

        optimizer.zero_grad()

        predictions = model(reviews)
        # Targets are reshaped to (batch, 1) to match the logits.
        loss = criterion(predictions, targets.view(-1, 1))

        loss.backward()
        optimizer.step()
        
def evaluate(data_loader, model, device):
    """Collect the model's raw outputs and the targets over a data loader.

    The model is switched to eval mode and gradients are disabled.

    :param data_loader: iterable of batches; each batch is a dict with a
        ``"review"`` tensor and a ``"target"`` tensor.
    :param model: module mapping a LongTensor batch to an output tensor.
    :param device: device the reviews are moved to before the forward pass.
    :return: ``(final_preds, final_targets)`` as Python lists. Predictions are
        the model outputs as returned (no sigmoid or threshold is applied
        here), one entry per example in loader order.
    """
    final_preds = []
    final_targets = []

    model.eval()

    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"].to(device, dtype=torch.long)

            preds = model(reviews)

            # Targets are only accumulated on the host, so they are read
            # straight from the batch; the earlier copy to ``device`` was
            # discarded unused.
            final_preds.extend(preds.cpu().numpy().tolist())
            final_targets.extend(data["target"].cpu().numpy().tolist())

    return final_preds, final_targets