In [54]:
import pandas as pd
import spacy

from lib.text_preprocessing import preprocess_text, clean_html, to_lower, simple_punctuation_only, lemmatize, remove_stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
from typing import Dict, List, Optional, Union

tqdm.pandas()

In [30]:
lang = spacy.load('en_core_web_sm')
pipeline = [lemmatize, remove_stopwords, simple_punctuation_only, to_lower]

In [32]:
df = pd.read_csv('../data/raw/IMDB Dataset.csv')
df = df[:1000]

df['review_clean'] = df['review'].progress_apply(lambda s: preprocess_text(s, lang, pipeline, [clean_html]))
df.head()

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that ...,positive,reviewer mention watch 1 oz episode hook. righ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production. filming technique...
2,I thought this was a wonderful way to spend ti...,positive,think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,negative,basically family little boy (jake) think zombi...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [40]:
X_train, X_test = train_test_split(df, train_size=0.7, random_state=42)

In [41]:
X_train

Unnamed: 0,review,sentiment,review_clean
541,An idiotic dentist finds out that his wife has...,negative,"idiotic dentist find wife unfaithful. , new st..."
440,This movie is lame and not funny at all. The p...,negative,movie lame funny . plot sense. scientist work ...
482,"Weak,stale, tired, cliched; wants to be Basic ...",negative,"weak,stale, tired, cliched; want basic instinc..."
422,"First and foremost, I loved the novel by Ray B...",negative,"foremost, love novel ray bradbury. kind horror..."
778,"I never watched the 'Next Action Hero' show, a...",positive,"watch action hero , read comment , know movie ..."
...,...,...,...
106,The performance of every actor and actress (in...,positive,performance actor actress ( film) excellently ...
270,"Clifton Webb is one of my favorites. However, ...",negative,"clifton webb favorite. , mister scoutmaster go..."
860,This production was quite a surprise for me. I...,positive,production surprise . absolutely love obscure ...
435,You wear only the best Italian suits from Arma...,positive,"wear good italian suit armani, hand stitch fit..."


In [38]:
Y_test

NameError: name 'Y_test' is not defined

In [4]:
enc = LabelEncoder()
enc.fit(df['sentiment'].to_numpy())
y = enc.transform(df['sentiment'])
y.shape

(1000,)

In [5]:
vectorizer = TfidfVectorizer(max_features=50000)
X = vectorizer.fit_transform(df['review'])
X.shape

(1000, 17922)

In [20]:
len(vectorizer.get_feature_names())

17922

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((800, 17922), (200, 17922), (800,), (200,))

In [7]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=42)

In [8]:
predictions = text_classifier.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[81 23]
 [23 73]]
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       104
           1       0.76      0.76      0.76        96

    accuracy                           0.77       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.77      0.77      0.77       200

0.77


In [70]:
X.shape

(1000, 17922)

In [17]:
import pytorch_lightning as pl
import torch
import torch.nn.functional as F

from torch import nn
from torch.utils.data import Dataset, DataLoader

X_train_t = torch.from_numpy(X_train.toarray())
X_test_t = torch.from_numpy(X_test.toarray())
y_train_t = torch.from_numpy(y_train)
y_test_t = torch.from_numpy(y_test)

X_t = torch.from_numpy(X.toarray()).type(torch.FloatTensor)
y_t = torch.from_numpy(y).type(torch.FloatTensor)

class SentimentDataset(Dataset):

    def __init__(self):
        pass

    def __len__(self):
        return X.shape[0]

    def __getitem__(self, idx):
        return [X_t[idx], y_t[idx]]


class SentimentDataModule(pl.LightningDataModule):

    def __init__(self):
        super().__init__()
        pass

    def prepare_data(self) -> None:
        pass

    def setup(self, stage: Optional[str] = None) -> None:
        pass

    def train_dataloader(self) -> Union[DataLoader, List[DataLoader], Dict[str, DataLoader]]:
        dataset = SentimentDataset()
        return DataLoader(dataset, batch_size=250)

    def val_dataloader(self) -> Union[DataLoader, List[DataLoader]]:
        dataset = SentimentDataset()
        return DataLoader(dataset, batch_size=250)

    def test_dataloader(self) -> Union[DataLoader, List[DataLoader]]:
        dataset = SentimentDataset()
        return DataLoader(dataset, batch_size=10)


class SentimentClassifier(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(X_train_t.shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 2),
            nn.LogSoftmax(dim=1))

    def forward(self, X):
        return self.model(X)

    def training_step(self, batch, batch_idx):
        X, y = batch
        z = self(X)
        loss = F.nll_loss(z, y.type(torch.LongTensor))
        self.log('train_loss', loss, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y = batch
        z = self(X)
        loss = F.nll_loss(z, y.type(torch.LongTensor))
        self.log('validation_loss', loss, prog_bar=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.002)

(X_t.shape, y_t.shape)

(torch.Size([1000, 17922]), torch.Size([1000]))

In [53]:
t = torch.tensor([1,2,3,4,5,6,7,8,9,10, 11, 12])
m = torch.zeros(10)
F.pad(t, [0, 10 - t.shape[0]], mode='constant', value=0)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [18]:
data = SentimentDataModule()
classifier = SentimentClassifier()

trainer = pl.Trainer(max_epochs=5)
trainer.fit(classifier, data)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 1.1 M 
-------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.589     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [23]:
a = [torch.tensor([0.1, 0.2]), torch.tensor([2,3])]
torch.stack(a)

tensor([[0.1000, 0.2000],
        [2.0000, 3.0000]])

In [63]:
vectorizer.transform(['hello world', 'wuhuuu'])

<2x17922 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [61]:
m = nn.Softmax(dim=1)
input = torch.randn(10, 3)
m(input), input

(tensor([[0.6378, 0.2236, 0.1385],
         [0.2834, 0.1194, 0.5971],
         [0.0962, 0.5562, 0.3475],
         [0.7077, 0.0109, 0.2815],
         [0.1490, 0.1711, 0.6798],
         [0.1536, 0.7008, 0.1457],
         [0.5729, 0.2285, 0.1986],
         [0.1283, 0.7184, 0.1532],
         [0.3433, 0.4774, 0.1793],
         [0.4072, 0.4145, 0.1783]]),
 tensor([[ 0.8071, -0.2411, -0.7198],
         [ 0.1799, -0.6843,  0.9251],
         [-1.1688,  0.5857,  0.1154],
         [ 1.1061, -3.0716,  0.1841],
         [-0.5397, -0.4013,  0.9780],
         [-0.7005,  0.8174, -0.7535],
         [-0.0708, -0.9902, -1.1304],
         [-0.2136,  1.5091, -0.0359],
         [ 0.0146,  0.3444, -0.6347],
         [ 1.0143,  1.0318,  0.1884]]))