In [5]:
import re
import numpy as np
import pandas as pd

# from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

import torch 
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


import matplotlib.pyplot as plt

# from sklearn.svm import *
# from sklearn.linear_model import *
# from sklearn.ensemble import RandomForestClassifier

In [6]:
# Training set, test set and submission template.
train, test, sub = (
    pd.read_csv(path)
    for path in ('data/Train.csv', 'data/Test.csv', 'data/SampleSubmission.csv')
)

# The IDs in the submission template appear to be out of order,
# so they are overwritten with the IDs in test-file order.
sub['ID'] = test['ID']

# Target column to be predicted, taken from the training set.
target = train['label']

# Dimensions of the training and test dataframes.
train.shape, test.shape

((70000, 3), (30000, 2))

In [7]:
# Basic preprocessing
def preprocess(txt):
    """Collapse every run of spaces into one space and lowercase the text.

    Args:
        txt: the string to normalize.

    Returns:
        The normalized string.
    """
    collapsed = re.sub(r' +', ' ', txt)
    return collapsed.lower()

def remove_sw(corpus, stop_words):
    """Drop every stop word from each document of a text Series.

    Documents are tokenized on single spaces, tokens found in
    ``stop_words`` are discarded, and the remaining tokens are joined
    back with single spaces.

    Args:
        corpus: pandas Series of strings.
        stop_words: iterable of tokens to remove (exact, case-sensitive match).

    Returns:
        A pandas Series of strings with the same index as ``corpus``.
    """
    # Materialize once as a set: constant-time membership tests, and a
    # one-shot iterable is not exhausted after the first document.
    stop_set = set(stop_words)
    corpus = corpus.apply(
        lambda doc: [tok for tok in doc.split(' ') if tok not in stop_set]
    )
    return corpus.str.join(' ')

# Normalized text column for both splits.
train['txt_ok'] = train.text.apply(preprocess)
test['txt_ok'] = test.text.apply(preprocess)

REMOVE_STOPWORDS = True
# REMOVE_STOPWORDS = False

if REMOVE_STOPWORDS:
    # A token occurring more than THRESHOLD times in the training set
    # is treated as a stop word.
    THRESHOLD = 5000
    # Count on the *preprocessed* text: the stop words are removed from
    # `txt_ok` (lowercased), so counting on the raw text would both miss
    # capitalized tokens and split a word's frequency across casings.
    words = train['txt_ok'].str.split(' ').explode()
    word_counts = words.value_counts()
    stop_words = word_counts[word_counts > THRESHOLD].index.tolist()
    train['txt_ok'] = remove_sw(train['txt_ok'], stop_words=stop_words)
    test['txt_ok'] = remove_sw(test['txt_ok'], stop_words=stop_words)

In [8]:
# The whole text collection: training documents followed by test documents.
corpus = pd.concat((train['txt_ok'], test['txt_ok']))

In [9]:
# Vectorization: turn the texts into a numeric
# documents x terms matrix.

N_FEATURES = 100_000

# The vocabulary and IDF weights are fitted on the full corpus (train + test).
vec = TfidfVectorizer(
    max_features=N_FEATURES,
    ngram_range=(1, 3),
    sublinear_tf=False,
)
vec.fit(corpus)
train_vec, test_vec = (vec.transform(frame['txt_ok']) for frame in (train, test))

In [10]:
# Sanity check: shape of the vectorized training matrix and of the target.
train_vec.shape, target.shape

((70000, 100000), (70000,))

In [26]:
class DS(Dataset):
    """Dataset that densifies one row of a sparse feature matrix per item.

    Args:
        data: 2-D sparse matrix with one row per sample; it must support
            ``data[i, :].nonzero()`` and ``data[i, cols].toarray()``
            (in this notebook it is the output of
            ``TfidfVectorizer.transform``).
        label: per-sample targets, either a pandas Series or any
            integer-indexable sequence/array.
        n_features: length of the dense vector returned for each sample.
    """

    def __init__(self, data, label, n_features):
        super().__init__()
        self.data = data   #torch.from_numpy(data)
        self.label = label #torch.from_numpy(label)
        self.data_len = data.shape[0]
        self.n_features = n_features

    def __len__(self):
        """Return the number of samples (rows of ``data``)."""
        return self.data_len

    def __getitem__(self, idx):
        """Return ``(X, y)`` for the sample at position ``idx``.

        ``X`` is a float32 tensor of length ``n_features`` and ``y`` is
        the label stored at the same position.
        """
        # Column indices of the non-zero entries of this row.
        cols = self.data[idx, :].nonzero()[1]
        # Build the dense row directly in float32 (the tensor dtype).
        X = np.zeros(self.n_features, dtype=np.float32)
        X[cols] = self.data[idx, cols].toarray().ravel()
        X = torch.from_numpy(X)

        # Positional lookup: a pandas Series indexed with [] is label-based,
        # which breaks (KeyError or wrong row) when its index is not 0..n-1.
        labels = self.label
        y = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
        return X, y

In [27]:
# class Net2(torch.nn.Module):
#     def __init__(self, n_features):
#         super().__init__()
#         self.hidden = 600
#         self.fc1 = torch.nn.Linear(n_features, self.hidden)
#         self.fc2 = torch.nn.Linear(self.hidden, self.hidden // 2)
#         self.fc3 = torch.nn.Linear(self.hidden // 2, 3)

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.dropout(x, p=0.4, training=self.training)
#         x = F.relu(self.fc2(x))
#         x = F.dropout(x, p=0.4, training=self.training)
#         x = F.relu(self.fc3(x))
#         return x

In [203]:
class Net(pl.LightningModule):
    """Three-layer fully connected classifier trained with cross-entropy.

    Layer sizes: ``n_features -> 600 -> 300 -> n_classes``, with ReLU and
    dropout (p=0.4) after each of the two hidden layers.

    Args:
        n_features: size of each input vector.
        n_classes: number of output units (one logit per class).
    """

    def __init__(self, n_features, n_classes=1):
        super().__init__()
        self.hidden = 600
        self.fc1 = torch.nn.Linear(n_features, self.hidden)
        self.fc2 = torch.nn.Linear(self.hidden, self.hidden // 2)
        self.fc3 = torch.nn.Linear(self.hidden // 2, n_classes)

    def forward(self, x, training=False):
        """Return the raw class logits for a batch ``x``.

        Args:
            x: input batch whose last dimension is ``n_features``.
            training: when True, dropout is applied; this flag is used
                instead of the module's own ``self.training`` state.
        """
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.4, training=training)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, p=0.4, training=training)
        # No activation on the output layer: cross-entropy expects
        # unnormalized logits, and a ReLU here would clamp them to >= 0.
        return self.fc3(x)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=1e-3)."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss of one ``(x, y)`` batch."""
        x, y = batch
        logits = self(x, training=True)
        # Functional form: no loss module is rebuilt on every step.
        return F.cross_entropy(logits, y)

In [57]:
# F.cross_entropy?

In [58]:
# train_dataset = DS(train_vec, target + 1, n_features=N_FEATURES)
# train_loader = DataLoader(train_dataset, batch_size=16)

In [59]:
# for X, y in train_loader:
#     break
    
# X.shape, y

In [60]:
# Labels shifted by +1 (presumably so that the smallest class id is 0,
# as cross-entropy requires -- confirm against the label encoding).
labels = target + 1
# Cross-entropy needs one output unit per class id, i.e. max id + 1.
# Net's default n_classes=1 would not match these multi-class targets.
n_classes = int(labels.max()) + 1

train_dataset = DS(train_vec, labels, n_features=N_FEATURES)
train_loader = DataLoader(train_dataset, batch_size=16)

model = Net(n_features=N_FEATURES, n_classes=n_classes)
# trainer = pl.Trainer(max_steps=3)
trainer = pl.Trainer(max_steps=20, progress_bar_refresh_rate=10)
trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 60 M  
1 | fc2  | Linear | 180 K 
2 | fc3  | Linear | 903   


Training: |          | 0/? [00:00<?, ?it/s]

1

In [None]:
# class TextSentiment(nn.Module):
#     def __init__(self, vocab_size, embed_dim, num_class):
#         super().__init__()
#         self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
#         self.fc = nn.Linear(embed_dim, num_class)
#         self.init_weights()

#     def init_weights(self):
#         initrange = 0.5
#         self.embedding.weight.data.uniform_(-initrange, initrange)
#         self.fc.weight.data.uniform_(-initrange, initrange)
#         self.fc.bias.data.zero_()

#     def forward(self, text, offsets):
#         embedded = self.embedding(text, offsets)
#         return self.fc(embedded)

---