## Submissão básica

- Neste notebook vamos criar uma submissão <strong>baseline</strong>, que é uma submissão inicial sem muita análise e processamento.
- A ideia é entender como treinar um modelo simples com os dados e criar uma submissão válida, que será nosso ponto de partida a ser superado.
- Também serve para verificar se nossa validação local está semelhante à pontuação na competição. Como a quantidade de submissões ao longo da competição é limitada em 300, o ideal é passarmos a maior parte do tempo validando os experimentos localmente.
- Ou seja, é esencial que a acurácia obtida na validação local se aproxime da acurácia do leaderboard.

In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import *
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Treino, teste e modelo de submissão
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')
sub = pd.read_csv('data/SampleSubmission.csv')

# Parece que os IDs do modelo de submissão estão fora de ordem
# então substituímos eles pela ordem do arquivo de teste
sub.ID = test.ID

# Target a ser previsto o treino
target = train['label']

# Dimensões dos dataframes de treino e teste
train.shape, test.shape

((70000, 3), (30000, 2))

In [27]:
# Distribuição dos targets
train.label.value_counts()

 1    38239
-1    29295
 0     2466
Name: label, dtype: int64

In [28]:
words = train.text.str.split(' ').explode()
words.value_counts()[:10]

w       19465
ya       8961
fi       8837
el       7808
slim     7103
rabi     6477
si       3631
ca       3144
l        3107
kol      3093
Name: text, dtype: int64

In [29]:
THRESHOLD = 5000
stop_words = words.value_counts()[words.value_counts() > THRESHOLD].index.tolist()

print('STOPWORDS:', stop_words)

STOPWORDS: ['w', 'ya', 'fi', 'el', 'slim', 'rabi']


In [30]:
train.head()

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1
3,U0TTYY8,ak slouma,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1


In [88]:
# Preprocessamento básico
def preprocess(txt):   
    txt = re.sub(r' +', ' ', txt)
    txt = txt.lower()
    return txt

def remove_sw(corpus, stop_words):
    corpus = corpus.apply(lambda x: [e for e in x.split(' ') if e not in stop_words])
    corpus = corpus.str.join(' ')
    return corpus

train['txt_ok'] = train.text.apply(preprocess)
test['txt_ok'] = test.text.apply(preprocess)

REMOVE_STOPWORDS = True
# REMOVE_STOPWORDS = False

if REMOVE_STOPWORDS:
    train['txt_ok'] = remove_sw(train['txt_ok'], stop_words=stop_words)
    test['txt_ok'] = remove_sw(test['txt_ok'], stop_words=stop_words)

In [89]:
# Todo o conjunto de textos
corpus = pd.concat([train.txt_ok, test.txt_ok], axis=0)

In [244]:
# Vetorização: transformação dos textos em uma matriz numérica
# de documentos x termos.
vec = TfidfVectorizer(ngram_range=(1, 6), sublinear_tf=True)
vec.fit(corpus)
train_vec = vec.transform(train['txt_ok'])
test_vec = vec.transform(test['txt_ok'])

In [245]:
# Repare que a matriz gerada é esparsa, o que permite lidar
# com as 189093 colunas sem ocupar muita memória
train_vec.shape

(70000, 3097846)

In [246]:
# Validação cruzada estratificada 5-folds com um modelo linear
m = LinearSVC(max_iter=2000, random_state=42)

s = cross_val_score(
    m,
    train_vec, 
    target, 
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Média das 5 validações
print(np.round(s, 5))
print('Mean: ', np.mean(s))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.0s remaining:    9.0s


[0.78429 0.78614 0.78579 0.78629 0.78921]
Mean:  0.7863428571428572


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.7s finished


In [249]:
m = SGDClassifier(loss='log', penalty='l2', class_weight='balanced', tol=0.001, random_state=42)

s = cross_val_score(
    m,
    train_vec, 
    target, 
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

# Média das 5 validações
print(np.round(s, 5))
print('Mean: ', np.mean(s))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.0s remaining:    3.1s


[0.76171 0.75793 0.75714 0.76071 0.76193]
Mean:  0.7598857142857143


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.9s finished


In [62]:
# Validação cruzada estratificada 5-folds com um modelo linear
# m = LogisticRegressionCV(random_state=42, tol=0.1, solver='sag', multi_class='multinomial', n_jobs=-1, verbose=1)
m2 = LogisticRegression(random_state=42, multi_class='multinomial', n_jobs=-1)

s = cross_val_score(
    m2,
    train_vec, 
    target, 
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Média das 5 validações
print(np.round(s, 5))
print('Mean: ', np.mean(s))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.3min remaining:  4.9min


[0.59779 0.59893 0.595   0.59636 0.59943]
Mean:  0.5974999999999999


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.5min finished


In [24]:
# Treina com todos os dados e gera uma submissão
m.fit(train_vec, target)

sub.label = m.predict(test_vec)

sub.to_csv('sub.csv', index=False)
sub

Unnamed: 0,ID,label
0,2DDHQW9,1
1,5HY6UEY,-1
2,ATNVUJX,1
3,Q9XYVOQ,1
4,TOAHLRH,1
...,...,...
29995,NHXTL3R,-1
29996,U1YWB2O,1
29997,O3KYLM0,1
29998,W4C38TY,-1


In [25]:
test_vec.todense().shape

(30000, 500000)

---

In [22]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [23]:
class MulticlassClassification(nn.Module):
    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer1 = nn.Linear(num_feature, 512)
        self.layer2 = nn.Linear(512, 128)
        self.layer3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class) 
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.3)

        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.layer2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x
    
model = MulticlassClassification(num_feature=train_vec.todense().shape[1], num_class=3)
# model.to(device)

criterion = nn.CrossEntropyLoss() #weight=[class_weights].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
print(model)


MulticlassClassification(
  (layer1): Linear(in_features=500000, out_features=512, bias=True)
  (layer2): Linear(in_features=512, out_features=128, bias=True)
  (layer3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [42]:
def to_sparsetensor(x):

    coo = x.tocoo()
    indices = np.vstack((coo.row, coo.col))
    i = torch.LongTensor(indices)
    v = torch.FloatTensor(coo.data)
    shape = coo.shape

    tensor = torch.sparse.FloatTensor(i, v, torch.Size(shape))
    return tensor

In [44]:
# to_sparsetensor(test_vec).to_dense()

In [71]:
for i in train_loader:
    print(i)
    break

tensor(indices=tensor([[    0,     0,     0,  ...,   127,   127,   127],
                       [74857, 74797, 70861,  ...,  6874,  4905,  2457]]),
       values=tensor([0.3395, 0.1640, 0.3130,  ..., 0.4038, 0.3857, 0.1838]),
       size=(128, 100000), nnz=1525, layout=torch.sparse_coo)


In [65]:
train_loader = DataLoader(dataset=to_sparsetensor(train_vec), batch_size=128) #, sampler=weighted_sampler)
val_loader = DataLoader(dataset=to_sparsetensor(test_vec), batch_size=1)
test_loader = DataLoader(dataset=to_sparsetensor(test_vec), batch_size=1)


In [46]:
print("Begin training.")
EPOCHS = 50

for e in tqdm(range(1, EPOCHS+1)):
    
    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0

model.train()

for X_train_batch, y_train_batch in train_loader:
    X_train_batch, y_train_batch = X_train_batch, y_train_batch
    optimizer.zero_grad()

    y_train_pred = model(X_train_batch)

    train_loss = criterion(y_train_pred, y_train_batch)
    train_acc = multi_acc(y_train_pred, y_train_batch)

    train_loss.backward()
    optimizer.step()

    train_epoch_loss += train_loss.item()
    train_epoch_acc += train_acc.item()

# VALIDATION    
with torch.no_grad():

    val_epoch_loss = 0
    val_epoch_acc = 0

    model.eval()
    for X_val_batch, y_val_batch in val_loader:
        X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)

        y_val_pred = model(X_val_batch)

        val_loss = criterion(y_val_pred, y_val_batch)
        val_acc = multi_acc(y_val_pred, y_val_batch)

        val_epoch_loss += val_loss.item()
        val_epoch_acc += val_acc.item()
                
loss_stats['train'].append(train_epoch_loss/len(train_loader))
loss_stats['val'].append(val_epoch_loss/len(val_loader))
accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
                              
    
print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')

100%|██████████| 50/50 [00:00<00:00, 320175.88it/s]

Begin training.





TypeError: object of type 'zip' has no len()