# Load data

In [1]:
import pickle
import pandas as pd
import os
from pathlib import Path

# The data directory is expected next to the notebook's working directory:
# <parent of cwd>/data
PATH = os.path.join(Path.cwd().parent, 'data')

# Sub-directories of the data directory used by the cells below.
PATH_VOCAB, PATH_SUB, PATH_MODEL = (
    os.path.join(PATH, subdir)
    for subdir in ('vocabularies', 'submissions', 'models')
)

In [2]:
# Load the pickled class metadata. The context manager closes the file handle
# as soon as the object has been read (the original left it open).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(PATH, 'classes_dict.pkl'), 'rb') as file:
    classes_dict = pickle.load(file)

# presumably label -> index ("string to index"); verify against the training notebook
cat_dict = classes_dict['class_stoi']
# indexed later with the predicted class index to obtain the category written
# to the submission files
cat_dict_inv = classes_dict['class_itos']
# presumably the number of samples per class; not used in the visible cells
dict_count = classes_dict['class_count']

In [3]:
# Test set. Later cells read its `title` column as model input and use `id`
# as the index of the submission files.
test_csv_path = os.path.join(PATH, 'test.csv')
X_test = pd.read_csv(test_csv_path)

In [4]:
from joblib import load
import pickle
import torch


# Read the pickled vocabulary; it is passed to CountVectorizer below as the
# fixed `vocabulary`. The context manager closes the file after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(PATH_VOCAB, 'vocab_test_mytk.pkl'), 'rb') as file:
    vocab = pickle.load(file)

# My tokenizer

In [5]:
import pickle
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Load the pickled token -> value mapping consulted by `my_tokenizer`.
# The context manager closes the file after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.join(PATH_VOCAB, 'vocab_test_red.pkl'), 'rb') as file:
    vocab_mix = pickle.load(file)

# This vectorizer is only used here to obtain scikit-learn's word analyzer
# (with accent stripping); it is not fitted in this cell.
vectorizer = CountVectorizer(analyzer='word',
                             strip_accents='unicode',
                             min_df=2)
tokenizer = vectorizer.build_analyzer()

# Number of tokens that map to each value of `vocab_mix`.
counter = Counter(vocab_mix.values())

# Values shared by more than one token (not used further in the visible cells).
a = [x for x in counter if counter[x] > 1]

# value -> 'r_<token>' label. When several tokens share a value, the token
# iterated last provides the label, exactly as in the original loop.
vocab_transf = {value: 'r_{}'.format(key) for key, value in vocab_mix.items()}


def my_tokenizer(string):
    """Tokenize `string` and add a shared label for grouped tokens.

    The text is split with the module-level `tokenizer`. Tokens that are not
    keys of `vocab_mix` are discarded. For every kept token whose `vocab_mix`
    value is shared by more than one token (`counter[value] > 1`), the
    matching `vocab_transf` label is emitted immediately before the token.

    Args:
        string: the text to tokenize.

    Returns:
        list: the resulting tokens, in the order they appear in the text.
    """
    result = []
    for word in tokenizer(string):
        if word not in vocab_mix:
            continue
        group = vocab_mix[word]
        if counter[group] > 1:
            result.append(vocab_transf[group])
        result.append(word)
    return result

# Make BoW

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 


# Bag-of-words vectorizer: the columns are fixed by the pre-built `vocab`
# and titles are split with `my_tokenizer`.
vectorizer = CountVectorizer(analyzer='word',
                             strip_accents='unicode',
                             tokenizer=my_tokenizer,
                             vocabulary=vocab)

# Sparse document-term matrix of the test titles (one row per title).
vectors = vectorizer.fit_transform(X_test['title'])


# Predictions

## Utils

In [7]:
from scipy.sparse import coo_matrix
import numpy as np

def csr_to_tensor(X_sample):
    """Convert a SciPy sparse matrix into a float32 sparse COO torch tensor.

    Args:
        X_sample: anything accepted by `scipy.sparse.coo_matrix`, e.g. the
            sparse matrix produced by the vectorizer.

    Returns:
        torch.Tensor: sparse COO tensor with the same shape and non-zero
        entries as `X_sample`, with float32 values and int64 indices.
    """
    coo = coo_matrix(X_sample)

    # Row/column coordinates stacked into the (2, nnz) layout torch expects.
    indices = torch.tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.tensor(coo.data, dtype=torch.float32)

    # torch.sparse.FloatTensor(indices, values, size) is a deprecated legacy
    # constructor; torch.sparse_coo_tensor is the supported equivalent.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

def predict(model , X_test):
    """Predict one class index per row of a sparse feature matrix, on the CPU.

    Args:
        model: callable model with a `.to(device)` method (assumed to be a
            `torch.nn.Module` -- TODO confirm). The caller is responsible for
            putting it in eval mode.
        X_test: sparse feature matrix accepted by `csr_to_tensor`.

    Returns:
        numpy.ndarray: for each row, the index of the highest model output
        along dimension 1.
    """
    images = csr_to_tensor(X_test)
    model2 = model.to('cpu')
    # Inference only: disabling autograd avoids building a graph for the
    # forward pass, which saves memory and time and makes `.data` unnecessary.
    with torch.no_grad():
        outputs = model2(images)
    _, predicted = torch.max(outputs, 1)
    return predicted.cpu().numpy()

## load model, make predictions and make submission file

Note that predictions are made on the CPU.

In [None]:
from itertools import product

# Learning rates and model versions that identify the saved checkpoints.
vec_lr = [0.0003]
vec_v = [1, 2, 3, 4, 5, 6, 7]

for lr, v in product(vec_lr, vec_v):

    print(lr, v)
    # Load each model. map_location='cpu' lets a checkpoint that was saved
    # from a GPU be loaded on a CPU-only machine, matching the CPU prediction.
    # NOTE(review): torch.load unpickles the file -- only load trusted
    # checkpoints. Recent PyTorch releases may additionally require
    # weights_only=False to load a fully pickled model; confirm for your version.
    model_path = os.path.join(PATH_MODEL, 'model_test_mytk_prop20_lr{}_v{}.pt'.format(lr, v))
    model = torch.load(model_path, map_location='cpu')

    # Predicted class index for every test title.
    y_pred = predict(model.eval(), vectors)

    # Map class indices back to category labels.
    list_y_pred = [cat_dict_inv[y] for y in y_pred]

    # Make submission file. A new frame is built instead of adding a
    # `category` column to X_test, so the cell can be re-run without
    # depending on state left behind by a previous iteration.
    X = (
        X_test
        .drop(columns=['title', 'language'])
        .assign(category=list_y_pred)
        .set_index('id')
    )
    X.to_csv(os.path.join(PATH_SUB, 'sub_test_mytk_prop20_lr{}_v{}.csv'.format(lr, v)))
    
    

0.0003 3
0.0003 4
0.0003 5


#  Vote

## load predictions

In [None]:
# Load the seven per-model submission files (versions v1..v7, lr 0.0003).
# NOTE(review): these names contain 'sub_test_last_mytk_...', whereas the
# prediction loop in this notebook writes 'sub_test_mytk_...'; confirm that
# the '_last_' files are the intended inputs.
(
    sub_last_mytk_03_v1,
    sub_last_mytk_03_v2,
    sub_last_mytk_03_v3,
    sub_last_mytk_03_v4,
    sub_last_mytk_03_v5,
    sub_last_mytk_03_v6,
    sub_last_mytk_03_v7,
) = [
    pd.read_csv(os.path.join(PATH_SUB, 'sub_test_last_mytk_prop20_lr0.0003_v{}.csv'.format(v)))
    for v in range(1, 8)
]

## voting

In [None]:
from collections import Counter 

# Predictions of the seven models that take part in the vote.
models1 = [sub_last_mytk_03_v1.category, sub_last_mytk_03_v2.category, sub_last_mytk_03_v3.category,
           sub_last_mytk_03_v4.category, sub_last_mytk_03_v5.category, sub_last_mytk_03_v6.category,
           sub_last_mytk_03_v7.category]

# One Counter of votes per test row. zip(*models1) walks the prediction
# columns row by row, so no separate length is needed (the original looped
# over `sub08_03`, a name that is never defined in this notebook).
# Votes are inserted in model order, which is what later decides ties.
list_counters = [Counter(votes) for votes in zip(*models1)]

## Generate submission file of this group

In [None]:
# Majority vote per row: the category with the highest count wins; on a tie
# max() keeps the first such category in the Counter's iteration order.
categories = [max(count, key=count.get) for count in list_counters]

# The row position is used as the submission id.
# NOTE(review): assumes the ids of the test set are 0..n-1 in file order -- confirm.
ide = list(range(len(list_counters)))

sub_mix = pd.DataFrame({'category': categories, 'id': ide}).set_index('id')

sub_mix.to_csv(os.path.join(PATH_SUB,'sub_mytk_20.csv'))