# Ensembled Transferred Embeddings

**Ensembled Transferred Embeddings** 

- Installation and imports

- Read and preprocess data

- Uninformative detection
    - Train Autoencoder
    - Extract transferred embeddings from Autoencoder
    - Train Logistic regression model with transferred embeddings and labels from Mturk
    - Evaluate with ROC curve

- Item categorization
    - Preprocess text for the network
    - Train goods/services classifier
    - Train model on invoices with goods categories and model on invoices with service categories
    - Train model on eBay data
    - Extract transferred embeddings from each model
    - Build models with each embedding with Mturk labels (invoice-service embedding with service categories and 
                                                          invoice-goods and eBay embeddings with goods categories)
    - Predict on the test set
    - Evaluate using accuracy and F1

## Installation and imports

In [None]:
!pip install -q keras
!pip install -q gensim
!pip install scikit-plot
!pip install --upgrade tqdm

In [None]:
!nvidia-smi

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer, word_tokenize
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GlobalAveragePooling1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import nltk
from sklearn.naive_bayes import MultinomialNB
import gensim 



def pre_processing(text):
    """Normalize raw item text into lower-cased, space-separated word tokens.

    Every run of non-word characters and underscores is replaced by a single
    space (digits are word characters, so they are kept), the result is
    lower-cased, and the tokens are re-joined with single spaces so the output
    carries no leading, trailing or repeated whitespace.

    Args:
        text: Raw text of one item (e.g. item name plus description).

    Returns:
        The cleaned text as one string; empty when ``text`` contains no word
        characters.

    Raises:
        TypeError: If ``text`` is not a string (for example a pandas NaN).
    """
    # [\W_]+ matches punctuation, whitespace and underscores; letters and
    # digits survive.
    words_only = re.sub(r'[\W_]+', " ", text)
    # split() without an argument drops the empty tokens that a leading or
    # trailing separator would otherwise leave behind.
    tokens = words_only.lower().split()
    return " ".join(tokens)


## Read and preprocess data

In [None]:
# Locations of the input CSV files. Every path sits under
# '/content/drive/My Drive', so this presumably expects Google Drive to be
# mounted in a Colab session -- confirm before running elsewhere.
invoice_data_path = '/content/drive/My Drive/Colab Notebooks/paypal/paypal.csv'  # invoice items, read into invoice_data
ebay_data_path = '/content/drive/My Drive/Colab Notebooks/paypal/ebay.csv'  # eBay items, read into ebay_data
invoice_test_path = '/content/drive/My Drive/Colab Notebooks/paypal/test_set_9_3.csv'  # read into invoice_test
unknowns_path = '/content/drive/My Drive/Colab Notebooks/paypal/unsolvable items.csv'  # read into unknowns

In [None]:
# Load the four input tables from the CSV paths configured earlier.
invoice_data = pd.read_csv(invoice_data_path)

ebay_data = pd.read_csv(ebay_data_path)
# Keep only eBay rows that have both a 'text' and a 'vertical' value.
# NOTE(review): ebay_data['text'] is later rebuilt from 'ebay_item_name';
# confirm the CSV really ships a 'text' column, otherwise this raises KeyError.
ebay_data = ebay_data[(pd.notnull(ebay_data['text'])) & (pd.notnull(ebay_data['vertical']))]
# This file is read as ISO-8859-1 rather than pandas' default encoding.
invoice_test = pd.read_csv(invoice_test_path,encoding = "ISO-8859-1")
unknowns = pd.read_csv(unknowns_path)

In [None]:
# Drop repeated items so the same text is not counted more than once.
ebay_data = ebay_data.drop_duplicates(['ebay_item_name'])
# NOTE(review): duplicates are detected on 'item_name'/'description', while the
# text of invoice_test is later built from 'ITEM NAME'/'item description' --
# confirm that the file carries both pairs of columns.
invoice_test = invoice_test.drop_duplicates(['item_name','description'])

In [None]:
# Build one free-text field per row (item name, plus the description when there
# is one) and normalize it with pre_processing().
def _name_with_description(row, name_col, desc_col):
    """Return the item name, followed by the description when one is present.

    Args:
        row: One DataFrame row, as passed by ``DataFrame.apply(axis=1)``.
        name_col: Column holding the item name.
        desc_col: Column holding the optional item description.

    Returns:
        The name alone when the description is missing, otherwise
        ``name + ' ' + description``.
    """
    # pd.isna() tests for a genuinely missing value; comparing str(value) with
    # 'nan' would also throw away a description whose text is literally "nan".
    if pd.isna(row[desc_col]):
        return row[name_col]
    return row[name_col] + ' ' + row[desc_col]


invoice_data['text'] = invoice_data.apply(
    _name_with_description, axis=1, args=('item_name', 'description'))
invoice_data['text'] = invoice_data['text'].apply(pre_processing)

# eBay items have no separate description column in use here: the name is the text.
ebay_data['text'] = ebay_data['ebay_item_name'].apply(pre_processing)

invoice_test['text'] = invoice_test.apply(
    _name_with_description, axis=1, args=('ITEM NAME', 'item description'))
invoice_test['text'] = invoice_test['text'].apply(pre_processing)

# Keep only test rows that have both a label and a text.
invoice_test = invoice_test[invoice_test.category.notnull()]
invoice_test = invoice_test[invoice_test.text.notnull()]

unknowns['text'] = unknowns.apply(
    _name_with_description, axis=1, args=('ITEM NAME', 'item description'))
unknowns['text'] = unknowns['text'].apply(pre_processing)
# Every row of this table gets the single label 'unknown'.
unknowns['category'] = 'unknown'

In [None]:
# Categories treated as goods; any other category is handled as a service by
# the cells that split on is_ebay. Each name is listed exactly once
# ('food-n-drink' used to appear twice).
ebay_categories = [
    'fashion', 'auto-parts', 'cellphones', 'houseware', 'electronics',
    'sports-equip', 'memorabilia', 'toys', 'music-videos', 'arts-n-craft',
    'jewelry', 'cosmetics', 'computer-hardware', 'garden-equip', 'food-n-drink',
    'office-supplies', 'books', 'health', 'baby-products', 'pet-supplies',
    'furniture', 'nutritional-supp', 'cameras', 'coins',
    'music-instruments', 'software', 'tickets',
]
# Flag each row as goods (True) or not (False) from its category label.
invoice_data['is_ebay'] = invoice_data.true_indy_name.isin(ebay_categories)
invoice_test['is_ebay'] = invoice_test.category.isin(ebay_categories)

In [None]:
# Random split of the manually labelled invoices: each row lands in the
# training part with probability 0.75, otherwise it stays in the test part.
ind = np.random.random(size=len(invoice_test)) < 0.75
invoice_train_manual, invoice_test = invoice_test[ind], invoice_test[~ind]

In [None]:
# Distribute the 'unknown' items over both parts with the same 0.75 / 0.25
# random rule, then derive the boolean target of the uninformative detector.
ind = np.random.random(size=len(unknowns)) < 0.75
invoice_test = pd.concat([invoice_test, unknowns[~ind]])
invoice_train_manual = pd.concat([invoice_train_manual, unknowns[ind]])
invoice_test['unknown'] = invoice_test['category'].eq('unknown')
invoice_train_manual['unknown'] = invoice_train_manual['category'].eq('unknown')
# Share of 'unknown' rows in the test part (shown as the cell output).
invoice_test['unknown'].mean()

In [None]:
# Download the GloVe word embeddings.
# KeyedVectors is imported here; the loading cell uses it to read the vectors.
from gensim.models import KeyedVectors
# Notebook shell commands: fetch the glove.6B archive and unpack it into the
# current working directory.
!wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Load the GloVe model into memory.
from gensim.scripts.glove2word2vec import glove2word2vec
# Rewrite the 300-dimensional GloVe text file in word2vec text format so that
# KeyedVectors.load_word2vec_format can read it.
glove2word2vec(glove_input_file="glove.6B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")
# binary=False: the converted file is plain text.
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer index i; words without a GloVe vector keep a random row.
num_words = 30000
maxlen = 15
embed_dim = 300  # matches the glove.6B.300d vectors
tokenizer = Tokenizer(num_words=num_words, split=' ')
tokenizer.fit_on_texts(invoice_data['text'].values)
word_index = tokenizer.word_index

a = []  # words inside the num_words vocabulary that have no GloVe vector
# Allocate exactly num_words rows. Sizing the matrix from len(word_index) and
# slicing afterwards yields too few rows when the corpus has fewer distinct
# words than num_words, which no longer fits an Embedding(num_words, ...) layer.
embedding_matrix = np.random.random((num_words, embed_dim))
for word, i in tqdm(word_index.items()):
    if i >= num_words:
        # Outside the vocabulary kept by the tokenizer: no row to fill.
        continue
    # Query the KeyedVectors object directly; its `.wv` alias is not available
    # in gensim 4.
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]
    else:
        a.append(word)
# Number of in-vocabulary words left with a random vector (cell output).
non = len(a)
non

## Uninformative detection

In [None]:
# Build and train a sequence autoencoder on the invoice text. The encoder part
# (encoder_model) is reused afterwards to turn texts into fixed-size vectors.
from  keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Bidirectional
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout, RepeatVector
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint
from keras import optimizers
# Split every invoice text into a list of whitespace-separated tokens.
sentenses = invoice_data.text.str.split().values
# Turn the token lists into padded integer sequences for Keras.
num_words=30000
maxlen=15
tokenizer = Tokenizer(num_words = num_words, split=' ')
tokenizer.fit_on_texts(sentenses)
seqs = tokenizer.texts_to_sequences(sentenses)
# Train only on sequences with more than four tokens.
pad_seqs = []
for i in seqs:
    if len(i)>4:
        pad_seqs.append(i)
pad_seqs = pad_sequences(pad_seqs,maxlen)
# The model
embed_dim = 150
latent_dim = 30
batch_size = 502
# Encoder: token ids -> embedding -> GRU; the GRU output (latent_dim values)
# is the representation of the whole sequence.
encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
emb_layer = Embedding(num_words, embed_dim,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)
x = emb_layer(encoder_inputs)
state_h = GRU(latent_dim, name='Encoder-Last-GRU')(x)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)
# Decoder: repeat the encoded vector once per time step, run a GRU over it and
# predict a distribution over the num_words vocabulary at every position.
decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
decoder_gru = GRU(latent_dim, return_sequences=True, name='Decoder-GRU-before')
decoder_gru_output = decoder_gru(decoded)
decoder_dense = Dense(num_words, activation='softmax', name='Final-Output-Dense-before')
decoder_outputs = decoder_dense(decoder_gru_output)
seq2seq_Model = Model(encoder_inputs,decoder_outputs )
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
# The target is the input sequence itself; the extra trailing axis is the
# integer-label layout used with the sparse categorical loss.
history = seq2seq_Model.fit(pad_seqs, np.expand_dims(pad_seqs, -1),
          batch_size=batch_size,
          epochs=5,
          validation_split=0.05)


In [None]:
# Feature extraction from the autoencoder, followed by a linear classifier that
# separates 'unknown' items from the rest.
def _autoencoder_features(texts):
    """Tokenize and pad ``texts``, then return their encoder_model outputs."""
    padded = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    return encoder_model.predict(padded, verbose=1)


X_train = _autoencoder_features(invoice_train_manual['text'].values)

X_test = _autoencoder_features(invoice_test['text'].values)

# Logistic regression on top of the autoencoder embedding.
lr = LogisticRegression()
lr.fit(X_train, invoice_train_manual.unknown)
# Accuracy on the test part (shown as the cell output).
lr.score(X_test, invoice_test.unknown)

In [None]:
# ROC curve of the uninformative-item classifier on the test part, drawn from
# the predicted class probabilities.
from scikitplot.metrics import plot_roc
plot_roc(invoice_test.unknown, lr.predict_proba(X_test))

## Item categorization

In [None]:
# Prepare the inputs of the categorization networks.

# Integer-encode the invoice category names (cast to str first).
le = LabelEncoder()
category_names = invoice_data['true_indy_name'].astype(str)
y = le.fit_transform(category_names.values)
Y = le.transform(category_names).reshape(-1, 1)

# Fresh tokenizer fitted on the invoice text; sequences are padded to maxlen.
tokenizer = Tokenizer(num_words=num_words, split=' ')
invoice_texts = invoice_data['text'].values
tokenizer.fit_on_texts(invoice_texts)
X = pad_sequences(tokenizer.texts_to_sequences(invoice_texts), maxlen=maxlen)

# Of the test rows, use only those whose category the label encoder has seen.
known_category = invoice_test['category'].isin(le.classes_)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(invoice_test['text'][known_category].values),
    maxlen=maxlen)

Y_test = le.transform(invoice_test['category'][known_category]).reshape(-1, 1)



In [None]:
# Build and train the binary goods-vs-service classifier on the invoice data.
from  keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Bidirectional
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, LSTM
from keras.preprocessing import text, sequence
from keras.callbacks import ModelCheckpoint
num_words = 30000
maxlen=15

embed_dim = 300
lstm_out = 200
batch_size= 256

# Building the LSTM network


inp_phisical = Input(shape=(maxlen, ))

# Frozen embedding layer initialised from embedding_matrix (trainable=False).
x_phisical = Embedding(num_words, embed_dim,input_length = X.shape[1], trainable=False, weights=[embedding_matrix])(inp_phisical)
# x = Embedding(num_words, embed_dim,input_length = X.shape[1], trainable=True)(inp)
x_phisical = SpatialDropout1D(0.3)(x_phisical)
# Two stacked LSTMs: the first returns the full sequence, the second a single vector.
x_phisical = LSTM(lstm_out, return_sequences=True)(x_phisical)
x_phisical = LSTM(100)(x_phisical)
x_phisical = Dropout(0.2)(x_phisical)
dens_phisical = Dense(30)(x_phisical)
# One sigmoid unit: the target is the boolean is_ebay flag.
outp_phisical = Dense(1, activation="sigmoid")(dens_phisical)

model_phisical = Model(inputs=inp_phisical, outputs=outp_phisical)
model_phisical.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])


# Validate on the test rows whose category is known to the label encoder `le`.
# NOTE(review): assumes X_test holds the sequences of exactly those rows -- confirm.
model_phisical.fit(X, invoice_data.is_ebay, 
                  batch_size =batch_size,validation_data=(X_test, invoice_test[invoice_test['category'].isin(le.classes_)].is_ebay), 
                   epochs =2,  verbose = 1)

# model.summary()

In [None]:
# Build the goods and the service category models on the invoice data. Their
# 30-unit dense layers (dens_goods / dens_service) are what the embedding
# sub-models expose as transferred embeddings.
le_service = LabelEncoder()
le_goods = LabelEncoder()

# Rows flagged is_ebay are goods; all remaining rows are treated as services.
y_service = le_service.fit_transform(invoice_data[(~invoice_data.is_ebay.values)]['true_indy_name'].astype(str).values)
y_goods = le_goods.fit_transform(invoice_data[(invoice_data.is_ebay.values)]['true_indy_name'].astype(str).values)

# Test rows whose category is known to at least one of the two encoders.
# NOTE(review): the validation masks below index X_test with this frame, so
# X_test is assumed to hold the sequences of exactly these rows -- confirm.
test_informative = invoice_test[(invoice_test['category'].isin(le_service.classes_)) | (invoice_test['category'].isin(le_goods.classes_))]

y_test_goods = le_goods.transform(invoice_test.category[invoice_test.category.isin(le_goods.classes_)])
y_test_service = le_service.transform(invoice_test.category[invoice_test.category.isin(le_service.classes_)])

# --- goods model -----------------------------------------------------------
inp_goods = Input(shape=(maxlen, ))

# Frozen embedding layer initialised from embedding_matrix.
x_goods = Embedding(num_words, embed_dim,input_length = X.shape[1], trainable=False, weights=[embedding_matrix])(inp_goods)
x_goods = SpatialDropout1D(0.3)(x_goods)
x_goods = LSTM(lstm_out, return_sequences=True)(x_goods)
x_goods = LSTM(100)(x_goods)
x_goods = Dropout(0.2)(x_goods)
dens_goods = Dense(30)(x_goods)
outp_goods = Dense(len(le_goods.classes_), activation="softmax")(dens_goods)

model_goods = Model(inputs=inp_goods, outputs=outp_goods)
model_goods.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# --- service model (same architecture) -------------------------------------
inp_service = Input(shape=(maxlen, ))

x_service = Embedding(num_words, embed_dim,input_length = X.shape[1], trainable=False, weights=[embedding_matrix])(inp_service)
x_service = SpatialDropout1D(0.3)(x_service)
x_service = LSTM(lstm_out, return_sequences=True)(x_service)
x_service = LSTM(100)(x_service)
x_service = Dropout(0.2)(x_service)
dens_service = Dense(30)(x_service)
# softmax, like the goods model: the categories are mutually exclusive and the
# sparse categorical cross-entropy loss expects one probability distribution
# per sample, which independent sigmoid units do not provide.
outp_service = Dense(len(le_service.classes_), activation="softmax")(dens_service)

model_service = Model(inputs=inp_service, outputs=outp_service)
model_service.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Each model is trained on its own share of the invoices and validated on the
# matching share of the test rows.
model_goods.fit(X[invoice_data.is_ebay.values],y_goods,batch_size =batch_size,validation_data=(X_test[test_informative.category.isin(le_goods.classes_)], y_test_goods), epochs = 5,  verbose = 1)

model_service.fit(X[~invoice_data.is_ebay.values],y_service,batch_size =batch_size,validation_data=(X_test[test_informative.category.isin(le_service.classes_)], y_test_service), epochs =8,  verbose = 1)

In [None]:
# Build the eBay transferred model: same architecture as the invoice goods
# model, trained on eBay item names labelled with the goods categories.
# Only eBay rows whose 'vertical' is a class known to le_goods are used.
X_ebay = tokenizer.texts_to_sequences(ebay_data['text'][ebay_data['vertical'].isin(le_goods.classes_)].values)
X_ebay = pad_sequences(X_ebay,maxlen=maxlen)
Y_ebay =le_goods.transform(ebay_data['vertical'][ebay_data['vertical'].isin(le_goods.classes_)].astype(str)).reshape(-1, 1)

inp_ebay = Input(shape=(maxlen, ))

# Frozen embedding layer initialised from embedding_matrix.
x_ebay = Embedding(num_words, embed_dim,input_length = X.shape[1], trainable=False, weights=[embedding_matrix])(inp_ebay)
x_ebay = SpatialDropout1D(0.3)(x_ebay)
x_ebay = LSTM(lstm_out, return_sequences=True)(x_ebay)
x_ebay = LSTM(100)(x_ebay)
x_ebay = Dropout(0.2)(x_ebay)
dens_ebay = Dense(30)(x_ebay)
# softmax: the categories are mutually exclusive and the sparse categorical
# cross-entropy loss expects one probability distribution per sample, which
# independent sigmoid units do not provide.
outp_ebay = Dense(len(le_goods.classes_), activation="softmax")(dens_ebay)

model_ebay = Model(inputs=inp_ebay, outputs=outp_ebay)
model_ebay.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

# Validated on the goods rows of the invoice test data.
model_ebay.fit(X_ebay,Y_ebay,batch_size =batch_size,validation_data=(X_test[test_informative.category.isin(le_goods.classes_)], y_test_goods), epochs =7,  verbose = 1)


In [None]:
# Embedding sub-models: each one reuses the input of a trained classifier and
# stops at its Dense(30) layer, so predict() returns that layer's activations
# instead of class probabilities.
emb_model_goods = Model(inputs=inp_goods, outputs=dens_goods)
emb_model_service = Model(inputs=inp_service, outputs=dens_service)
emb_model_ebay = Model(inputs=inp_ebay, outputs=dens_ebay)



In [None]:
# Extract the transferred embeddings for the manually labelled (MTurk) data,
# for the test data and for the full invoice data.
_embedders = (emb_model_goods, emb_model_service, emb_model_ebay)

X_finetune = pad_sequences(
    tokenizer.texts_to_sequences(invoice_train_manual['text']), maxlen=maxlen)
y_finetune = invoice_train_manual['category']

X_test = pad_sequences(
    tokenizer.texts_to_sequences(invoice_test['text']), maxlen=maxlen)
y_test = invoice_test['category']

# Manually labelled training part.
goods_emb, service_emb, ebay_emb = (
    embedder.predict(X_finetune) for embedder in _embedders)

# Test part.
test_goods_emb, test_service_emb, test_ebay_emb = (
    embedder.predict(X_test) for embedder in _embedders)

# Full invoice data, predicted in large batches with a progress display.
invoice_goods_emb, invoice_service_emb, invoice_ebay_emb = (
    embedder.predict(X, batch_size=5000, verbose=1) for embedder in _embedders)


In [None]:
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")


def _fit_and_score(model, features, labels, train_mask, test_mask, slot):
    """Fit ``model`` on one training fold and record its test-fold metrics.

    Prints the accuracy and the weighted F1 of the test fold and appends them
    to the module-level lists ``res[slot]`` and ``res2[slot]``.

    Args:
        model: Unfitted scikit-learn classifier; it is fitted in place.
        features: 2-D array with one row per sample.
        labels: 1-D array of class labels aligned with ``features``.
        train_mask: Boolean array selecting the training rows.
        test_mask: Boolean array selecting the test rows.
        slot: Position in ``res`` / ``res2`` that receives the metrics.

    Returns:
        The predictions of the fitted model on the test rows.
    """
    model.fit(features[train_mask], labels[train_mask])
    accuracy = model.score(features[test_mask], labels[test_mask])
    print(accuracy)
    predictions = model.predict(features[test_mask])
    weighted_f1 = f1_score(labels[test_mask], predictions, average='weighted')
    print(weighted_f1)
    res[slot].append(accuracy)
    res2[slot].append(weighted_f1)
    return predictions


# res[k] collects the per-fold accuracy and res2[k] the per-fold weighted F1 of
# configuration k: 0 goods, 1 service, 2 eBay, 3 autoencoder, 4 invoice,
# 5 all embeddings, 6 stacked probabilities, 7 eBay+invoice,
# 8 autoencoder+invoice, 9 autoencoder+eBay.
res = [[] for _ in range(10)]
res2 = [[] for _ in range(10)]

# The embeddings do not depend on the fold, so they are computed once.
X_finetune = tokenizer.texts_to_sequences(invoice_test['text'].values)
X_finetune = pad_sequences(X_finetune, maxlen=maxlen)
y_finetune = invoice_test['category']
labels = y_finetune.values

goods_emb = emb_model_goods.predict(X_finetune)
service_emb = emb_model_service.predict(X_finetune)
ebay_emb = emb_model_ebay.predict(X_finetune)
auto_emb = encoder_model.predict(X_finetune)
# NOTE(review): emb_model_invoice is not created by any cell visible in this
# notebook -- confirm that it is defined before this cell runs.
invoice_emb = emb_model_invoice.predict(X_finetune)

emb_train = np.concatenate((goods_emb, service_emb, ebay_emb, auto_emb), axis=1)
emb_auto_invoice = np.concatenate((goods_emb, service_emb, auto_emb), axis=1)
emb_auto_ebay = np.concatenate((ebay_emb, auto_emb), axis=1)
emb_ebay_invoice = np.concatenate((goods_emb, service_emb, ebay_emb), axis=1)

# Goods flag per row; rows without a flag count as not-goods.
is_goods = invoice_test.is_ebay.fillna(False).astype(bool).values

kf = StratifiedKFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(invoice_test, invoice_test.category):
    print('------iter------')

    # kf.split() yields positional row numbers. invoice_test carries index
    # labels that are neither contiguous nor unique after the earlier filtering
    # and concatenation, so the masks are built by position instead of by
    # matching the positions against the index labels.
    ind_train_all = np.zeros(len(invoice_test), dtype=bool)
    ind_train_all[train_index] = True
    ind_test_all = np.zeros(len(invoice_test), dtype=bool)
    ind_test_all[test_index] = True

    ind_train_goods = is_goods & ind_train_all
    ind_test_goods = is_goods & ind_test_all
    ind_train_service = ~is_goods & ind_train_all
    ind_test_service = ~is_goods & ind_test_all

    # --- one classifier per single embedding --------------------------------
    lr_goods = MLPClassifier()
    lr_goods_pred = _fit_and_score(lr_goods, goods_emb, labels, ind_train_all, ind_test_all, 0)

    lr_service = MLPClassifier()
    lr_service_pred = _fit_and_score(lr_service, service_emb, labels, ind_train_all, ind_test_all, 1)

    lr_ebay = MLPClassifier()
    lr_ebay_pred = _fit_and_score(lr_ebay, ebay_emb, labels, ind_train_all, ind_test_all, 2)

    lr_auto = MLPClassifier()
    lr_auto_pred = _fit_and_score(lr_auto, auto_emb, labels, ind_train_all, ind_test_all, 3)

    lr_invoice = MLPClassifier()
    lr_invoice_pred = _fit_and_score(lr_invoice, invoice_emb, labels, ind_train_all, ind_test_all, 4)

    # --- all embeddings concatenated ----------------------------------------
    lr_emb = LogisticRegression()
    lr_emb_pred = _fit_and_score(lr_emb, emb_train, labels, ind_train_all, ind_test_all, 5)

    # --- stacking: class probabilities of three base models as features ------
    lr_meta = MLPClassifier()
    meta = np.concatenate([lr_invoice.predict_proba(invoice_emb),
                           lr_ebay.predict_proba(ebay_emb),
                           lr_auto.predict_proba(auto_emb)], axis=1)
    lr_meta_pred = _fit_and_score(lr_meta, meta, labels, ind_train_all, ind_test_all, 6)

    # --- pairwise embedding combinations ------------------------------------
    lr_emb_ebay_invoice = LogisticRegression()
    lr_emb_ebay_invoice_pred = _fit_and_score(
        lr_emb_ebay_invoice, emb_ebay_invoice, labels, ind_train_all, ind_test_all, 7)

    lr_emb_auto_incoice = LogisticRegression()
    lr_emb_auto_invoice_pred = _fit_and_score(
        lr_emb_auto_incoice, emb_auto_invoice, labels, ind_train_all, ind_test_all, 8)

    lr_emb_auto_ebay = LogisticRegression()
    lr_emb_auto_ebay_pred = _fit_and_score(
        lr_emb_auto_ebay, emb_auto_ebay, labels, ind_train_all, ind_test_all, 9)


