Соревнование: https://www.kaggle.com/c/phonetics-en-2021

In [None]:
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import Colormap, ListedColormap
import pandas as pd
import time
from keras.datasets import mnist
import torch
import torchvision
from time import time
from torchvision import datasets, transforms
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from google.colab import files
from zipfile import ZipFile
from functools import partial
from dataclasses import dataclass
from collections import OrderedDict
from zipfile import ZipFile as zip
from torchsummary import summary
from typing import Type, Any, Callable, Union, List, Optional
from torch import Tensor
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

In [None]:
# Mount Google Drive at /content/gdrive; the training and test files used
# below are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Reserved vocabulary indices (Transcription.index2sym maps 0 -> "SOS", 1 -> "EOS").
SOS_token = 0  # fed to the decoder as its very first input
EOS_token = 1  # appended to every encoded word; decoding stops when it is predicted


class Transcription:
    """Vocabulary of space-separated symbols with index and frequency tables.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
    first real symbol that gets registered receives index 2.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.sym2index = {}
        self.sym2count = {}
        self.index2sym = {0: "SOS", 1: "EOS"}
        self.n_syms = 2  # SOS and EOS are already counted

    def addWord(self, word):
        """Register every symbol of ``word`` (symbols are split on single spaces)."""
        for token in word.split(' '):
            self.addSym(token)

    def addSym(self, sym):
        """Add ``sym`` under the next free index, or bump its count if known."""
        if sym in self.sym2index:
            self.sym2count[sym] += 1
            return

        fresh_index = self.n_syms
        self.sym2index[sym] = fresh_index
        self.index2sym[fresh_index] = sym
        self.sym2count[sym] = 1
        self.n_syms = fresh_index + 1

In [None]:
def readPairs(word, trans, path='/content/gdrive/My Drive/train.txt'):
    """Load the training pairs and create two empty vocabularies.

    Each non-blank line of the file is split on single spaces; the first
    field is the spelling and the second is the transcription, whose
    phonemes are joined by underscores.

    Args:
        word: name given to the input (spelling) vocabulary.
        trans: name given to the output (transcription) vocabulary.
        path: location of the training file; defaults to the copy on the
            mounted Google Drive.

    Returns:
        ``(input_vocab, output_vocab, pairs)`` where both vocabularies are
        still empty ``Transcription`` objects and ``pairs`` is a list of
        ``[spelling, transcription]`` strings with symbols separated by
        single spaces (e.g. ``['A B O', 'AA B OW']``).

    Raises:
        IndexError: if a non-blank line has no second field.
    """
    # The context manager closes the handle; it used to be left open.
    with open(path, encoding='utf-8') as train_file:
        lines = train_file.read().strip().split('\n')

    pairs = []
    for line in lines:
        if not line:
            # Blank lines (or an empty file) carry no pair; skip them
            # instead of failing on the missing second field.
            continue
        fields = line.split(' ')
        # ' '.join over a string puts a space between its characters.
        pairs.append([' '.join(fields[0]), fields[1].replace('_', ' ')])

    return Transcription(word), Transcription(trans), pairs

In [None]:
def prepareData(word, trans):
    """Read the training pairs and fill both vocabularies from them.

    Args:
        word: display name of the input (spelling) vocabulary.
        trans: display name of the output (transcription) vocabulary.

    Returns:
        ``(input_vocab, output_vocab, pairs)`` as produced by ``readPairs``,
        with every symbol of every pair registered in its vocabulary.
        The size of each vocabulary (including SOS/EOS) is printed.
    """
    src_vocab, dst_vocab, pairs = readPairs(word, trans)

    for spelling, transcription in pairs:
        src_vocab.addWord(spelling)
        dst_vocab.addWord(transcription)

    print("Counted words:")
    for vocab in (src_vocab, dst_vocab):
        print(vocab.name, vocab.n_syms)
    return src_vocab, dst_vocab, pairs

# Build both vocabularies from the training file and show one random pair.
# NOTE(review): 'transcriprion' looks like a typo for 'transcription'; in the
# visible code it is only used as the printed vocabulary name.
input_lang, output_lang, pairs = prepareData('word', 'transcriprion')
print(random.choice(pairs))

Counted words:
word 30
transcriprion 41
['A B O', 'AA B OW']


In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that is fed one symbol index per call."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: number of rows in the embedding table (vocabulary size).
            hidden_size: width of both the embedding and the GRU state.
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, run one GRU step and return ``(output, hidden)``.

        The embedding is reshaped to ``(1, 1, -1)`` before the GRU, i.e. it is
        treated as a single time step of a single-item batch.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Plain (attention-free) GRU decoder producing log-probabilities per step."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: width of the embedding and of the GRU state.
            output_size: number of output symbols (rows of the embedding and
                columns of the final projection).
        """
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has one row of
            ``output_size`` log-probabilities and ``hidden`` is the new state.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, next_hidden = self.gru(embedded, hidden)
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-size buffer of encoder outputs.

    The attention layer emits ``max_length`` weights, so ``encoder_outputs``
    passed to ``forward`` must have exactly ``max_length`` rows.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=75):
        """
        Args:
            hidden_size: width of the embedding, GRU state and encoder outputs.
            output_size: number of output symbols.
            dropout_p: dropout probability applied to the embedded input.
            max_length: number of encoder positions the attention spans.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(log_probs, hidden, attn_weights)``: log-probabilities over the
            output symbols, the new GRU state, and the softmax-normalised
            attention weights over the ``max_length`` encoder positions.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))
        token_vec = embedded[0]

        # Attention scores depend on the current input and the previous state.
        attn_scores = self.attn(torch.cat((token_vec, hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = self.attn_combine(torch.cat((token_vec, context[0]), 1))
        gru_in = F.relu(combined.unsqueeze(0))
        gru_out, next_hidden = self.gru(gru_in, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, next_hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

In [None]:
def indexesFromWord(trans, word):
    """Map each space-separated symbol of ``word`` to its index in ``trans``.

    Raises:
        KeyError: if a symbol is missing from ``trans.sym2index``.
    """
    lookup = trans.sym2index
    indexes = []
    for symbol in word.split(' '):
        indexes.append(lookup[symbol])
    return indexes

def tensorFromWord(trans, word):
    """Encode ``word`` as a column tensor of symbol indices ending with EOS.

    Returns:
        A ``torch.long`` tensor on ``device`` reshaped to one index per row.
    """
    indexes = indexesFromWord(trans, word) + [EOS_token]
    column = torch.tensor(indexes, dtype=torch.long, device=device)
    return column.view(-1, 1)

def tensorsFromPair(pair):
    """Encode a ``[spelling, transcription]`` pair as ``(input, target)`` tensors.

    The spelling is encoded with the global ``input_lang`` vocabulary and the
    transcription with the global ``output_lang`` vocabulary.
    """
    spelling, transcription = pair[0], pair[1]
    return (tensorFromWord(input_lang, spelling),
            tensorFromWord(output_lang, transcription))

In [None]:
# Probability that a training step feeds the decoder the ground-truth symbol
# instead of its own previous prediction.
teacher_forcing_ratio = 0.7


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=75):
    """Run one optimisation step on a single (input, target) pair.

    Args:
        input_tensor: encoded spelling, indexed along its first dimension.
        target_tensor: encoded transcription, indexed along its first dimension.
        encoder, decoder: the modules being trained; the decoder is called with
            ``(input, hidden, encoder_outputs)`` and must return three values.
        encoder_optimizer, decoder_optimizer: optimisers stepped once each.
        criterion: loss applied to each decoder output and its target symbol.
        max_length: number of rows in the encoder-output buffer; inputs longer
            than this do not fit into the buffer.

    Returns:
        The summed loss of the step divided by the target length.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encode the input symbol by symbol, keeping every step's output.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_tensor.size(0)):
        step_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = step_output[0, 0]

    # The decoder starts from SOS and from the encoder's final state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    # One coin flip decides the feeding strategy for the whole sequence.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        expected = target_tensor[di]
        loss += criterion(decoder_output, expected)

        if use_teacher_forcing:
            # Teacher forcing: the ground truth becomes the next input.
            decoder_input = expected
            continue

        # Free running: feed back the most likely symbol, detached from the
        # graph, and stop as soon as the model predicts EOS.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``.

    Fractional seconds are truncated by the ``%d`` conversion.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed time and the estimated time still remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work already done; must be non-zero because
            the elapsed time is divided by it.

    Returns:
        A string of the form ``'<elapsed> (- <remaining>)'``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: estimated total = elapsed / fraction done.
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=500, learning_rate=0.01):
    """Train the encoder/decoder with plain SGD on ``n_iters`` random pairs.

    Args:
        encoder, decoder: modules to optimise in place.
        n_iters: number of single-pair training steps to run.
        print_every: a progress line is printed each time this many steps
            have been completed.
        learning_rate: learning rate of both SGD optimisers.

    Returns:
        None. Each progress line shows the elapsed and estimated remaining
        time, the step number, the percentage done and the mean loss since
        the previous report.
    """
    start = time.time()
    print_loss_total = 0  # running sum, reset after every report

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Pairs are drawn with replacement from the global training list and
    # encoded up front, before the first optimisation step.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    # `step` is 1-based so that the modulo test and the percentage line up.
    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        print_loss_total += train(input_tensor, target_tensor, encoder,
                                  decoder, encoder_optimizer, decoder_optimizer, criterion)

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                         step, step / n_iters * 100, print_loss_avg))


In [None]:
def evaluate(encoder, decoder, sentence, max_length=75):
    """Greedily decode the transcription of one word.

    Args:
        encoder, decoder: trained modules; the decoder is called with
            ``(input, hidden, encoder_outputs)`` and must return three values.
        sentence: input word with its symbols separated by single spaces.
        max_length: size of the encoder-output buffer and upper bound on the
            number of decoding steps.

    Returns:
        ``(decoded_words, attentions)``: the predicted symbols (without EOS)
        and the attention rows of the decoding steps that were executed.
    """
    # Inference must not apply dropout. Switch both modules to eval mode and
    # restore whatever mode they were in afterwards, so that training can
    # continue unchanged after an evaluation.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            input_tensor = tensorFromWord(input_lang, sentence)
            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                predicted = topi.item()
                if predicted == EOS_token:
                    break
                decoded_words.append(output_lang.index2sym[predicted])

                # Greedy decoding: the best symbol becomes the next input.
                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [None]:
def evaluateRandomly(encoder, decoder, n=10000):
    """Decode ``n`` randomly drawn pairs and report the exact-match count.

    For every sample the spelling, the reference transcription and the
    predicted transcription (phonemes joined by underscores) are printed,
    followed by a blank line. The samples come from the global ``pairs``
    list, i.e. the training data, so the count is not a held-out score.

    Args:
        encoder, decoder: trained modules passed on to ``evaluate``.
        n: number of pairs to draw (with replacement).
    """
    count = 0
    for _ in range(n):
        pair = random.choice(pairs)
        expected = pair[1].replace(' ', "_")
        print(pair[0].replace(' ', ''))
        print(expected)
        output_syms, _attentions = evaluate(encoder, decoder, pair[0])
        output_word = '_'.join(output_syms)
        if expected == output_word:
            count += 1
        print(output_word)
        print('')
    # Report against the real sample size; the message used to say 10000
    # even when a different ``n`` was requested.
    print('Верно:', count, 'слов из', n)

In [None]:
# Number of training pairs that were loaded.
len(pairs)

83194

In [None]:
# Build the models before the first training run. The construction used to be
# disabled inside a string literal, so on a fresh kernel `encoder1` and
# `decoder1` were undefined. Re-running this cell re-initialises the weights;
# use the later training cells to continue training the existing models.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_syms, hidden_size).to(device)
decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_syms, dropout_p=0.1).to(device)

trainIters(encoder1, decoder1, 5000)

0m 14s (- 40m 49s) (500 0%) 2.8512
0m 25s (- 34m 50s) (1000 1%) 2.7714
0m 35s (- 32m 7s) (1500 1%) 2.6945
0m 45s (- 30m 54s) (2000 2%) 2.6874
0m 55s (- 29m 59s) (2500 3%) 2.6029
1m 5s (- 29m 12s) (3000 3%) 2.4654
1m 15s (- 28m 41s) (3500 4%) 2.2308
1m 25s (- 28m 18s) (4000 4%) 2.1642
1m 36s (- 28m 5s) (4500 5%) 2.0194
1m 46s (- 27m 45s) (5000 6%) 1.8399
1m 56s (- 27m 28s) (5500 6%) 1.7879
2m 7s (- 27m 17s) (6000 7%) 1.6792
2m 17s (- 27m 6s) (6500 7%) 1.5991
2m 28s (- 26m 53s) (7000 8%) 1.5376
2m 39s (- 26m 49s) (7500 9%) 1.5382
2m 50s (- 26m 39s) (8000 9%) 1.4118
3m 0s (- 26m 28s) (8500 10%) 1.3961
3m 11s (- 26m 17s) (9000 10%) 1.4098
3m 21s (- 26m 4s) (9500 11%) 1.3008
3m 32s (- 25m 51s) (10000 12%) 1.3421
3m 42s (- 25m 37s) (10500 12%) 1.2330
3m 52s (- 25m 24s) (11000 13%) 1.2326
4m 2s (- 25m 14s) (11500 13%) 1.2948
4m 13s (- 25m 1s) (12000 14%) 1.1969
4m 23s (- 24m 50s) (12500 15%) 1.1915
4m 34s (- 24m 40s) (13000 15%) 1.1853
4m 44s (- 24m 30s) (13500 16%) 1.1401
4m 55s (- 24m 18s) 

In [None]:
# Continue training the same models for another 20000 iterations.
trainIters(encoder1, decoder1, 20000)

0m 12s (- 8m 5s) (500 2%) 0.5034
0m 23s (- 7m 23s) (1000 5%) 0.5359
0m 33s (- 6m 53s) (1500 7%) 0.5226
0m 43s (- 6m 33s) (2000 10%) 0.4965
0m 54s (- 6m 20s) (2500 12%) 0.4582
1m 5s (- 6m 8s) (3000 15%) 0.5443
1m 15s (- 5m 56s) (3500 17%) 0.5205
1m 26s (- 5m 45s) (4000 20%) 0.5056
1m 36s (- 5m 34s) (4500 22%) 0.5522
1m 47s (- 5m 22s) (5000 25%) 0.4921
1m 58s (- 5m 11s) (5500 27%) 0.4934
2m 9s (- 5m 1s) (6000 30%) 0.5428
2m 20s (- 4m 51s) (6500 32%) 0.5672
2m 30s (- 4m 39s) (7000 35%) 0.5393
2m 41s (- 4m 28s) (7500 37%) 0.5131
2m 51s (- 4m 17s) (8000 40%) 0.5523
3m 2s (- 4m 6s) (8500 42%) 0.4874
3m 13s (- 3m 55s) (9000 45%) 0.5388
3m 23s (- 3m 45s) (9500 47%) 0.6052
3m 34s (- 3m 34s) (10000 50%) 0.4958
3m 44s (- 3m 23s) (10500 52%) 0.5242
3m 55s (- 3m 13s) (11000 55%) 0.5738
4m 6s (- 3m 2s) (11500 57%) 0.4944
4m 17s (- 2m 51s) (12000 60%) 0.4466
4m 27s (- 2m 40s) (12500 62%) 0.4663
4m 38s (- 2m 29s) (13000 65%) 0.4443
4m 48s (- 2m 18s) (13500 67%) 0.4901
4m 59s (- 2m 8s) (14000 70%) 0.46

In [None]:
# One more round of 20000 iterations on the same models.
trainIters(encoder1, decoder1, 20000)

0m 11s (- 7m 46s) (500 2%) 0.4689
0m 22s (- 7m 11s) (1000 5%) 0.5135
0m 33s (- 6m 52s) (1500 7%) 0.4851
0m 44s (- 6m 36s) (2000 10%) 0.5237
0m 54s (- 6m 22s) (2500 12%) 0.5028
1m 5s (- 6m 10s) (3000 15%) 0.5062
1m 16s (- 5m 59s) (3500 17%) 0.5062
1m 26s (- 5m 47s) (4000 20%) 0.4796
1m 37s (- 5m 36s) (4500 22%) 0.5140
1m 47s (- 5m 23s) (5000 25%) 0.5094
1m 58s (- 5m 12s) (5500 27%) 0.5567
2m 9s (- 5m 1s) (6000 30%) 0.5628
2m 20s (- 4m 51s) (6500 32%) 0.5315
2m 30s (- 4m 39s) (7000 35%) 0.4901
2m 41s (- 4m 28s) (7500 37%) 0.5137
2m 51s (- 4m 17s) (8000 40%) 0.5749
3m 1s (- 4m 6s) (8500 42%) 0.5137
3m 12s (- 3m 55s) (9000 45%) 0.5089
3m 22s (- 3m 43s) (9500 47%) 0.4546
3m 33s (- 3m 33s) (10000 50%) 0.5209
3m 43s (- 3m 22s) (10500 52%) 0.5277
3m 54s (- 3m 11s) (11000 55%) 0.5377
4m 5s (- 3m 1s) (11500 57%) 0.5984
4m 15s (- 2m 50s) (12000 60%) 0.4937
4m 26s (- 2m 39s) (12500 62%) 0.5486
4m 36s (- 2m 29s) (13000 65%) 0.5474
4m 47s (- 2m 18s) (13500 67%) 0.4934
4m 58s (- 2m 7s) (14000 70%) 0.

In [None]:
# Spot-check the model on randomly drawn training pairs (default sample size).
evaluateRandomly(encoder1, decoder1)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
G_OW_B_IY
G_OW_B_IY

WILTSIE
W_IH_L_T_S_IY
W_IH_L_S_T_IY

HARGRAVES
HH_AA_R_G_R_EY_V_Z
HH_AA_R_G_R_EY_V_Z

LINGERING
L_IH_NG_G_ER_IH_NG
L_IH_NG_G_ER_IH_NG

PRESSBURGER
P_R_EH_S_B_ER_G_ER
P_R_EH_S_B_AH_G_ER

CONSUMED
K_AH_N_S_UW_M_D
K_AA_N_S_AH_M_D

DORAVILLE
D_AO_R_AH_V_IH_L
D_AO_R_EY_V_ER_L_L_L

PROCUREMENT
P_R_OW_K_Y_UH_R_M_AH_N_T
P_R_OW_K_ER_R_M_AH_N_T

WATTERS
W_AA_T_ER_Z
W_AE_T_ER_Z

UNDERSECRETARY
AH_N_D_ER_S_EH_K_R_IH_T_EH_R_IY
AH_N_D_ER_S_EH_K_T_R_IY_IY_IY

EX-GIRLFRIEND
EH_K_Z_G_ER_L_F_R_EH_N_D
EH_G_S_F_EH_R_AH_AH_N_D

HOLLAR
HH_AA_L_ER
HH_AA_L_ER

DECAYING
D_IH_K_EY_IH_NG
D_IH_K_EY_IH_NG

CHILDCARE
CH_AY_L_D_K_EH_R
K_IH_L_D_K_R

MCINTOSH
M_AE_K_AH_N_T_AO_SH
M_IH_K_IH_N_SH_AO

BUTTE
B_Y_UW_T
B_AH_T

HICKOK
HH_IH_K_AH_K
HH_IH_K_AH_K

NOLA
N_OW_L_AH
N_OW_L_AA

OUTS
AW_T_S
AW_T_S

UNHOOK
AH_N_HH_UH_K
AH_N_HH_UH_K

GIESEKE
G_IY_S_IH_K
JH_IY_S_K

ALPA
AE_L_P_AH
AA_L_P_AH

SPEECHWRITER
S_P_IY_CH_R_AY_T

In [None]:
# Load the test set; the 'Word' column holds the spellings to transcribe.
# NOTE(review): `.values` may return a view of the column, so in-place edits
# of `test_w` can also change `test` - confirm before relying on `test['Word']`.
test = pd.read_csv('/content/gdrive/My Drive/test.csv')
test_w = test['Word'].values

In [None]:
# Number of test words to transcribe.
len(test_w)

41597

In [None]:
# Separate the letters of every test word with single spaces, the format that
# evaluate() expects. A new array is built from the DataFrame column instead of
# rewriting test_w in place: the in-place version was not idempotent (running
# the cell twice put spaces between the spaces).
test_w = np.array([' '.join(word) for word in test['Word']], dtype=object)

In [None]:
# Preview the space-separated test words.
test_w

array(['P I T C H E D', 'D I S S O L V E R S', 'S C R A W N Y', ...,
       'S C O G I N', 'H E S S I O N', 'T A R N O W S K I'], dtype=object)

In [None]:
def evaluate_test(encoder, decoder):
    """Transcribe every entry of the global ``test_w`` and collect the results.

    Each input and its prediction are printed, followed by a blank line.

    Returns:
        A list with one predicted transcription per entry of ``test_w``,
        phonemes joined by underscores.
    """
    res = []
    for spelled in test_w:
        print(spelled)
        syms, _ = evaluate(encoder, decoder, spelled)
        predicted = '_'.join(syms)
        res.append(predicted)
        print(predicted)
        print('')
    return res

In [None]:
# Transcribe the whole test set (prints every word and its prediction).
res_w = evaluate_test(encoder1, decoder1)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
B_EH_R_EH_T_OW

P A I R I N G
P_EH_R_IH_NG

L E P P A R D
L_EH_P_ER_D

A L S U P
AH_L_S_UW_P

G R I M A C E S
G_R_IH_M_AH_S_IH_Z

C H R Y S A N T H E M U M
K_R_IH_S_AH_N_TH_UW_M_M

P A N T I E S
P_AE_N_T_IY_Z

R O U L E T T E
R_UW_L_EH_T

B A N K N O T E S
B_AE_N_K_OW_N_T_S

M C G A V I N
M_AH_G_EY_V_AH_N

M O S K A T E L ' S
M_AA_S_K_AH_T_AH_L_Z

G R U N T E D
G_R_AH_N_T_IH_D

A L C A T E L
AE_L_K_EY_T_AH_L

F L U N K E D
F_L_AH_NG_K_T

C O J I M A R ' S
K_OW_JH_IY_M_ER_Z

P R E S E A S O N
P_R_IH_Z_IH_S_AH_N

G O N G S
G_AA_NG_Z

E L E C T S
IH_L_EH_K_T_S

D I F F I C U L T I E S
D_IH_F_IH_K_Y_UW_L_IY_IY_Z

P L A U S I B I L I T Y
P_L_AO_S_IH_B_IH_L_IH_T_IY

P L U N K I T T
P_L_AH_N_K_IH_T

R O C K E R
R_AA_K_ER

P R A B
P_R_AE_B

J U L I A N A
JH_UW_L_IY_IY_N_AH

T O M B
T_AA_M

G O L D F I N C H
G_OW_L_D_F_IH_N_F

D U R A B L E
D_UH_R_AH_B_AH_L

S A T H E R
S_AE_DH_ER

R O D I C K
R_AA_D_IH_K

F R E T

In [None]:
# Submission table: one predicted transcription per test row, indexed by a
# 1-based integer "Id".
result = pd.DataFrame(
    {'Transcription': res_w},
    index=pd.Index(np.arange(1, len(test) + 1), name='Id'),
)

In [None]:
# Preview the submission table.
result

Unnamed: 0_level_0,Transcription
Id,Unnamed: 1_level_1
1,P_IH_CH_T
2,D_IH_S_AA_L_V_ER_Z
3,S_K_R_AO_N_IY
4,B_AA_N_EH_F_AA_N_T
5,IH_K_S_IY_D_Z
...,...
41593,IH_N_AA_K_Y_AH_L_EY_SH_AH_N
41594,AH_N_T_OW
41595,S_K_AA_G_IH_N
41596,HH_EH_SH_AH_N


In [None]:
# Write the submission to new.csv in the working directory, then hand the
# file to Colab's download helper.
result.to_csv('new.csv')
files.download("new.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>