In [1]:
import numpy as np
import pandas as pd

In [6]:
# Load the tab-separated results file. Column names are supplied explicitly,
# so every line of the file (including the first) is read as a data row.
df = pd.read_csv(
    'output.txt',
    sep='\t',
    header=None,
    names=['translation', 'WER'],
)
# Preview the first rows as a sanity check on the parse.
df.head()

Unnamed: 0,translation,WER
0,"кто не работает, тот не ест.",0.0
1,он *працуе в банке.,25.0
2,я *чытаў этот роман весь день.,16.67
3,яна продолжила работать.,66.67
4,мой отец часто *апавядаў зам о свои школьные *...,55.56


In [13]:
# Model input: the raw text of the 'translation' column.
X = df['translation']
# Regression target: the 'WER' column with float() applied to every value.
y = df['WER'].map(float)

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

# Two TF-IDF views of every sentence:
#   - word n-grams spanning 1 to 5 tokens
#   - character n-grams spanning 2 to 6 characters
word_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 5),
)
char_vect = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 6),
)

# NOTE(review): both vectorizers are fitted on the complete corpus `X`, i.e.
# before the train/test split performed further down. Vocabulary and IDF
# weights therefore also see the held-out sentences, which can make the
# reported test error optimistic. Consider splitting first and calling
# fit_transform on the training part and transform on the test part.
X_vect_word = word_vect.fit_transform(X)
X_vect_char = char_vect.fit_transform(X)

# Concatenate the two sparse feature blocks column-wise (same rows, more columns).
X_vect = sparse.hstack([X_vect_word, X_vect_char])
X_vect.shape

(1667, 109392)

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vect,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error as mse
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor. No hidden_layer_sizes is passed, so the
# library default architecture is used. Training uses the Adam solver for at
# most 200 iterations with a fixed seed; verbose=True prints the loss after
# every iteration.
mlp = MLPRegressor(
    solver='adam',
    max_iter=200,
    random_state=42,
    verbose=True,
)

# Train on the training split; as the last expression, the fitted estimator
# is also echoed as the cell output.
mlp.fit(X_train, y_train)

Iteration 1, loss = 1059.29939423
Iteration 2, loss = 1041.22631075
Iteration 3, loss = 1020.08878670
Iteration 4, loss = 996.35815669
Iteration 5, loss = 970.24543272
Iteration 6, loss = 941.23246934
Iteration 7, loss = 909.65314684
Iteration 8, loss = 875.77321320
Iteration 9, loss = 839.43061267
Iteration 10, loss = 802.10482801
Iteration 11, loss = 763.17633007
Iteration 12, loss = 723.16459118
Iteration 13, loss = 683.35653943
Iteration 14, loss = 643.37381510
Iteration 15, loss = 605.05005349
Iteration 16, loss = 567.81104544
Iteration 17, loss = 531.25363444
Iteration 18, loss = 497.00795431
Iteration 19, loss = 465.20632514
Iteration 20, loss = 435.34506969
Iteration 21, loss = 407.27167971
Iteration 22, loss = 381.13535202
Iteration 23, loss = 356.95742342
Iteration 24, loss = 335.00841724
Iteration 25, loss = 314.17447714
Iteration 26, loss = 294.36209892
Iteration 27, loss = 275.02078291
Iteration 28, loss = 256.87475678
Iteration 29, loss = 239.27918215
Iteration 30, loss =

In [63]:
# Predict the target for the held-out rows.
y_pred = mlp.predict(X_test)

# Root-mean-squared error between the true and predicted values
# (square root of the mean squared error, so it is in the units of `y`).
score = sqrt(mse(y_true=y_test, y_pred=y_pred))
print(score)

23.599058727916447


In [64]:
# Eyeball the first five true targets against the first five predictions.
# `.iloc` makes the positional nature of the slice explicit.
print(y_test.iloc[:5])
print(y_pred[:5])

940     20.00
987     20.00
405      0.00
721     66.67
1612    25.00
Name: WER, dtype: float64
[ 29.46628404  25.03106322  21.36611111  58.37441193  11.55677338]
