In [1]:
from sklearn import preprocessing, neighbors, linear_model, multioutput
import tensorflow as tf
import pycountry
import os
import numpy as np
import argparse
import lang2vec.lang2vec as l2v

In [2]:
# Ask TensorFlow for the list of physical GPU devices it can see.
gpus = tf.config.experimental.list_physical_devices("GPU")

In [3]:
# Display the detected GPU devices as this cell's output.
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]

In [4]:
# Index (into `gpus`) of the device this notebook should run on. Kept as a
# named constant so it can be changed in one place on a different machine.
GPU_INDEX = 2

if gpus:
    if GPU_INDEX < len(gpus):
        try:
            # Make only the selected GPU visible to TensorFlow.
            tf.config.experimental.set_visible_devices(gpus[GPU_INDEX], "GPU")
        except Exception as e:
            # Best effort: report the problem and continue without
            # restricting the visible devices.
            print(e)
    else:
        # Explicit message instead of a swallowed "list index out of range".
        print("GPU index {} is not available: only {} GPU(s) detected".format(GPU_INDEX, len(gpus)))

In [5]:
class Classifyer(tf.keras.Model):
    """Feed-forward classifier made of three stacked dense layers.

    The first two layers use a sigmoid activation, the last one a softmax.

    Args:
        units1: number of units of the first dense layer.
        units2: number of units of the second dense layer.
        units3: number of units of the output (softmax) layer.
    """

    def __init__(self, units1, units2, units3):
        super().__init__()
        sigmoid = tf.math.sigmoid
        self.dense1 = tf.keras.layers.Dense(units1, activation=sigmoid)
        self.dense2 = tf.keras.layers.Dense(units2, activation=sigmoid)
        self.dense3 = tf.keras.layers.Dense(units3, activation=tf.nn.softmax)

    def call(self, inputs):
        """Pass ``inputs`` through dense1, dense2 and dense3 in that order."""
        hidden = self.dense2(self.dense1(inputs))
        return self.dense3(hidden)

In [6]:
# Sparse categorical cross-entropy loss (default settings), called by `loss_function`.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()

In [7]:
def loss_function(real, pred):
    """Return the loss of predictions ``pred`` against labels ``real``.

    Thin wrapper that delegates to the module-level ``loss_object``.
    """
    loss_value = loss_object(real, pred)
    return loss_value

In [8]:
def train_epoch(model, optimizer, inputs, label):
    """Perform one gradient update of ``model`` on ``inputs`` / ``label``.

    The forward pass and the loss (via ``loss_function``) are recorded under a
    gradient tape; the resulting gradients of the model's trainable variables
    are then applied through ``optimizer``.

    Returns:
        The loss value computed before the update was applied.
    """
    with tf.GradientTape() as tape:
        loss = loss_function(label, model(inputs))

    params = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, params), params))
    return loss

In [9]:
# Directory whose entries are listed and parsed as language pairs further below.
# Can be overridden with the BIBLE_CORPUS_DIR environment variable so the
# notebook is not tied to one machine; defaults to the original location.
source_dir = os.environ.get(
    "BIBLE_CORPUS_DIR", "/data/rrjin/Graduation/data/bible-corpus/parallel_text"
)

In [10]:
def get_language_alpha3(language_code):
    """Resolve a 2- or 3-letter language code to its ``alpha_3`` code via pycountry.

    Returns:
        The ``alpha_3`` attribute of the matching pycountry language,
        ``"unknown language"`` when the lookup yields ``None``, or ``"-1"``
        when ``language_code`` is neither 2 nor 3 characters long.
    """
    # Which pycountry lookup field to use, chosen by the length of the code.
    field_by_length = {2: "alpha_2", 3: "alpha_3"}
    field = field_by_length.get(len(language_code))
    if field is None:
        return "-1"

    match = pycountry.languages.get(**{field: language_code})
    return "unknown language" if match is None else match.alpha_3


def check_alpha3(alpha3):
    """Return True when ``alpha3`` is a resolved code that lang2vec lists in ``l2v.LANGUAGES``."""
    # The "unknown language" sentinel is rejected without consulting lang2vec.
    if alpha3 == "unknown language":
        return False
    return alpha3 in l2v.LANGUAGES

In [11]:
# Maps the language codes found in the corpus entry names to ISO 639-3 codes.
# "jap" is seeded by hand (presumably because pycountry cannot resolve it —
# TODO confirm).
langcode_to_alpha3 = {"jap": "jpn"}

for lang in os.listdir(source_dir):
    # Get ISO 639-3 codes according to abbreviations of languages.
    # Entry names are expected to look like "<code1>-<code2>" followed by a
    # 4-character suffix, which is stripped here.
    pair = lang[:-4].split("-")
    if len(pair) != 2:
        # Not a language-pair entry: skip it instead of failing on unpacking.
        continue
    for code in pair:
        alpha3 = get_language_alpha3(code)
        if check_alpha3(alpha3):
            langcode_to_alpha3[code] = alpha3

# Drop the entry for English ("en"); tolerate its absence instead of raising KeyError.
langcode_to_alpha3.pop("en", None)

# Several corpus codes can resolve to the same ISO code, so de-duplicate;
# otherwise the same language would appear more than once in the data set.
# Sorting fixes the order of the samples.
lang_alpha3 = sorted(set(langcode_to_alpha3.values()))
feature_name = "syntax_wals"
features = l2v.get_features(lang_alpha3, feature_name, header=True)

features_geo = l2v.get_features(lang_alpha3, "geo", header=True)

# Model inputs: one "geo" feature vector per language, in `lang_alpha3` order.
X = [features_geo[lang] for lang in lang_alpha3]
# Fraction of the available samples used for training; the rest is used for testing.
train_data_rate = 0.7

In [None]:
# Accuracy on the held-out languages, keyed by feature code.
score_dict = {}

EPOCHS = 10000      # number of full-batch gradient steps per feature
HIDDEN_UNITS = 50   # width of both hidden layers

for feat in range(len(features["CODE"])):
    feat_code = features["CODE"][feat]

    # "--" marks a missing value; replace it with -1 and keep only the
    # languages for which this feature is known.
    Y = [features[lang][feat] if features[lang][feat] != "--" else -1 for lang in lang_alpha3]
    idx = [i for i in range(len(Y)) if Y[i] != -1]

    if len(idx) == 0:
        print("Feature {} is not available in all {} languages!".format(feat_code, len(lang_alpha3)))
        continue

    # Keep inputs and labels in two separate, homogeneous arrays. Packing
    # (vector, label) pairs into a single np.array produces a ragged array,
    # which NumPy >= 1.24 rejects with a ValueError.
    samples = np.asarray([X[i] for i in idx], dtype=np.float32)
    lab_enc = preprocessing.LabelEncoder()
    labels = lab_enc.fit_transform([Y[i] for i in idx]).astype(np.int64)

    # The first `train_data_rate` share of the samples trains the model, the rest tests it.
    split = int(len(idx) * train_data_rate)
    X_train, Y_train = samples[:split], labels[:split]
    X_test, Y_test = samples[split:], labels[split:]

    if len(X_train) == 0:
        print("Feature {} has no train data!".format(feat_code))
        continue

    if len(X_test) == 0:
        print("Feature {} has no test data!".format(feat_code))
        continue

    if np.all(Y_train == Y_train[0]):
        print("Feature {} has only one class!".format(feat_code))
        continue

    X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
    X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
    # Class indices are integers; int64 also matches the dtype returned by argmax below.
    Y_train = tf.convert_to_tensor(Y_train, dtype=tf.int64)
    Y_test = tf.convert_to_tensor(Y_test, dtype=tf.int64)

    # One output unit per class seen by the label encoder (2 for binary features).
    model = Classifyer(HIDDEN_UNITS, HIDDEN_UNITS, len(lab_enc.classes_))
    optimizer = tf.keras.optimizers.Adam()

    for epoch in range(EPOCHS):
        loss = train_epoch(model, optimizer, X_train, Y_train)

    # Predicted class = index of the largest softmax output.
    predict_y = tf.math.argmax(model(X_test), axis=1)

    # Accuracy: share of test samples whose predicted class equals the label.
    score = tf.math.reduce_sum(tf.cast(tf.math.equal(Y_test, predict_y), dtype=tf.float32)) / predict_y.shape[0]

    score_dict[feat_code] = score
    print("Feature {} accuracy is {}, train dataset has {} element, test dataset has {} element".format(feat_code, score, len(X_train), len(X_test)))

Feature S_SVO accuracy is 0.5789473652839661, train dataset has 44 element, test dataset has 19 element
Feature S_SOV accuracy is 0.4736842215061188, train dataset has 44 element, test dataset has 19 element
Feature S_VSO accuracy is 0.8947368264198303, train dataset has 44 element, test dataset has 19 element
Feature S_VOS accuracy is 0.8421052694320679, train dataset has 44 element, test dataset has 19 element
Feature S_OVS has only one class!
Feature S_OSV has only one class!
Feature S_SUBJECT_BEFORE_VERB accuracy is 0.8095238208770752, train dataset has 49 element, test dataset has 21 element
Feature S_SUBJECT_AFTER_VERB accuracy is 0.6666666865348816, train dataset has 47 element, test dataset has 21 element
Feature S_OBJECT_AFTER_VERB accuracy is 0.761904776096344, train dataset has 47 element, test dataset has 21 element
Feature S_OBJECT_BEFORE_VERB accuracy is 0.5714285969734192, train dataset has 47 element, test dataset has 21 element
Feature S_SUBJECT_BEFORE_OBJECT accuracy 