In [8]:
import os
import re
import pandas as pd
from langdetect import detect
import textract
import json
from rank_bm25 import BM25Okapi
from urllib.request import urlopen
from bs4 import BeautifulSoup
import wikipedia
import spacy
import pdftotext
import PyPDF2
import re
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer
import scipy
import numpy as np


class Extractor:
    def __init__(self):
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        self.df = pd.read_excel("data/grammars.xlsx")
        self.lang_list = ["ca", "zh", "en", "fr", "de", "it", "pt", "ru", "es"]
        self.models = dict()
        self.stopwords = dict()
        self.embedder = SentenceTransformer('bert-base-multilingual-cased')

        modelPath = "bert-base-multilingual-cased"

        self.embedder.save(modelPath)
        self.embedder = SentenceTransformer(modelPath)

        with open("data/language_files.json", 'r') as file:
            self.language_files = json.load(file)

        for lang in self.lang_list:
            exec("from spacy.lang.%s.stop_words import STOP_WORDS" % (lang))
            exec("self.stopwords[lang] = spacy.lang.%s.stop_words.STOP_WORDS" % (lang))
            if lang in ("en", "zh"):
                model_name = lang + "_core_web_sm"
            else:
                model_name = lang + "_core_news_sm"

            self.models[lang] = spacy.load(model_name, disable=['parser', 'ner'])

    def get_lang(self, filename):
        text = textract.process(filename).decode("utf-8") 
        lang = detect(text)
        return lang

    def first_letter(self, s):
        m = re.search(r'[a-z]', s, re.I)
        if m is not None:
            return s[m.start()]
        return "A"

    def end_of_sentence(self, text):
        text = text.strip("\n ")
        stop = ('...', '.', '?', '!', '!!!', '…')
        for item in stop:
            if text.endswith(item):
                return True
        return False

    def digits(self, text):
        num = 0
        for i in text:
            if i.isdigit():
                num += 1
        return num

    def get_term(self, term, language):
        if language == "en":
            return term
        # get languages
        soup = BeautifulSoup(urlopen('http://en.wikipedia.org/wiki/' + (term[:1].upper() + term[1:]).replace(' ', '_')), features="lxml")
        interwikihead = soup.find('li', class_=('interlanguage-link interwiki-' + language + ' mw-list-item'))

        try:
            title = re.split(' \(| –', interwikihead.a.get('title'))[0]
            return title.lower()
        except:
            return None 

    def get_description(self, term, language):
        soup = BeautifulSoup(urlopen('http://en.wikipedia.org/wiki/' + (term[:1].upper() + term[1:]).replace(' ', '_')), features="lxml")
        interwikihead = soup.find('li', class_=('interlanguage-link interwiki-' + language + ' mw-list-item'))

        try:
            if language == "en":
                title = term[:1].upper() + term[1:]
            else:
                title = interwikihead.a.get('title').split(u' – ')[0]
            wikipedia.set_lang(language)
            page = wikipedia.page(title)
            return page.summary
        except:
            return None 

    def get_new_paragraphs(self, paragraphs, page_numbers):
        new_paragraphs = [paragraphs[0]]
        new_page_numbers = dict()
        new_page_numbers[paragraphs[0]] = [page_numbers[0]]
        for i in range(1, len(paragraphs)):
            paragraph = paragraphs[i].strip(" \n")
            if len(paragraph) > 0:
                if (not self.end_of_sentence(new_paragraphs[-1])) | self.first_letter(paragraphs[i]).islower():
                    index = new_page_numbers[new_paragraphs[-1]]
                    del new_page_numbers[new_paragraphs[-1]]
                    new_paragraphs[-1] += paragraphs[i]
                    if index[-1] != page_numbers[i]:
                        index.append(page_numbers[i])
                    new_page_numbers[new_paragraphs[-1]] = index
                else:
                    new_paragraphs.append(paragraphs[i])
                    new_page_numbers[paragraphs[i]] = [page_numbers[i]]
        return new_paragraphs, new_page_numbers

    def make_dir(self, fname):

        directory = os.path.dirname(fname)
        if not os.path.exists(directory):
            os.system("mkdir -p \"{directory}\"")
        os.system("touch \"{os.path.basename(new_fname)}\"")

    def rerank(self, query, entries, embeddings_dict):

        corpus_embeddings = self.embedder.encode(entries)

        query_embedding = self.embedder.encode(query)

        # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
        closest_n = 5
        distances = scipy.spatial.distance.cdist(query_embedding, corpus_embeddings, "cosine")[0]

        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])

        ans = []

        for idx, distance in results[0:closest_n]:
            ans.append(entries[idx])

        return ans

    def extract(self, lang_about, query, method, description=True):

        #try:

            ans = []
            image_files = []
            fnames = []
            i = 0
            filenames = self.language_files[lang_about]
            for item in filenames:

                fname = item[0]
                short_name = fname

                lang = item[1]
                absolute_path = "data/"
                fname = absolute_path + fname
                fname_par = (absolute_path + "Grammars_Paragraphs/" + os.path.basename(fname)).replace("pdf", "json")
                fname_lem = (absolute_path + "Grammars_Lemmas/" + os.path.basename(fname)).replace("pdf", "json")
                fname_num = (absolute_path + "Grammars_Page_Numbers/" + os.path.basename(fname)).replace("pdf", "json")


                nlp = self.models[lang]
                stopwords_lang = self.stopwords[lang]

                if not os.path.exists(fname_par):
                    
                    command = "rclone copy \"gdrive:" + short_name + "\" \"data/" + os.path.dirname(short_name) + "\" --no-traverse --drive-chunk-size 32M -P"
                    print(command)
                    os.system(command)

                    with open(fname, 'rb') as f:
                        pdf = pdftotext.PDF(f)
                    page_numbers = []
                    paragraphs = []
                    lemmatized_paragraphs = []
                    for j in range (len(pdf)):
                        addition = re.split('  ', pdf[j])
                        paragraphs.extend(addition)
                        for paragraph in addition:
                            page_numbers.append(j)
                    new_paragraphs, new_page_numbers = self.get_new_paragraphs(paragraphs, page_numbers)

                    for paragraph in new_paragraphs:
                        lemmatized_paragraph = []
                        doc = nlp(paragraph.lower())
                        for token in doc:
                            if token.lemma_ not in stopwords_lang and token.is_alpha:
                                lemmatized_paragraph.append(token.lemma_)
                        lemmatized_paragraphs.append(lemmatized_paragraph)

                    self.make_dir(fname_lem)
                    self.make_dir(fname_par)
                    self.make_dir(fname_num)

                    with open(fname_lem, 'w') as outfile:
                        lem = json.dumps(lemmatized_paragraphs)
                        outfile.write(lem)
                    with open(fname_par, 'w') as outfile:
                        par = json.dumps(new_paragraphs)
                        outfile.write(par)
                    with open(fname_num, 'w') as outfile:
                        num = json.dumps(new_page_numbers)
                        outfile.write(num)

                else:
                    with open(fname_par, 'r') as file:
                        new_paragraphs = json.load(file)
                    with open(fname_lem, 'r') as file:
                        lemmatized_paragraphs = json.load(file)
                    with open(fname_num, 'r') as file:
                        new_page_numbers = json.load(file)

                pdf_file = PyPDF2.PdfReader(fname)
                bm25 = BM25Okapi(lemmatized_paragraphs)

                if description:
                    term = self.get_term(query, lang)
                    fname_desc = "data/Grammars_Descriptions/" + term + ".json"
                    fname_desc_lem = "data/Grammars_Descriptions/" + term + "_lemmatized.json"

                    if not os.path.exists(fname_desc):
                        desc = self.get_description(query, lang)
                        desc_nlp = nlp(self.get_description(query, lang))
                        lemmatized_desc = []

                        for token in desc_nlp:
                            if token.lemma_ not in stopwords_lang and token.is_alpha:
                                lemmatized_desc.append(token.lemma_)

                        self.make_dir(fname_desc)
                        self.make_dir(fname_desc_lem)

                        with open(fname_desc, 'w') as outfile:
                            file_desc = json.dumps(desc)
                            outfile.write(file_desc)
                        with open(fname_desc_lem, 'w') as outfile:
                            file_desc_lem = json.dumps(lemmatized_desc)
                            outfile.write(file_desc_lem)

                    else:
                        with open(fname_desc, 'r') as file:
                            desc = json.load(file)
                        with open(fname_desc_lem, 'r') as file:
                            lemmatized_desc = json.load(file)

                    query_translated = lemmatized_desc

                else:
                    term = nlp(self.get_term(query, lang))
                    query_translated = []

                    for token in term:
                        if token.lemma_ not in stopwords_lang and token.is_alpha:
                            query_translated.append(token.lemma_)

                top_n = bm25.get_top_n(query_translated, new_paragraphs, n=5)

                if method == "BM25":
                    ans += top_n

                else:
                    reranked = self.rerank(query_translated, top_n, dict())
                    ans += reranked

                fnames.append(short_name)

            fname_indices = []
            
            return ans

        #except:
            #pass

Список всех языков, которые есть в базе данных, файл language_files.json

In [23]:
language_files = dict()
lang_set = set()

data = pd.read_excel("/home/aruhaizen/project/linguistic_data_extraction/grammars_database.xlsx")
for index, row in data.iterrows():
    if type(row["the language described"]) != float:
        lang_set.add(row["the language described"])
        if row["the language described"] not in language_files:
            language_files[row["the language described"]] = []
        language_files[row["the language described"]].append([row["full path"].replace("Language_Stuff", "Grammars"), row["meta language"]])

with open("language_files.json", "w") as outfile:
    json.dump(language_files, outfile)

In [24]:
lang_list = list(lang_set)
print(lang_list)

['Cogui', 'Lule', 'Tibetan', 'Karelian', 'Samaritan Aramaic', 'Albanian-Gheg', 'Kafa', 'Pampangan']


In [11]:
wals = pd.read_csv("wals_word_order.csv")
targets = dict(zip(wals.name, wals.description))

In [13]:
for i in lang_set:
    if i in targets:
            print(i)

Yaqui
Wyandot
Udmurt
Oksapmin
Garo
Hidatsa
Salinan
Mekens
Yurok
Biloxi
Macushi
Pipil
Barupu
Choctaw
Rikbaktsa
Basque
Blackfoot
Apurinã
Thai
Yagua
Khmer
Kuna
Piro
Gitksan
Usarufa
Carib
Maricopa
Nenets
Haida
Hup
Amahuaca
Menya
Ayoreo
Quileute
Evenki
Desano
Dâw
Kharia
Hupa
Mosetén
Yuchi
Galo
Wiyot
Jingpho
Santa
Sherpa
Chrau
Chukchi
Burmese
Camling
Mohawk
Balti
Lisu
Apatani
Crow
Passamaquoddy-Maliseet
Estonian
Dhimal
Temiar
Thulung
Timucua
Uyghur
Cantonese
Mundari
Magar
Mara
Turkmen
Zuni
Mansi
Ket
Wichita
Azerbaijani
Cherokee
Jamsay
Canela
Mizo
Skou
Khanty
Pirahã
O'odham
Luiseño
Burushaski
Darma
Pilagá
Awa
Seneca
Mandarin
Moghol
Mutsun
Korean
Kutenai
Baure
Cubeo
Abau
Uzbek
Yaminahua
Abui
Warembori
Japanese
Nivkh
Kham
Aleut
Seri
Koasati
Cashibo
Washo
Tuvan
Massachusett
Santali
Kwaza
Palikur
Even
Klamath
Ordos
Kayapó
Kamasau
Tutelo
Karajá
Mikasuki
Dagur
Cholón
Cocama
Kadiwéu
Bisu
Apinayé
Itelmen
Guaraní
Lavukaleve
Awtuw
Golin
Lepcha
Savosavo
Mian
Ainu
Edolo
Tshangla
Tonkawa
Kusunda
German
Cu

In [5]:
extractor = Extractor()

In [6]:
def get_df(langs):
    data = dict()
    data["language"] = []
    data["extracted"] = []
    data["target"] = []
    for lang in langs:
        #try:
            if lang not in data["language"]:
                target = extractor.extract(lang, "Word order", "BM25")
                text = ""
                for item in target:
                    text += item
                    text += "\n\n-------------------------------\n\n"
                data["language"].append(lang)
                data["extracted"].append(text)
                data["target"].append(targets[lang])
        #except:
            #pass
    return pd.DataFrame(data)

In [7]:
df_train = get_df(["Khanty", "Khmer", "Guajajara", "Chukchi"])
df_train

Unnamed: 0,language,extracted,target
0,Khanty,As to the order of the elements of the constr...,SOV
1,Khmer,Should you order the food?\n\n\n-------------...,SVO
2,Guajajara,Aqui estão alguns exemplos de orações e períod...,VSO
3,Chukchi,PRAGMATICS OF SENTENCE FORMChapter 19\nelement...,No dominant order


In [8]:
df_val = get_df(["Basque", "Burushaski", "Seneca"])
df_val

Unnamed: 0,language,extracted,target
0,Basque,"In (1c), the sentence contains a transitive ve...",SOV
1,Burushaski,Brsk. lacks the rich derivational machinery ch...,SOV
2,Seneca,145\nA Grammar of the Seneca Language\n146\nA ...,No dominant order


In [9]:
import torch
import numpy as np
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
labels = {"SOV": 0,
          "SVO": 1,
          "VSO": 2,
          "No dominant order": 3
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df["target"]]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df["extracted"]]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [10]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [12]:
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 2/2 [00:00<00:00,  2.21it/s]


Epochs: 1 | Train Loss:  0.804                 | Train Accuracy:  0.250                 | Val Loss:  1.064                 | Val Accuracy:  0.000


100%|██████████| 2/2 [00:00<00:00,  2.75it/s]


Epochs: 2 | Train Loss:  0.802                 | Train Accuracy:  0.000                 | Val Loss:  0.968                 | Val Accuracy:  1.000


100%|██████████| 2/2 [00:00<00:00,  2.90it/s]


Epochs: 3 | Train Loss:  0.793                 | Train Accuracy:  0.250                 | Val Loss:  1.035                 | Val Accuracy:  0.333


100%|██████████| 2/2 [00:00<00:00,  2.72it/s]


Epochs: 4 | Train Loss:  0.835                 | Train Accuracy:  0.000                 | Val Loss:  1.002                 | Val Accuracy:  0.667


100%|██████████| 2/2 [00:00<00:00,  2.45it/s]

Epochs: 5 | Train Loss:  0.789                 | Train Accuracy:  0.250                 | Val Loss:  1.052                 | Val Accuracy:  0.667





Feature 24A: Locus of Marking in Possessive Noun Phrases

Feature 23A: Locus of Marking in the Clause
