In [1]:
import pandas
import sklearn.metrics
import torch
import numpy
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

## Retrieve example pairs from lexique (-euse, -rice)

In [2]:
# Retrieve our examples: read the Lexique query export into a DataFrame.
# Its "lemme" and "Word" columns are paired up further down in this cell.
lexique_path = "Lexique-query-2023-04-11 13-58-53.xlsx"
df = pandas.read_excel(lexique_path)
def find_class(word):
    """Return 1 when ``word`` contains the substring "euse", otherwise 0."""
    return 1 if "euse" in word else 0
def get_feminine_word(neutral_lemma, examples):
    """Return the word paired with ``neutral_lemma`` in ``examples``.

    ``examples`` is iterated as (lemma, word) pairs. When several pairs share
    the requested lemma, the word of the last such pair is returned; when no
    pair matches, the empty string is returned.
    """
    matches = [word for lemma, word in examples if lemma == neutral_lemma]
    return matches[-1] if matches else ""
# Pair each value of the "lemme" column with the "Word" value on the same row.
examples = [(lemma, word) for lemma, word in zip(df["lemme"], df["Word"])]

### Loading vectors from our own implementation (2.5m words)

In [3]:
#load our own vectors
def vec(word):
    """Return the row of the global ``data`` selected by ``word_to_idx[word]``.

    The row is converted to a float32 numpy array. Both ``data`` and
    ``word_to_idx`` are module-level names assigned later in this cell, so
    they must exist before the first call.
    """
    row_index = word_to_idx[word]
    return numpy.array(data[row_index], dtype=numpy.float32)
# NOTE(review): torch.load relies on pickle and may execute code embedded in
# the file; only load checkpoints that come from a trusted source.
own_model = torch.load("./model499.pth", map_location=torch.device('cpu'))
idx_to_word = own_model["idx_to_word"]
word_to_idx = own_model["word_to_idx"]
# Bind ``data`` once, directly to the embedding weights that vec() indexes,
# instead of first pointing it at the whole state dict.
data = own_model["cbow_state_dict"]["embeddings.weight"].data
# Filter on word_to_idx, the mapping vec() actually indexes, so every lemma
# that passes the filter is guaranteed to resolve to a row.
# NOTE(review): the original tested ``lemme in idx_to_word``; if the two
# structures ever disagree the number of retained examples changes — confirm.
own_examples = [
    (vec(lemme), find_class(word))
    for lemme, word in examples
    if lemme in word_to_idx
]

### Loading FRCOW(8.8bn words) vectors

In [4]:
#load FRCOWS(8.8bn) vectors
def load_frcows(path):
    """Read space-separated text embeddings from ``path``.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by single spaces.

    Parameters
    ----------
    path : str
        Path to a UTF-8 encoded text file.

    Returns
    -------
    dict
        Maps each token (first field of a line) to a float32 numpy array
        built from the remaining fields of that line. When a token occurs on
        several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(path, "r", encoding="utf-8") as file:
        # Stream the file line by line instead of materialising it with
        # readlines(); embedding files can be very large.
        for line in file:
            # Drop the line terminator and trailing spaces so they cannot
            # become an empty (unparsable) last field.
            fields = line.rstrip().split(" ")
            token = fields[0]
            if not token:
                # Blank line, or a line with an empty first field: nothing
                # meaningful to store.
                continue
            # Disabled in the original; would cut the token at its last "_":
            # token = token[0:token.rfind("_")]
            embeddings[token] = numpy.array(fields[1:], dtype=numpy.float32)
    return embeddings

embeds = load_frcows("lemma-A-pos-small.txt")
# Keep only the examples whose lemma is a key of the loaded embeddings.
frcows_examples = [
    (embeds[lemma], find_class(word))
    for lemma, word in examples
    if lemma in embeds
]

### Loading preprocessed fasttext vectors from our examples

In [5]:
loaded_vectors = KeyedVectors.load('eur_vectors.kv')
# Build the lemma -> word lookup once instead of rescanning ``examples`` for
# every key. dict() keeps the last word seen for a repeated lemma, which is
# the same word get_feminine_word returns.
feminine_by_lemma = dict(examples)
# Keys with no entry in ``examples`` are skipped: get_feminine_word returns ""
# for them and find_class("") is 0, which silently labelled them as class 0.
fasttext_examples = [
    (loaded_vectors.get_vector(lemma), find_class(feminine_by_lemma[lemma]))
    for lemma in loaded_vectors.key_to_index
    if lemma in feminine_by_lemma
]

# Split into train/test sets
* A = our own vectors (frcow, 2.5m words)
* B = frcow vectors (8.8bn words)
* C = fasttext vectors

In [6]:
def _split_examples(labelled_examples):
    """Split (vector, label) pairs into X_train, X_test, y_train, y_test.

    Uses a 20% test share and a fixed random_state of 42.
    """
    vectors = [pair[0] for pair in labelled_examples]
    labels = [pair[1] for pair in labelled_examples]
    return train_test_split(vectors, labels, test_size=0.2, random_state=42)

X_train_A, X_test_A, y_train_A, y_test_A = _split_examples(own_examples)
X_train_B, X_test_B, y_train_B, y_test_B = _split_examples(frcows_examples)
X_train_C, X_test_C, y_train_C, y_test_C = _split_examples(fasttext_examples)

In [7]:
def _fit_and_score(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training split and evaluate it on the test split.

    Returns a ``(predictions, accuracy)`` tuple for the test split.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred).
    return predictions, sklearn.metrics.accuracy_score(y_test, predictions)

# random_state is fixed so that re-running the cell reproduces the same
# accuracies; without it the MLP is initialised differently on every run.

#frcow 2.5m
clf_A = MLPClassifier(max_iter=500, random_state=42)
pred_A, accuracy_A = _fit_and_score(clf_A, X_train_A, y_train_A, X_test_A, y_test_A)
print("accuracy on clf_A(frcow 2.5m):", accuracy_A)

#frcow 8.8bn
clf_B = MLPClassifier(max_iter=500, random_state=42)
pred_B, accuracy_B = _fit_and_score(clf_B, X_train_B, y_train_B, X_test_B, y_test_B)
print("accuracy on clf_B(frcow 8.8bn):", accuracy_B)

#fasttext
clf_C = MLPClassifier(max_iter=500, random_state=42)
pred_C, accuracy_C = _fit_and_score(clf_C, X_train_C, y_train_C, X_test_C, y_test_C)
print("accuracy on clf_C(fasttext):", accuracy_C)


accuracy on clf_A(frcow 2.5m): 0.5
accuracy on clf_B(frcow 8.8bn): 0.8888888888888888
accuracy on clf_C(fasttext): 0.9104477611940298


In [8]:
# Share of examples whose word is labelled 1 by find_class.
counter = sum(1 for example in examples if find_class(example[1]) == 1)
print(counter/len(examples))

0.7872168284789643


In [9]:
# accuracy_score is documented as (y_true, y_pred); the test labels are
# therefore passed first in each call below.

#frcow 2.5m
clf_A = Perceptron(max_iter=500)
clf_A.fit(X_train_A, y_train_A)
pred_A = clf_A.predict(X_test_A)
print("Perceptron accuracy on clf_A(frcow 2.5m):",sklearn.metrics.accuracy_score(y_test_A, pred_A))

#frcow 8.8bn
clf_B = Perceptron(max_iter=500)
clf_B.fit(X_train_B, y_train_B)
pred_B = clf_B.predict(X_test_B)
print("Perceptron accuracy on clf_B(frcow 8.8bn):",sklearn.metrics.accuracy_score(y_test_B, pred_B))

#fasttext
clf_C = Perceptron(max_iter=500)
clf_C.fit(X_train_C, y_train_C)
pred_C = clf_C.predict(X_test_C)
print("Perceptron accuracy on clf_C(fasttext):",sklearn.metrics.accuracy_score(y_test_C, pred_C))


Perceptron accuracy on clf_A(frcow 2.5m): 0.25
Perceptron accuracy on clf_B(frcow 8.8bn): 0.8477366255144033
Perceptron accuracy on clf_C(fasttext): 0.9054726368159204


In [10]:
# NOTE(review): looks like a leftover scratch cell; no visible cell reads `a`. Confirm and remove.
a = 1