In [11]:
import pandas
import sklearn.metrics
import torch
import numpy
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier

## Retrieve example pairs from lexique (-euse, -rice)

In [12]:
#retrieve our examples
df = pandas.read_excel("Lexique-query-2023-04-11 13-58-53.xlsx")
def find_class(word):
    return int("euse" in word)
def get_feminine_word(neutral_lemma, examples):
    feminine = ""
    for lemma, word in examples:
        if lemma==neutral_lemma:
            feminine = word
    return feminine
examples = list(zip(list(df["lemme"]), list(df["Word"])))

### Loading vectors from our own implementation (2.5m words) and (500m words)

In [13]:
#load our own vectors
def vec(data, word_to_idx, word):
    return numpy.array(data[word_to_idx[word]], dtype=numpy.float32)

#load FRCOWS(2.5m) vectors
own_model_thin = torch.load("./model106.pth", map_location=torch.device('cpu'))
idx_to_word_thin = own_model_thin["idx_to_word"]
word_to_idx_thin = own_model_thin["word_to_idx"]
data_thin = own_model_thin["cbow_state_dict"]
data_thin = data_thin["embeddings.weight"].data
own_examples_thin = [(vec(data_thin, word_to_idx_thin, lemme), find_class(word)) for lemme, word in examples if lemme in idx_to_word_thin]

#load FRCOWS(500m) vectors
own_model_thick = torch.load("./model4.pth", map_location=torch.device('cpu'))
idx_to_word_thick = own_model_thick["idx_to_word"]
data_thick = own_model_thick["cbow_state_dict"]
word_to_idx_thick = own_model_thick["word_to_idx"]
data_thick = data_thick["embeddings.weight"].data
own_examples_thick = [(vec(data_thick, word_to_idx_thick, lemme), find_class(word)) for lemme, word in examples if lemme in idx_to_word_thick]


### Loading FRCOW(8.8bn words) vectors

In [14]:
#load FRCOWS(8.8bn) vectors
def load_frcows(path):
    embeddings={}
    with open(path, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for e in lines:
            token = e.split(" ")[0]
            # token = token[0:token.rfind("_")]
            embedding = numpy.array(e.split(" ")[1:], dtype=numpy.float32)
            embeddings[token] = embedding
    return embeddings

embeds = load_frcows("lemma-A-pos-small.txt")
frcows_examples = [(embeds[lemma], find_class(word)) for lemma, word in examples if lemma in embeds.keys()]

### Loading preprocessed fasttext vectors from our examples

In [15]:
loaded_vectors = KeyedVectors.load('eur_vectors.kv')
fasttext_examples = [(loaded_vectors.get_vector(lemma), find_class(get_feminine_word(lemma, examples))) for lemma in loaded_vectors.key_to_index.keys()]

# Split in train/test corpus
* thick = frcow 500m
* thin = frcow 2.5m
* B = frcow 8.8bn
* C = fasttext

In [16]:
X_train, X_test, y_train, y_test = train_test_split([example[0] for example in own_examples_thick], [example[1] for example in own_examples_thick], test_size=0.2, random_state=42)
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split([example[0] for example in own_examples_thin], [example[1] for example in own_examples_thin], test_size=0.2, random_state=42)
X_train_B, X_test_B, y_train_B, y_test_B = train_test_split([example[0] for example in frcows_examples], [example[1] for example in frcows_examples], test_size=0.2, random_state=42)
X_train_C, X_test_C, y_train_C, y_test_C = train_test_split([example[0] for example in fasttext_examples], [example[1] for example in fasttext_examples], test_size=0.2, random_state=42)

In [25]:
#frcow 500m
clf_score=0
for count in range(100):
    clf = MLPClassifier(max_iter=500)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    clf_score += sklearn.metrics.accuracy_score(pred, y_test)
print("Multi-Layer Perceptron accuracy on clf(frcow 500m):",clf_score/100)

#frcow 2.5m
clf_A_score=0
for count in range(100):
    clf_A = MLPClassifier(max_iter=500)
    clf_A.fit(X_train_A, y_train_A)
    pred_A = clf_A.predict(X_test_A)
    clf_A_score += sklearn.metrics.accuracy_score(pred_A, y_test_A)
print("Multi-Layer Perceptron accuracy on clf_A(frcow 2.5m):",clf_A_score/100)


#frcow 8.8bn
clf_B_score=0
for count in range(100):
    clf_B = MLPClassifier(max_iter=500)
    clf_B.fit(X_train_B, y_train_B)
    pred_B = clf_B.predict(X_test_B)
    clf_B_score += sklearn.metrics.accuracy_score(pred_B, y_test_B)
print("Multi-Layer Perceptron accuracy on clf_B(frcow 8.8bn):",clf_B_score/100)

#fasttext
clf_C_score = 0
for count in range(100):
    clf_C = MLPClassifier(max_iter=500)
    clf_C.fit(X_train_C, y_train_C)
    pred_C = clf_C.predict(X_test_C)
    clf_C_score += sklearn.metrics.accuracy_score(pred_C, y_test_C)
print("Multi-Layer Perceptron accuracy on clf_C(fasttext):",clf_C_score/100)


Multi-Layer Perceptron accuracy on clf_A(frcow 500m): 0.8379166666666664
Multi-Layer Perceptron accuracy on clf_A(frcow 2.5m): 0.6625000000000003
Multi-Layer Perceptron accuracy on clf_B(frcow 8.8bn): 0.8933333333333324




Multi-Layer Perceptron accuracy on clf_C(fasttext): 0.92


In [18]:
counter = 0
for example in examples:
    if find_class(example[1]) == 1:
        counter += 1
print(counter/len(examples))

0.7872168284789643


In [26]:
#frcow 500m
clf_score = 0
for count in range(100):
    clf = Perceptron(max_iter=500)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    clf_score += sklearn.metrics.accuracy_score(pred, y_test)
print("Perceptron accuracy on clf_A(frcow 500m):", clf_score/100)

#frcow 2.5m
clf_A_score = 0
for count in range(100):
    clf_A = Perceptron(max_iter=500)
    clf_A.fit(X_train_A, y_train_A)
    pred_A = clf_A.predict(X_test_A)
    clf_A_score += sklearn.metrics.accuracy_score(pred_A, y_test_A)
print("Perceptron accuracy on clf_A(frcow 2.5m):",clf_A_score/100)

#frcow 8.8bn
clf_B_score = 0
for count in range(100):
    clf_B = Perceptron(max_iter=500)
    clf_B.fit(X_train_B, y_train_B)
    pred_B = clf_B.predict(X_test_B)
    clf_B_score += sklearn.metrics.accuracy_score(pred_B, y_test_B)
print("Perceptron accuracy on clf_B(frcow 8.8bn):",clf_B_score/100)

#fasttext
clf_C_score = 0
for count in range(100):
    clf_C = Perceptron(max_iter=500)
    clf_C.fit(X_train_C, y_train_C)
    clf_C_score += sklearn.metrics.accuracy_score(pred_C, y_test_C)
print("Perceptron accuracy on clf_C(fasttext):",clf_C_score/100)


Perceptron accuracy on clf_A(frcow 500m): 0.796875
Perceptron accuracy on clf_A(frcow 2.5m): 0.6071428571428559
Perceptron accuracy on clf_B(frcow 8.8bn): 0.8477366255144019
Perceptron accuracy on clf_C(fasttext): 0.9203980099502477


In [20]:
a = 1