In [401]:
import os
import string
from collections import defaultdict
import re
import numpy as np

np.random.seed(42)

In [402]:
def letter_freq(text: str):
    text = [x for x in text.lower() if x in string.ascii_lowercase]
    total_len = len(text)

    freq = defaultdict(int)
    for c in string.ascii_lowercase:
        freq[c] = 0
        
    for c in text:
        freq[c] += 1
    return np.array(list(freq.values())) / total_len

In [403]:
letter_freq("aaab")

array([0.75, 0.25, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  ])

In [404]:
def load_data(path: str):
    files = os.listdir(path)
    data = []
    labels = []
    for file in files:
        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
            file_data = letter_freq(f.read())
            if file[0].isdigit():
                language = re.match(r'[A-Za-z0-9]+_([a-z]{2})\.txt', file)
            else:
                language = re.match(r'([a-z]{2})_[A-Za-z0-9]+\.txt', file)
            data.append(file_data)
            labels.append(language.group(1))
    return np.array(data), np.array(labels)

In [405]:
data, labels = load_data("./dataset/train")
data

array([[8.53052297e-02, 2.70431497e-02, 3.61900974e-02, 3.55935574e-02,
        1.17120700e-01, 2.06800557e-02, 2.00835156e-02, 5.78643866e-02,
        6.32332472e-02, 6.16424737e-03, 6.95963412e-03, 3.97693378e-02,
        2.08789024e-02, 6.18413203e-02, 6.40286339e-02, 1.43169616e-02,
        1.98846689e-04, 7.01928813e-02, 7.17836548e-02, 1.03797972e-01,
        3.99681845e-02, 7.75502088e-03, 1.39192682e-02, 9.94233446e-04,
        1.43169616e-02, 0.00000000e+00],
       [1.13550341e-01, 9.84102952e-03, 4.50416351e-02, 3.36866011e-02,
        1.36260409e-01, 1.02195307e-02, 1.93035579e-02, 6.81302044e-03,
        1.00681302e-01, 0.00000000e+00, 0.00000000e+00, 6.69947010e-02,
        2.57380772e-02, 6.51021953e-02, 8.51627555e-02, 2.91445874e-02,
        2.27100681e-03, 6.92657078e-02, 5.86676760e-02, 8.36487509e-02,
        2.08175625e-02, 1.40045420e-02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.78501136e-03],
       [7.56153261e-02, 2.20756153e-02, 2.53742705e-02

In [406]:
labels

array(['en', 'it', 'en', 'it', 'en', 'it'], dtype='<U2')

In [407]:
def encode_labels(labels):
    return np.where(labels=="en", 1, -1)

In [408]:
encoded_labels = encode_labels(labels)
encoded_labels

array([ 1, -1,  1, -1,  1, -1])

In [409]:
def train(data, labels, epochs, learning_rate):
    num_features = len(data[0])
    weights = np.random.uniform(-0.5, 0.5, num_features)
    bias = np.random.uniform(-0.5, 0.5)

    def activation(x):
        return 1 if x >= 0 else -1

    for epoch in range(epochs):
        ok = 0
        for x, y in zip(data, labels):
            output = np.sum(weights * x) + bias
            prediction = activation(output)

            if prediction != y:
                weights += x * y + learning_rate
                bias += y#learning_rate * y
            else:
                ok += 1
        print(f"Epoch {epoch+1}/{epochs} accuracy: {ok/len(data)}")
    
    return weights, bias

In [410]:
weights, bias = train(data, encoded_labels, epochs=15, learning_rate=0.1)
weights, bias

Epoch 1/15 accuracy: 0.16666666666666666
Epoch 2/15 accuracy: 0.0
Epoch 3/15 accuracy: 0.3333333333333333
Epoch 4/15 accuracy: 1.0
Epoch 5/15 accuracy: 1.0
Epoch 6/15 accuracy: 1.0
Epoch 7/15 accuracy: 1.0
Epoch 8/15 accuracy: 1.0
Epoch 9/15 accuracy: 1.0
Epoch 10/15 accuracy: 1.0
Epoch 11/15 accuracy: 1.0
Epoch 12/15 accuracy: 1.0
Epoch 13/15 accuracy: 1.0
Epoch 14/15 accuracy: 1.0
Epoch 15/15 accuracy: 1.0


(array([1.06444027, 2.0454838 , 1.62690974, 1.5802715 , 1.03672045,
        1.23596838, 1.02827835, 2.19520257, 1.09166988, 1.74439274,
        1.0868056 , 1.73102586, 1.77307692, 1.1534651 , 0.97882375,
        1.05627055, 1.28734754, 1.47392771, 1.53959769, 1.3214834 ,
        1.62182012, 1.08410599, 1.39629294, 1.38041439, 1.54900451,
        1.72234521]),
 np.float64(-1.3003262178416404))

In [411]:
def test(data, labels, weights, bias):
    def activation(x):
        return 1 if x >= 0 else -1
    
    ok = 0
    for x, y in zip(data, labels):
        output = np.sum(weights * x) + bias
        prediction = activation(output)
        if prediction == y:
            ok += 1

    print(f"Accuracy test: {ok/len(data)}")

In [412]:
data, labels = load_data("./dataset/test")
data

array([[0.0877193 , 0.03508772, 0.        , 0.        , 0.15789474,
        0.01754386, 0.        , 0.07017544, 0.05263158, 0.        ,
        0.01754386, 0.03508772, 0.03508772, 0.05263158, 0.10526316,
        0.01754386, 0.01754386, 0.05263158, 0.0877193 , 0.14035088,
        0.01754386, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.07042254, 0.        , 0.04225352, 0.01408451, 0.09859155,
        0.02816901, 0.01408451, 0.05633803, 0.09859155, 0.        ,
        0.        , 0.04225352, 0.04225352, 0.09859155, 0.07042254,
        0.01408451, 0.        , 0.04225352, 0.09859155, 0.04225352,
        0.05633803, 0.02816901, 0.01408451, 0.        , 0.02816901,
        0.        ],
       [0.04761905, 0.        , 0.01587302, 0.04761905, 0.14285714,
        0.01587302, 0.01587302, 0.07936508, 0.0952381 , 0.        ,
        0.        , 0.01587302, 0.01587302, 0.0952381 , 0.06349206,
        0.04761905, 0.        , 0.07936508, 0.06349206, 0.07936508,
      

In [413]:
labels = encode_labels(labels)
test(data, labels, weights, bias)

Accuracy test: 0.75


In [414]:
# def test(weights, bias, text):
#     def activation(x):
#         return 1 if x >= 0 else -1

#     data = letter_freq(text)
#     output = np.sum(weights * data) + bias
#     prediction = activation(output)
#     return "en" if prediction == 1 else "it"

In [415]:
# text = "SMS Friedrich Carl was an armored cruiser of the Imperial German Navy. A member of the Prinz Adalbert class, the ship was intended to act as a scout for the fleet's battleships and to patrol the German colonial empire. The Prinz Adalbert class was based on the earlier armored cruiser Prinz Heinrich, but with improved armament and armor. Built in the early 1900s, Friedrich Carl served in the German fleet from 1904 to 1909, which included a period as flagship of the reconnaissance squadron and a cruise to the Mediterranean Sea."
# test(weights, bias, text)

In [416]:
# text = "that Samuel Lander founded the Williamston Female College in an abandoned hotel?"
# test(weights, bias, text)

In [417]:
# text = "Operazione Quercia (in tedesco Unternehmen Eiche) fu il nome in codice dato all'operazione militare condotta il 12 settembre 1943 dai paracadutisti tedeschi della 2. Fallschirmjäger-Division che portò alla liberazione di Benito Mussolini dalla prigionia a Campo Imperatore, sul Gran Sasso. Dopo essere stato arrestato il 25 luglio 1943, Mussolini venne condotto in varie località e alla fine trasferito a Campo Imperatore a fine estate del 1943, una zona isolata e raggiungibile solo tramite funivia, dove era guardato a vista. Per non rischiare di farlo cadere in mano agli Alleati, Adolf Hitler ordinò al generale dei paracadutisti Kurt Student di organizzare una missione per la liberazione del Duce servendosi dei suoi Fallschirmjäger, a cui vennero aggregati, per ragioni politiche, sedici uomini del Servizio di sicurezza (Sicherheitsdienst - SD) delle SS agli ordini del capitano Otto Skorzeny."
# test(weights, bias, text)

In [418]:
# text = "Il 12 settembre, pochi giorni dopo il Proclama Badoglio che annunciava la resa incondizionata delle forze italiane agli Alleati, i paracadutisti tedeschi lanciarono un audace assalto per liberare Mussolini, che si risolse con successo e senza perdite per gli assalitori. Grazie ai suoi contatti diretti con Ernst Kaltenbrunner e Heinrich Himmler, fin da subito Skorzeny riuscì a imporre la propria versione distorta e autocelebrativa dei fatti avvenuti sul Gran Sasso"
# test(weights, bias, text)

In [419]:
for lr in [1, 0.1, 0.01, 0.001, 0.0001]:
    print(lr, 30 * "*")
    data, labels = load_data("./dataset/train")
    encoded_labels = encode_labels(labels)

    weights, bias = train(data, encoded_labels, epochs=15, learning_rate=lr)
    data, labels = load_data("./dataset/test")
    labels = encode_labels(labels)
    test(data, labels, weights, bias)

1 ******************************
Epoch 1/15 accuracy: 0.5
Epoch 2/15 accuracy: 0.6666666666666666
Epoch 3/15 accuracy: 1.0
Epoch 4/15 accuracy: 1.0
Epoch 5/15 accuracy: 1.0
Epoch 6/15 accuracy: 1.0
Epoch 7/15 accuracy: 1.0
Epoch 8/15 accuracy: 1.0
Epoch 9/15 accuracy: 1.0
Epoch 10/15 accuracy: 1.0
Epoch 11/15 accuracy: 1.0
Epoch 12/15 accuracy: 1.0
Epoch 13/15 accuracy: 1.0
Epoch 14/15 accuracy: 1.0
Epoch 15/15 accuracy: 1.0
Accuracy test: 0.7
0.1 ******************************
Epoch 1/15 accuracy: 0.16666666666666666
Epoch 2/15 accuracy: 0.16666666666666666
Epoch 3/15 accuracy: 0.0
Epoch 4/15 accuracy: 0.16666666666666666
Epoch 5/15 accuracy: 0.0
Epoch 6/15 accuracy: 0.16666666666666666
Epoch 7/15 accuracy: 0.0
Epoch 8/15 accuracy: 0.3333333333333333
Epoch 9/15 accuracy: 0.16666666666666666
Epoch 10/15 accuracy: 0.3333333333333333
Epoch 11/15 accuracy: 1.0
Epoch 12/15 accuracy: 1.0
Epoch 13/15 accuracy: 1.0
Epoch 14/15 accuracy: 1.0
Epoch 15/15 accuracy: 1.0
Accuracy test: 0.7
0.01 **