In [1]:
import os
import string
from collections import defaultdict
import re

In [2]:
def letter_freq(text: str):
    """Return the relative frequency of each ASCII letter in ``text``.

    The text is lower-cased and every character that is not one of
    ``a``-``z`` (digits, punctuation, whitespace, accented letters, ...)
    is ignored.

    Args:
        text: Arbitrary input text.

    Returns:
        A dict with exactly 26 entries, keyed ``'a'`` .. ``'z'`` in
        alphabetical order, mapping each letter to its share of all
        counted letters. If ``text`` contains no ASCII letters every
        share is ``0.0`` (instead of raising ``ZeroDivisionError``).
    """
    letters = [ch for ch in text.lower() if ch in string.ascii_lowercase]

    # Pre-seed every letter so the result always has all 26 keys, in a-z order.
    counts = dict.fromkeys(string.ascii_lowercase, 0)
    for ch in letters:
        counts[ch] += 1

    total = len(letters)
    if total == 0:
        # Nothing to normalise by: report an all-zero distribution.
        return {ch: 0.0 for ch in counts}
    return {ch: n / total for ch, n in counts.items()}

In [3]:
# Sanity check: three "a"s and one "b" should yield 0.75 / 0.25 and 0.0 for every other letter.
letter_freq("aaab")

{'a': 0.75,
 'b': 0.25,
 'c': 0.0,
 'd': 0.0,
 'e': 0.0,
 'f': 0.0,
 'g': 0.0,
 'h': 0.0,
 'i': 0.0,
 'j': 0.0,
 'k': 0.0,
 'l': 0.0,
 'm': 0.0,
 'n': 0.0,
 'o': 0.0,
 'p': 0.0,
 'q': 0.0,
 'r': 0.0,
 's': 0.0,
 't': 0.0,
 'u': 0.0,
 'v': 0.0,
 'w': 0.0,
 'x': 0.0,
 'y': 0.0,
 'z': 0.0}

In [4]:
def load_data(path: str):
    """Load every labelled text file in ``path`` as a letter-frequency sample.

    A file is treated as a sample when its name starts with
    ``<name>_<xx>.txt``, where ``<name>`` is alphanumeric and ``<xx>`` is a
    two-letter lower-case code that becomes the sample's label. Entries that
    do not follow this naming scheme, or that are not regular files, are
    skipped instead of aborting the whole load.

    Args:
        path: Directory containing the text files.

    Returns:
        A list of ``[letter_frequencies, label]`` pairs, one per accepted
        file, ordered by file name so repeated runs give the same order.
    """
    filename_pattern = re.compile(r'[A-Za-z0-9]+_([a-z]{2})\.txt')
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file in sorted(os.listdir(path)):
        language = filename_pattern.match(file)
        full_path = os.path.join(path, file)
        if language is None or not os.path.isfile(full_path):
            # Stray entry (e.g. a checkpoint directory or hidden file): ignore it.
            continue
        with open(full_path, "r", encoding="utf-8") as f:
            file_data = letter_freq(f.read())
        results.append([file_data, language.group(1)])
    return results

In [5]:
# Load every training document as a [letter-frequency dict, language code] pair.
train_data = load_data("./dataset/train")
# Displays the full list; the output grows with the number of files loaded.
train_data

[[{'a': 0.08530522966792603,
   'b': 0.02704314973155697,
   'c': 0.03619009743487771,
   'd': 0.03559355736726984,
   'e': 0.11712069994034599,
   'f': 0.020680055677072977,
   'g': 0.020083515609465103,
   'h': 0.05786438655796381,
   'i': 0.06323324716643468,
   'j': 0.006164247365281368,
   'k': 0.006959634122091867,
   'l': 0.03976933784052496,
   'm': 0.0208789023662756,
   'n': 0.0618413203420163,
   'o': 0.06402863392324518,
   'p': 0.014316961622588983,
   'q': 0.00019884668920262477,
   'r': 0.07019288128852655,
   's': 0.07178365480214755,
   't': 0.10379797176377013,
   'u': 0.03996818452972758,
   'v': 0.007755020878902366,
   'w': 0.013919268244183734,
   'x': 0.000994233446013124,
   'y': 0.014316961622588983,
   'z': 0.0},
  'en'],
 [{'a': 0.11355034065102196,
   'b': 0.00984102952308857,
   'c': 0.045041635124905374,
   'd': 0.03368660105980318,
   'e': 0.13626040878122633,
   'f': 0.010219530658591975,
   'g': 0.019303557910673733,
   'h': 0.006813020439061317,
   'i'

In [6]:
def encode_labels(labels):
    """Map each distinct label to a 1-based integer id.

    Ids are assigned in sorted label order, so the mapping does not depend
    on the order in which labels appear in the input.

    Args:
        labels: Iterable of hashable, mutually comparable labels.

    Returns:
        Dict of ``label -> id`` with ids ``1 .. number_of_distinct_labels``.
    """
    ordered = sorted(set(labels))
    return dict(zip(ordered, range(1, len(ordered) + 1)))

In [7]:
# Build the language-code -> integer id lookup from the labels present in the training set.
encoded_labels = encode_labels([language for _features, language in train_data])
encoded_labels

{'en': 1, 'it': 2}

In [8]:
# Spot-check the mapping: look up the integer id assigned to the "it" label.
encoded_labels["it"]

2

In [9]:
# Swap each sample's language code for its integer id.
# NOTE: this rebinds train_data, so the cell is not re-runnable on its own —
# a second run would look up integer ids in encoded_labels (keyed by language
# code) and fail; re-run the loading cell first.
train_data = [
    [features, encoded_labels[language]]
    for features, language in train_data
]
train_data

[[{'a': 0.08530522966792603,
   'b': 0.02704314973155697,
   'c': 0.03619009743487771,
   'd': 0.03559355736726984,
   'e': 0.11712069994034599,
   'f': 0.020680055677072977,
   'g': 0.020083515609465103,
   'h': 0.05786438655796381,
   'i': 0.06323324716643468,
   'j': 0.006164247365281368,
   'k': 0.006959634122091867,
   'l': 0.03976933784052496,
   'm': 0.0208789023662756,
   'n': 0.0618413203420163,
   'o': 0.06402863392324518,
   'p': 0.014316961622588983,
   'q': 0.00019884668920262477,
   'r': 0.07019288128852655,
   's': 0.07178365480214755,
   't': 0.10379797176377013,
   'u': 0.03996818452972758,
   'v': 0.007755020878902366,
   'w': 0.013919268244183734,
   'x': 0.000994233446013124,
   'y': 0.014316961622588983,
   'z': 0.0},
  1],
 [{'a': 0.11355034065102196,
   'b': 0.00984102952308857,
   'c': 0.045041635124905374,
   'd': 0.03368660105980318,
   'e': 0.13626040878122633,
   'f': 0.010219530658591975,
   'g': 0.019303557910673733,
   'h': 0.006813020439061317,
   'i': 0

In [10]:
# Feature matrix: one row of 26 letter shares per document.
# Column order follows the dict's insertion order, which letter_freq fixes as a..z.
X_train = [list(features.values()) for features, _label in train_data]
X_train

[[0.08530522966792603,
  0.02704314973155697,
  0.03619009743487771,
  0.03559355736726984,
  0.11712069994034599,
  0.020680055677072977,
  0.020083515609465103,
  0.05786438655796381,
  0.06323324716643468,
  0.006164247365281368,
  0.006959634122091867,
  0.03976933784052496,
  0.0208789023662756,
  0.0618413203420163,
  0.06402863392324518,
  0.014316961622588983,
  0.00019884668920262477,
  0.07019288128852655,
  0.07178365480214755,
  0.10379797176377013,
  0.03996818452972758,
  0.007755020878902366,
  0.013919268244183734,
  0.000994233446013124,
  0.014316961622588983,
  0.0],
 [0.11355034065102196,
  0.00984102952308857,
  0.045041635124905374,
  0.03368660105980318,
  0.13626040878122633,
  0.010219530658591975,
  0.019303557910673733,
  0.006813020439061317,
  0.10068130204390613,
  0.0,
  0.0,
  0.06699470098410296,
  0.025738077214231644,
  0.06510219530658592,
  0.08516275548826646,
  0.0291445874337623,
  0.002271006813020439,
  0.06926570779712339,
  0.0586676760030280

In [11]:
# Target vector: the integer language id of each document, aligned row-for-row with X_train.
y_train = [target for _features, target in train_data]
y_train

[1, 2, 1, 2, 1, 2]

In [None]:
def train(data, labels, epochs, learning_rate):
    """Fit a single-layer perceptron with a sign activation.

    For every sample the weighted sum ``w . x + bias`` is passed through a
    step function that outputs ``1`` (sum >= 0) or ``-1`` (sum < 0). When the
    prediction differs from the target, weights and bias are moved by
    ``learning_rate * (target - prediction)`` along the sample.

    Args:
        data: Sequence of feature vectors, all of the same length.
        labels: One target per feature vector. Because the activation only
            produces ``-1`` or ``1``, targets must use that same encoding
            for training to settle.
            NOTE(review): the ``y_train`` built earlier in this notebook
            holds 1/2 ids and needs remapping to -1/+1 before being passed
            here.
        epochs: Number of full passes over ``data``.
        learning_rate: Step size applied to each correction.

    Returns:
        ``(weights, bias)``: the learned weight list (one float per feature)
        and the bias term. Empty ``data`` yields ``([], 0.0)``.
    """
    num_features = len(data[0]) if data else 0
    weights = [0.0] * num_features
    bias = 0.0

    def activation(x):
        # Step function: the decision boundary itself counts as the positive class.
        return 1 if x >= 0 else -1

    for epoch in range(epochs):
        for x, y in zip(data, labels):
            output = sum(w * f for w, f in zip(weights, x)) + bias
            prediction = activation(output)

            error = y - prediction
            if error:
                # Misclassified sample: nudge the boundary towards the target.
                step = learning_rate * error
                weights = [w + step * f for w, f in zip(weights, x)]
                bias += step

    return weights, bias

            

In [None]:
# Run training on the training set with epochs=10 and learning_rate=0.1.
# NOTE(review): y_train holds 1/2 ids while train()'s activation emits -1/+1 —
# confirm the label encoding before relying on the result.
train(X_train, y_train, epochs=10, learning_rate=0.1)