In [1]:
# Ucitaj numpy, pandas i re
import numpy as np
import pandas as pd
import re

In [2]:
# Ucitaj trening podatke
data_train = pd.read_csv('train.csv')

# Postavi nazive stupaca (posto ih nema)
data_train.columns = ['Rating', 'Title', 'Review']

# Izbaci stupac Title
data_train = data_train.drop(columns=['Title'])

# Nasumicno uzmi 2.5% podataka
data_train = data_train.sample(frac=0.0002)

# Smanji sav Rating za 1 (1, 2) -> (0, 1)
data_train['Rating'] = data_train['Rating'].apply(lambda x: x - 1)

# Resetiraj index
data_train = data_train.reset_index(drop=True)

# Prikazi podatke
print(data_train.head())

   Rating                                             Review
0       0  I got this for my daughter when she was one. S...
1       0  The CD's arrival took longer than expected may...
2       0  The waffles stuck so bad that I had to turn th...
3       0  As an avid student of Chaos Theory and Fractal...
4       0  The ONLY reason I gave this one star is becaus...


In [3]:
# Funkcija za stvaranje bag-of-words vektora
def calculateBOW(words, sentence):
    vector = dict.fromkeys(words, 0)
    for word in sentence:
        vector[word] = sentence.count(word)
    return vector

# Funkcija koja postavlja recenice u lowercase, mice interpunkcijske znakove i stvara matricu rijeci svake recenice
def transform_data(data):
    sentences = []
    for i in data:
        sentences.append(re.sub(r"[^a-zA-Z0-9]", " ", i.lower()).split())
    return sentences

# Funkcija koja vraca unikatne rijeci
def get_unique_words(sentences):
    vocabulary = set()
    for i in sentences:
        for j in i:
            vocabulary.add(j)
    return vocabulary

# Rasclani recenice na rijeci
sentences_train = transform_data(data_train['Review'])

# Stvori popis unikatnih rijeci
vocabulary = get_unique_words(sentences_train)

# Ispis koliko ima unikatnih rijeci
print(f'Unique words: {len(vocabulary)}')

# Izracunaj bag-of-words za svaku recenicu
bow = []
for i in sentences_train:
    bow.append(calculateBOW(vocabulary, i))

# Iz liste bow stvori DataFrame objekt
data_reviews = pd.DataFrame(bow)

# Ispisi dimenzije podataka
print(f'Data shape: {data_reviews.shape}')

Unique words: 7628
Data shape: (720, 7628)


In [4]:
# Pretvori pandas DataFrame u numpy array
X_train = data_reviews.to_numpy()[:int(data_reviews.shape[0] * 0.7)]
y_train = data_train['Rating'].to_numpy()[:int(data_reviews.shape[0] * 0.7)].reshape(-1, 1)

X_test = data_reviews.to_numpy()[int(data_reviews.shape[0] * 0.7):]
y_test = data_train['Rating'].to_numpy()[int(data_reviews.shape[0] * 0.7):].reshape(-1, 1)

# Ispisi dimenzije podataka
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (503, 7628), y_train shape: (503, 1)
X_test shape: (217, 7628), y_test shape: (217, 1)


In [5]:
# Funkcija logisticke regresije za predvidanje
def lr_h(x, w):
    x = np.append(np.ones((x.shape[0], 1)), x, axis=1)
    return 1 / (1 + np.exp(-1 * x @ w))

# Funkcija pogreske logisticke regresije
def cross_entropy_error(X, y, w):
    return np.mean(-y.reshape(-1, 1) * np.log(np.clip(lr_h(X, w), 1e-7, 1 - 1e-7)) - (1 - y.reshape(-1, 1)) * np.log(1 - np.clip(lr_h(X, w), 1e-7, 1 - 1e-7)))

# Funkcija kojom se trenira model logisticke regresije
def lr_train(X, y, eta=0.01, max_iter=2000, alpha=0, epsilon=0.0001, trace=False):
    
    weight_matrix = []
    last_error = 0
    w = np.zeros((X.shape[1] + 1, 1))

    for i in range(max_iter + 1):

        weight_matrix.append(w.copy())

        cur_error = cross_entropy_error(X, y, w)
        if np.abs(cur_error - last_error) < epsilon:
            break

        dw = np.zeros((X.shape[1] + 1, 1))

        for j in range(X.shape[0]):

            h = lr_h(X[j].reshape(1, -1), w)
            dw = dw - (h - y[j].reshape(1, 1)) * np.append(1, X[j]).reshape(-1, 1)

        w[0] = w[0] + eta * dw[0]
        w[1:] = w[1:] * (1 - eta * alpha) + eta * dw[1:]

    weight_matrix = np.array(weight_matrix)

    if trace:
        return w, weight_matrix
    else:
        return w

In [8]:
# Treniraj model, vrati tezine
weights = lr_train(X_train, y_train, max_iter=2000, alpha=0.3)

print(lr_h(X_train, weights).round()[-5:])
print(y_train[-5:])

# Ispis pogreske
print(f'Error: {cross_entropy_error(X_train, y_train, weights)}')

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
[[0]
 [0]
 [0]
 [0]
 [0]]
Error: 0.015027815598625678


In [9]:
print(lr_h(X_test, weights).round()[:5])
print(y_test[:5])

# Ispis pogreske
print(f'Error: {cross_entropy_error(X_test, y_test, weights)}')

[[1.]
 [1.]
 [1.]
 [0.]
 [1.]]
[[1]
 [1]
 [0]
 [0]
 [0]]
Error: 0.9644855486502654
