In [1]:
# Ucitaj numpy, pandas i re
import numpy as np
import pandas as pd
import re

In [2]:
# Ucitaj trening podatke
data_train = pd.read_csv('500train.csv')

data_train.columns = ['Rating', 'Title', 'Review']

data_train = data_train.drop(columns=['Title'])

data_train['Rating'] = data_train['Rating'].apply(lambda x: x - 1)

# Prikazi podatke
data_train.head()

Unnamed: 0,Rating,Review
0,1,I'm reading a lot of reviews saying that this ...
1,1,This soundtrack is my favorite music of all ti...
2,1,I truly like this soundtrack and I enjoy video...
3,1,"If you've played the game, you know how divine..."
4,1,I am quite sure any of you actually taking the...


In [3]:
# Funkcija za stvaranje bag-of-words vektora
def calculateBOW(words, sentence):
    vector = dict.fromkeys(words, 0)
    for word in sentence:
        vector[word] = sentence.count(word)
    return vector

# Funkcija koja postavlja recenice u lowercase, mice interpunkcijske znakove i stvara matricu rijeci svake recenice
def transform_data(data):
    sentences = []
    for i in data:
        sentences.append(re.sub(r"[^a-zA-Z0-9]", " ", i.lower()).split())
    return sentences

# Funkcija koja vraca unikatne rijeci
def get_unique_words(sentences):
    vocabulary = set()
    for i in sentences:
        for j in i:
            vocabulary.add(j)
    return vocabulary

# Rasclani recenice na rijeci
sentences_train = transform_data(data_train['Review'])

# Stvori popis unikatnih rijeci
vocabulary = get_unique_words(sentences_train)

# Ispis koliko ima unikatnih rijeci
print(f'Unique words: {len(vocabulary)}')

# Izracunaj bag-of-words za svaku recenicu
bow = []
for i in sentences_train:
    bow.append(calculateBOW(vocabulary, i))

# Iz liste bow stvori DataFrame objekt
data_reviews = pd.DataFrame(bow)

# Ispisi dimenzije podataka
print(f'Data shape: {data_reviews.shape}')

Unique words: 5712
Data shape: (499, 5712)


In [4]:


# Pretvori pandas DataFrame u numpy array
X_train = data_reviews.to_numpy()[:int(data_reviews.shape[0] * 0.7)]
y_train = data_train['Rating'].to_numpy()[:int(data_reviews.shape[0] * 0.7)].reshape(-1, 1)

X_test = data_reviews.to_numpy()[int(data_reviews.shape[0] * 0.7):]
y_test = data_train['Rating'].to_numpy()[int(data_reviews.shape[0] * 0.7):].reshape(-1, 1)

# Ispisi dimenzije podataka
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (349, 5712), y_train shape: (349, 1)
X_test shape: (150, 5712), y_test shape: (150, 1)


In [5]:
# Funkcija logisticke regresije za predvidanje
def lr_h(x, w):
    x = np.append(np.ones((x.shape[0], 1)), x, axis=1)
    return 1 / (1 + np.exp(-1 * x @ w))

# Funkcija pogreske logisticke regresije
def cross_entropy_error(X, y, w):
    return np.mean(-y.reshape(-1, 1) * np.log(np.clip(lr_h(X, w), 1e-7, 1 - 1e-7)) - (1 - y.reshape(-1, 1)) * np.log(1 - np.clip(lr_h(X, w), 1e-7, 1 - 1e-7)))

# Funkcija kojom se trenira model logisticke regresije
def lr_train(X, y, eta=0.01, max_iter=2000, alpha=0, epsilon=0.0001, trace=False):
    
    weight_matrix = []
    last_error = 0
    w = np.zeros((X.shape[1] + 1, 1))

    for i in range(max_iter + 1):

        weight_matrix.append(w.copy())

        cur_error = cross_entropy_error(X, y, w)
        if np.abs(cur_error - last_error) < epsilon:
            break

        dw = np.zeros((X.shape[1] + 1, 1))

        for j in range(X.shape[0]):

            h = lr_h(X[j].reshape(1, -1), w)
            dw = dw - (h - y[j].reshape(1, 1)) * np.append(1, X[j]).reshape(-1, 1)

        w[0] = w[0] + eta * dw[0]
        w[1:] = w[1:] * (1 - eta * alpha) + eta * dw[1:]

    weight_matrix = np.array(weight_matrix)

    if trace:
        return w, weight_matrix
    else:
        return w

In [6]:
# Treniraj model, vrati tezine
weights = lr_train(X_train, y_train, max_iter=200, alpha=0.5)

# Ispis pogreske
print(f'Error: {cross_entropy_error(X_train, y_train, weights)}')
print(f'Accuracy: {np.mean(lr_h(X_train, weights).round() == y_train)}')

Error: 0.01232794718740838
Accuracy: 1.0


In [7]:
# Ispis pogreske
print(f'Error: {cross_entropy_error(X_test, y_test, weights)}')
print(f'Accuracy: {np.mean(lr_h(X_test, weights).round() == y_test)}')

Error: 1.899304830065305
Accuracy: 0.6133333333333333
