In [2]:
# run this cell to import nltk
import nltk
from os import getcwd
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 

In [3]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter 
import re

def remove_special_characters(text):
    pattern = r'[^a-zA-Z\s]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def is_link(word):
    return word.startswith("http")

def flatten(words):
    result = []
    for ws in words:
        for w in ws:
            result.append(w)
    return result

def process_tweet(tweet, stopwords):
    words = tweet.split()
    mask = lambda w : (w not in stopwords and not is_link(w) and not w.startswith("#") and not w.startswith("@") and len(w) > 1)
    words = [remove_special_characters(w) for w in words if mask(w)]
    words = [w for w in words if w != ""]
    porter_stemmer = PorterStemmer()
    stemmed_words = [porter_stemmer.stem(word) for word in words]
    return stemmed_words
    
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')    
stopwords = stopwords.words('english')
ws = [process_tweet(w, stopwords) for w in all_positive_tweets]
ws = [process_tweet(w, stopwords) for w in all_negative_tweets]
positive_ws = flatten(ws)
negative_ws = flatten(ws)
all_words = positive_ws + negative_ws
ds = dict(Counter(all_words))
ds = {k:np.array([v, all_words.index(k)]) for k,v in ds.items()}
X = np.array(list(ds.values())).astype(float)
X[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std()
X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()

zeros = [0]*(len(X)//2)
ones  = [1]*(len(X)//2)
labels = np.array([*ones, *zeros])
ys = labels.reshape(-1,1)
y = ys.astype(int)

In [4]:
def sigmoid(s):
    return 1/(1+np.exp(-s))

def error(y_hat, y):
    errors = (y_hat-y)**2
    return errors.sum() / len(y)

def forward(X, W):
    s = X @ W
    y_pred = sigmoid(s)
    return y_pred

def loss(y_pred, y_):
    return sum((y_pred - y_)**2).item()

def sample(N, K):
    xs = list(range(N))
    return np.random.choice(xs, size=K, replace=False)

def split_train_test(X, y, ratio=0.3):
    N = len(X)
    idx = sample(N, int(N*ratio))
    not_idx = [n for n in range(N) if n not in idx]
    return X[not_idx], y[not_idx], X[idx], y[idx] 

X_train, y_train, X_test, y_test = split_train_test(X, y)
print(f"{sum(y_test)/len(y_test)}%, {sum(y_train)/len(y_train)}%") 

[0.48647125]%, [0.5057943]%


In [None]:
epoch = 100000
W = np.random.random(2).reshape(-1,1)
N = len(X)
ni = 100

for e in range(0,epoch):
    y_pred = forward(X_train, W)
    L_train = loss(y_pred, y_train)
    acc_train = round((y_pred == y_train).sum()/N * 100, 3)
    
    ### Backward ###
    DL_dw = 1/N*sum((y_pred-y_train)*X_train)

    ### Stadt Abstieg ###
    W = W.T - ni*DL_dw
    W = W.reshape(2,1)
    
    ### L test ###
    y_pred = forward(X_test, W)
    L_test = loss(y_pred, y_test)
    acc_test = round((y_pred == y_test).sum()/N * 100, 3)
    
    print(f"{e}, {acc_train}%, train_loss={round(L_train, 3)}, test_loss={round(L_test, 3)}")


In [7]:
import torch
from torch import nn
import torch.nn.functional as F

Xt = torch.from_numpy(X_train).float()
yt = torch.from_numpy(y_train).float()
print(Xt.shape, yt.shape)

linear = nn.Linear(2, 1)
h = linear(Xt)
s = F.sigmoid(h)
loss = nn.CrossEntropyLoss()
Xt

loss(Xt, yt)

torch.Size([4142, 2]) torch.Size([4142, 1])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported