In [1]:
# run this cell to import nltk
import nltk
from os import getcwd
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 

In [6]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter 
import re

def remove_special_characters(text):
    pattern = r'[^a-zA-Z\s]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def is_link(word):
    return word.startswith("http")

def flatten(words):
    result = []
    for ws in words:
        for w in ws:
            result.append(w)
    return result

def process_tweet(tweet, stopwords):
    words = tweet.split()
    mask = lambda w : (w not in stopwords and not is_link(w) and not w.startswith("#") and not w.startswith("@") and len(w) > 1)
    words = [remove_special_characters(w) for w in words if mask(w)]
    words = [w for w in words if w != ""]
    porter_stemmer = PorterStemmer()
    stemmed_words = [porter_stemmer.stem(word) for word in words]
    return stemmed_words

def normalize(X):
    X[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std()
    X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()
    return X

    
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')    
stopwords = stopwords.words('english')

ws = [process_tweet(w, stopwords) for w in all_positive_tweets]
ws = [process_tweet(w, stopwords) for w in all_negative_tweets]
positive_ws = flatten(ws)
negative_ws = flatten(ws)
all_words = positive_ws + negative_ws



ds = dict(Counter(all_words))
ds = {k:np.array([v, all_words.index(k)]) for k,v in ds.items()}
X = np.array(list(ds.values())).astype(float)

X  = normalize(X)

zeros = [0]*(len(X)//2)
ones  = [1]*(len(X)//2)
labels = np.array([*ones, *zeros])
ys = labels.reshape(-1,1)
y = ys.astype(int)

SyntaxError: expected ':' (1434039569.py, line 30)

In [5]:
ds

{'hopeless': array([4, 0]),
 'tmr': array([6, 1]),
 'everyth': array([34,  2]),
 'kid': array([40,  3]),
 'section': array([6, 4]),
 'ikea': array([2, 5]),
 'cute': array([86,  6]),
 'shame': array([36,  7]),
 'im': array([950,   8]),
 'nearli': array([6, 9]),
 'month': array([44, 10]),
 'that': array([310,  11]),
 'heart': array([54, 12]),
 'slide': array([ 2, 13]),
 'wast': array([10, 14]),
 'basket': array([ 2, 15]),
 'ketchburn': array([ 2, 16]),
 'hate': array([114,  17]),
 'japanes': array([ 8, 18]),
 'call': array([58, 19]),
 'bani': array([ 4, 20]),
 'me': array([476,  21]),
 'dang': array([ 4, 22]),
 'start': array([86, 23]),
 'next': array([80, 24]),
 'week': array([114,  25]),
 'work': array([264,  26]),
 'oh': array([184,  27]),
 'god': array([30, 28]),
 'babi': array([94, 29]),
 'face': array([40, 30]),
 'make': array([204,  31]),
 'smile': array([20, 32]),
 'neighbour': array([ 2, 34]),
 'motor': array([ 2, 35]),
 'ask': array([58, 36]),
 'said': array([66, 37]),
 'updat'

In [549]:
def sigmoid(s):
    return 1/(1+np.exp(-s))

def error(y_hat, y):
    errors = (y_hat-y)**2
    return errors.sum() / len(y)

def forward(X, W):
    s = X @ W
    y_pred = sigmoid(s)
    return y_pred

def loss(y_pred, y_):
    return sum((y_pred - y_)**2).item()

def sample(N, K):
    xs = list(range(N))
    return np.random.choice(xs, size=K, replace=False)

def split_train_test(X, y, ratio=0.3):
    N = len(X)
    idx = sample(N, int(N*ratio))
    not_idx = [n for n in range(N) if n not in idx]
    return X[not_idx], y[not_idx], X[idx], y[idx] 

X_train, y_train, X_test, y_test = split_train_test(X, y)
print(f"{sum(y_test)/len(y_test)}%, {sum(y_train)/len(y_train)}%") 

[0.49436302]%, [0.50241429]%


In [551]:
epoch = 100000
W = np.random.random(2).reshape(-1,1)
N = len(X)
ni = 100

for e in range(0,epoch):
    y_pred = forward(X_train, W)
    L_train = loss(y_pred, y_train)
    acc_train = round((y_pred == y_train).sum()/N * 100, 3)
    
    ### Backward ###
    DL_dw = 1/N*sum((y_pred-y_train)*X_train)

    ### Stadt Abstieg ###
    W = W.T - ni*DL_dw
    W = W.reshape(2,1)
    
    ### L test ###
    y_pred = forward(X_test, W)
    L_test = loss(y_pred, y_test)
    acc_test = round((y_pred == y_test).sum()/N * 100, 3)
    
    print(f"{e}, {acc_train}%, train_loss={round(L_train, 3)}, test_loss={round(L_test, 3)}")



0, 0.0%, train_loss=1393.648, test_loss=67.586
1, 13.404%, train_loss=180.074, test_loss=66.171
2, 12.796%, train_loss=177.185, test_loss=64.758
3, 12.086%, train_loss=174.267, test_loss=63.347
4, 11.511%, train_loss=171.319, test_loss=61.939
5, 10.97%, train_loss=168.341, test_loss=60.538
6, 10.446%, train_loss=165.332, test_loss=59.142
7, 9.99%, train_loss=162.294, test_loss=57.753
8, 9.398%, train_loss=159.227, test_loss=56.371
9, 8.79%, train_loss=156.134, test_loss=54.997
10, 8.249%, train_loss=153.016, test_loss=53.63
11, 7.488%, train_loss=149.878, test_loss=52.272
12, 6.575%, train_loss=146.721, test_loss=50.923
13, 5.984%, train_loss=143.551, test_loss=49.584
14, 5.646%, train_loss=140.372, test_loss=48.256
15, 5.291%, train_loss=137.187, test_loss=46.939
16, 5.156%, train_loss=134.001, test_loss=45.637
17, 4.868%, train_loss=130.818, test_loss=44.35
18, 4.716%, train_loss=127.642, test_loss=43.08
19, 4.581%, train_loss=124.479, test_loss=41.83
20, 4.496%, train_loss=121.332, 

KeyboardInterrupt: 

In [552]:
import torch
from torch import nn
import torch.nn.functional as F

Xt = torch.from_numpy(X_train).float()
yt = torch.from_numpy(y_train).float()
print(Xt.shape, yt.shape)

linear = nn.Linear(2, 1)
h = linear(Xt)
s = F.sigmoid(h)
loss = nn.CrossEntropyLoss()
Xt

loss(Xt, yt)

torch.Size([4142, 2]) torch.Size([4142, 1])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported