In [1]:
# run this cell to import nltk
import nltk
from os import getcwd
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples 

In [2]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter 
import re

def remove_special_characters(text):
    pattern = r'[^a-zA-Z\s]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

def is_link(word):
    return word.startswith("http")

def flatten(words):
    result = []
    for ws in words:
        for w in ws:
            result.append(w)
    return result

def process_tweet(tweet, stopwords):
    words = tweet.split()
    mask = lambda w : (w not in stopwords and not is_link(w) and not w.startswith("#") and not w.startswith("@") and len(w) > 1)
    words = [remove_special_characters(w) for w in words if mask(w)]
    words = [w for w in words if w != ""]
    porter_stemmer = PorterStemmer()
    stemmed_words = [porter_stemmer.stem(word) for word in words]
    return stemmed_words

def normalize(X):
    X[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std()
    X[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()
    return X

    
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')    
stopwords = stopwords.words('english')

ws = [process_tweet(w, stopwords) for w in all_positive_tweets]
ws = [process_tweet(w, stopwords) for w in all_negative_tweets]
positive_ws = flatten(ws)
negative_ws = flatten(ws)
all_words = positive_ws + negative_ws



ds = dict(Counter(all_words))
ds = {k:np.array([v, all_words.index(k)]) for k,v in ds.items()}
X = np.array(list(ds.values())).astype(float)

X  = normalize(X)

zeros = [0]*(len(X)//2)
ones  = [1]*(len(X)//2)
labels = np.array([*ones, *zeros])
ys = labels.reshape(-1,1)
y = ys.astype(int)

In [3]:
def remove_special_characters(text):
    pattern = r'[^a-zA-Z\s]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text

remove_special_characters("#asdas")

'asdas'

In [4]:
S = " #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)"
process_tweet(S, all_words)

['be', 'engag', 'member', 'commun', 'thi']

In [5]:
ds = dict(Counter(all_words))
ds

{'hopeless': 4,
 'tmr': 6,
 'everyth': 34,
 'kid': 40,
 'section': 6,
 'ikea': 2,
 'cute': 86,
 'shame': 36,
 'im': 950,
 'nearli': 6,
 'month': 44,
 'that': 310,
 'heart': 54,
 'slide': 2,
 'wast': 10,
 'basket': 2,
 'ketchburn': 2,
 'hate': 114,
 'japanes': 8,
 'call': 58,
 'bani': 4,
 'me': 476,
 'dang': 4,
 'start': 86,
 'next': 80,
 'week': 114,
 'work': 264,
 'oh': 184,
 'god': 30,
 'babi': 94,
 'face': 40,
 'make': 204,
 'smile': 20,
 'neighbour': 2,
 'motor': 2,
 'ask': 58,
 'said': 66,
 'updat': 22,
 'search': 6,
 'whytahuodyy': 2,
 'sialan': 2,
 'athabasca': 2,
 'glacier': 2,
 'realli': 264,
 'good': 200,
 'mampg': 4,
 'idea': 20,
 'never': 114,
 'go': 446,
 'meet': 64,
 'mare': 2,
 'ivan': 2,
 'happi': 50,
 'trip': 20,
 'keep': 68,
 'safe': 10,
 'see': 250,
 'soon': 90,
 'tire': 100,
 'hahahah': 6,
 'with': 24,
 'knee': 4,
 'replac': 8,
 'get': 466,
 'amp': 188,
 'day': 300,
 'ouch': 6,
 'relat': 4,
 'sweet': 14,
 'n': 2,
 'sour': 4,
 'kind': 22,
 'bipolar': 2,
 'peopl': 150

In [6]:
def sigmoid(s):
    return 1/(1+np.exp(-s))

def error(y_hat, y):
    errors = (y_hat-y)**2
    return errors.sum() / len(y)

def forward(X, W):
    s = X @ W
    y_pred = sigmoid(s)
    return y_pred

def loss(y_pred, y_):
    return sum((y_pred - y_)**2).item()

def sample(N, K):
    xs = list(range(N))
    return np.random.choice(xs, size=K, replace=False)

def split_train_test(X, y, ratio=0.3):
    N = len(X)
    idx = sample(N, int(N*ratio))
    not_idx = [n for n in range(N) if n not in idx]
    return X[not_idx], y[not_idx], X[idx], y[idx] 

X_train, y_train, X_test, y_test = split_train_test(X, y)
print(f"{sum(y_test)/len(y_test)}%, {sum(y_train)/len(y_train)}%") 

epoch = 100000
W = np.random.random(2).reshape(-1,1)
N = len(X)
ni = 1

for e in range(0,epoch):
    y_pred = forward(X_train, W)
    L_train = loss(y_pred, y_train)
    acc_train = round((y_pred == y_train).sum()/N * 100, 3)
    
    ### Backward ###
    DL_dw = 1/N*sum((y_pred-y_train)*X_train)

    ### Stadt Abstieg ###
    W = W.T - ni*DL_dw
    W = W.reshape(2,1)
    
    ### L test ###
    y_pred = forward(X_test, W)
    L_test = loss(y_pred, y_test)
    acc_test = round((y_pred == y_test).sum()/N * 100, 3)
    
    print(f"{e}, {acc_train}%, train_loss={round(L_train, 3)}, test_loss={round(L_test, 3)}")



[0.50845547]%, [0.49637856]%
0, 0.0%, train_loss=1691.194, test_loss=549.814
1, 0.0%, train_loss=1282.828, test_loss=416.75
2, 0.0%, train_loss=970.366, test_loss=325.525
3, 0.0%, train_loss=757.12, test_loss=264.747
4, 0.0%, train_loss=615.794, test_loss=223.411
5, 0.0%, train_loss=520.077, test_loss=194.245
6, 0.0%, train_loss=452.702, test_loss=172.877
7, 0.0%, train_loss=403.384, test_loss=156.69
8, 0.0%, train_loss=366.014, test_loss=144.076
9, 0.0%, train_loss=336.853, test_loss=134.006
10, 0.0%, train_loss=313.53, test_loss=125.803
11, 0.0%, train_loss=294.487, test_loss=119.005
12, 0.0%, train_loss=278.661, test_loss=113.287
13, 0.0%, train_loss=265.311, test_loss=108.415
14, 0.0%, train_loss=253.902, test_loss=104.219
15, 0.0%, train_loss=244.042, test_loss=100.569
16, 0.0%, train_loss=235.437, test_loss=97.366
17, 0.0%, train_loss=227.86, test_loss=94.534
18, 0.0%, train_loss=221.139, test_loss=92.013
19, 0.0%, train_loss=215.134, test_loss=89.754
20, 0.0%, train_loss=209.737

In [552]:
import torch
from torch import nn
import torch.nn.functional as F

Xt = torch.from_numpy(X_train).float()
yt = torch.from_numpy(y_train).float()
print(Xt.shape, yt.shape)

linear = nn.Linear(2, 1)
h = linear(Xt)
s = F.sigmoid(h)
loss = nn.CrossEntropyLoss()
Xt

loss(Xt, yt)

torch.Size([4142, 2]) torch.Size([4142, 1])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported