In [303]:
import numpy as np
import pandas as pd
import pickle
import math
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from skorch import NeuralNetClassifier, NeuralNetBinaryClassifier

import json

np.random.seed(42)

torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [362]:
DATASET_COLUMNS = ["target", "text"]

data = pd.read_csv("./data/processed_full.csv", sep=',')
data = data[:30000]
data.columns = DATASET_COLUMNS
data = data[data['text'].notnull()]
data.head(100)

Unnamed: 0,target,text
0,1,tommcfli like amsterdam?
1,1,excit tonight!
2,1,woo! finish game diana go tablet sandwich. awe...
3,0,pleas talk
4,1,shefali morn shefaly..th sun' alway blinding.....
...,...,...
95,1,carlosdejesu figur vid twitter yet. workin' th...
96,0,"sjc0815 well, 2605 briana shay offici gone fir..."
97,0,naturegrrl hard believe. &amp; hard believ see...
98,0,bust-up offic good start week


In [363]:
def tokenize_texts(texts_list):
    all_words = set()
    for text in texts_list:
        words = str(text).split()
        all_words.update(words)

    word_to_index = {word: i for i, word in enumerate(all_words)}

    with open('word_to_index.pkl', 'wb') as f:  # Sözlüğü kaydet
        pickle.dump(word_to_index, f)

    tokenized_texts = []
    for text in texts_list:
        words = text.split()
        tokenized_texts.append([word_to_index[word] for word in words])

    return tokenized_texts, word_to_index

def pad_tokenized_texts(tokenized_texts, max_length=None):
    if not max_length:
        max_length = max([len(text) for text in tokenized_texts])

    padded_texts = []
    for text in tokenized_texts:
        if len(text) < max_length:
            text += [0] * (max_length - len(text))
        padded_texts.append(text)

    return padded_texts

def pad_tokenized_text(tokenized_text, max_length=None):
    if len(tokenized_text) < max_length:
        tokenized_text += [0] * (max_length - len(tokenized_text))

    return tokenized_text

def get_tokenized_sentence(sentence, word_to_index):
    words = sentence.split()
    tokenized_sentence = [word_to_index.get(word, 0) for word in words]
    return tokenized_sentence

texts = data['text']
labels = data['target']

tokenized_texts, word_to_index = tokenize_texts(texts)
padded_texts = pad_tokenized_texts(tokenized_texts)

In [364]:
vocab_size = len(word_to_index) + 1
max_length = len(padded_texts[0])

In [372]:
labels.reset_index(drop=True, inplace=True)

tensor_padded_texts = torch.tensor(padded_texts).long()

labels_tensor = torch.tensor(labels.to_numpy()).float()  # or `.long()` if your labels are integers
reshaped_labels = labels_tensor.view(-1, 1)

In [381]:
class AdvancedModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(AdvancedModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.layer1 = nn.Linear(embedding_dim, 32)
        self.layer4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = self.relu(self.layer1(x))
        x = self.sigmoid(self.layer4(x))
        return x

In [382]:
# Hyperparameters
input_dim = max_length
learning_rate = 0.01
epochs = 1000

# Create the model
model = AdvancedModel(vocab_size, input_dim)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    model.train()
    
    optimizer.zero_grad()
    outputs = model(tensor_padded_texts)
    outputs = torch.mean(outputs, dim=1, keepdim=True)
    outputs = outputs[:, 0, :]

    loss = criterion(outputs, reshaped_labels)
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

model.eval()  # Set the model to evaluation mode

Epoch 1/1000, Loss: 0.6931425929069519
Epoch 2/1000, Loss: 0.6966422200202942
Epoch 3/1000, Loss: 0.6920924782752991
Epoch 4/1000, Loss: 0.6917058825492859
Epoch 5/1000, Loss: 0.6920918822288513
Epoch 6/1000, Loss: 0.6908047199249268
Epoch 7/1000, Loss: 0.6890093684196472
Epoch 8/1000, Loss: 0.6873288154602051
Epoch 9/1000, Loss: 0.6859438419342041
Epoch 10/1000, Loss: 0.6845901012420654
Epoch 11/1000, Loss: 0.6829667091369629
Epoch 12/1000, Loss: 0.6808612942695618
Epoch 13/1000, Loss: 0.6782693862915039
Epoch 14/1000, Loss: 0.6753161549568176
Epoch 15/1000, Loss: 0.6721377372741699
Epoch 16/1000, Loss: 0.6688145995140076
Epoch 17/1000, Loss: 0.6653598546981812
Epoch 18/1000, Loss: 0.6617099642753601
Epoch 19/1000, Loss: 0.657901406288147
Epoch 20/1000, Loss: 0.6540579199790955
Epoch 21/1000, Loss: 0.650206983089447
Epoch 22/1000, Loss: 0.6463309526443481
Epoch 23/1000, Loss: 0.6424964666366577
Epoch 24/1000, Loss: 0.638778567314148
Epoch 25/1000, Loss: 0.6351789832115173
Epoch 26/100

KeyboardInterrupt: 

In [384]:
with torch.no_grad():
    outputs = model(tensor_padded_texts)
    outputs = torch.mean(outputs, dim=1, keepdim=True)
    outputs = outputs[:, 0, :]
    predicted = (outputs >= 0.5).float()
    accuracy = (predicted == reshaped_labels.float()).sum() / len(reshaped_labels)
    
print(f"Training complete. Accuracy: {accuracy.item() * 100}%")

Training complete. Accuracy: 89.1369640827179%


In [402]:
bad_sentence = "dead dead dead fuck fuck die die "
good_sentence = "i have killed to many stupids"
sentence = good_sentence
with open('word_to_index.pkl', 'rb') as f:
    loaded_word_to_index = pickle.load(f)

tokenized_sentence = pad_tokenized_text(get_tokenized_sentence(sentence, loaded_word_to_index), max_length)


In [403]:
single_sentence = np.array(tokenized_sentence).reshape(1, -1)
scaled_single_sentence = scaler.transform(single_sentence)
scaled_single_sentence = torch.tensor(scaled_single_sentence).float()

In [404]:
single_sentence = torch.tensor(scaled_single_sentence).long().unsqueeze(0)
model.eval()
# Tahmin yap
with torch.no_grad():
    outputs = model(single_sentence)
    outputs = torch.mean(outputs, dim=1, keepdim=True)
    outputs = outputs[:, 0, :]
    outputs = torch.mean(outputs, dim=1)
    
# Tahminin sonucunu ikili sınıflandırma için bir eşik değeriyle (threshold) karşılaştır
predicted_label = (outputs >= 0.5).int().item()

print(f"Predicted Label: {predicted_label}")

Predicted Label: 1


  single_sentence = torch.tensor(scaled_single_sentence).long().unsqueeze(0)
