In [1]:
import numpy as np
import pandas as pd
import pickle
import math
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

from skorch import NeuralNetClassifier, NeuralNetBinaryClassifier, NeuralNetRegressor

import json
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re


np.random.seed(42)

torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [2]:
DATASET_COLUMNS = ["target", "text"]

data = pd.read_csv("./data/processed_full.csv", sep=',')
data = data[:30000]
data.columns = DATASET_COLUMNS
data = data[data['text'].notnull()]
data.head(10)

Unnamed: 0,target,text
0,1,tommcfli like amsterdam?
1,1,excit tonight!
2,1,woo! finish game diana go tablet sandwich. awe...
3,0,pleas talk
4,1,shefali morn shefaly..th sun' alway blinding.....
5,1,sammi gone like ever. twitter facebook
6,1,leav chicago month
7,0,ilovemyego see...th phone rang right...and ans...
8,1,think today
9,0,casper1201 bracelet broke today too.


In [3]:
def tokenize_texts(texts_list):
    all_words = set()
    for text in texts_list:
        words = str(text).split()
        all_words.update(words)

    word_to_index = {word: i for i, word in enumerate(all_words)}

    with open('word_to_index.pkl', 'wb') as f:  # Sözlüğü kaydet
        pickle.dump(word_to_index, f)

    tokenized_texts = []
    for text in texts_list:
        words = text.split()
        tokenized_texts.append([word_to_index[word] for word in words])

    return tokenized_texts, word_to_index

def pad_tokenized_texts(tokenized_texts, max_length=None):
    if not max_length:
        max_length = max([len(text) for text in tokenized_texts])

    padded_texts = []
    for text in tokenized_texts:
        if len(text) < max_length:
            text += [0] * (max_length - len(text))
        padded_texts.append(text)

    return padded_texts

def pad_tokenized_text(tokenized_text, max_length=None):
    if len(tokenized_text) < max_length:
        tokenized_text += [0] * (max_length - len(tokenized_text))

    return tokenized_text

def get_tokenized_sentence(sentence, word_to_index):
    words = sentence.split()
    tokenized_sentence = [word_to_index.get(word, 0) for word in words]
    return tokenized_sentence

def preprocess_sentence(sentence):
    stemmer = PorterStemmer()
    
    # Twitter mentions, URLs, and non-alphabetic characters removal
    sentence = re.sub("@", "", sentence)
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    
    # Convert to lowercase and remove stopwords
    sentence = sentence.lower()
    sentence = ' '.join([word for word in sentence.split() if word not in stopwords.words('english')])
    
    # Remove words with less than 4 characters
    sentence = ' '.join([w for w in sentence.split() if len(w) > 3])
    
    # Stemming
    sentence = ' '.join([stemmer.stem(w) for w in sentence.split()])
    
    return sentence

texts = data['text']
labels = data['target']

tokenized_texts, word_to_index = tokenize_texts(texts)
padded_texts = pad_tokenized_texts(tokenized_texts)

In [4]:
vocab_size = len(word_to_index) + 1
max_length = len(padded_texts[0])

In [5]:
labels.reset_index(drop=True, inplace=True)

tensor_padded_texts = torch.tensor(padded_texts)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler((0, 1))
scaled_data = scaler.fit_transform(tensor_padded_texts.numpy())
tensor_padded_texts = torch.tensor(scaled_data).float()

labels_tensor = torch.tensor(labels.to_numpy()).long()  # or `.long()` if your labels are integers
#reshaped_labels = labels_tensor.view(-1, 1).float()

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

class AdvancedModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(AdvancedModel, self).__init__()
        self.layer1 = nn.Linear(embedding_dim, 10)
        #self.layer2 = nn.Linear(100, 32)
        self.layer5 = nn.Linear(10, 2)  # 2 output units for 2 classes
        self.layers = [self.layer1, self.layer5]
        self.relu = nn.ReLU()
        # Sigmoid is removed because CrossEntropyLoss includes softmax

    def forward(self, x):
        x = self.relu(self.layer1(x))
        #x = self.relu(self.layer2(x))
        x = self.layer5(x)  # Removed sigmoid
        return x

input_dim = max_length
learning_rate = 0.01
epochs = 5000

# Create the model
model = AdvancedModel(vocab_size, input_dim)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # Changed to CrossEntropyLoss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
outputs = None
for epoch in range(epochs):
    model.train()
    
    optimizer.zero_grad()
    outputs = model(tensor_padded_texts)
    
    loss = criterion(outputs, labels_tensor)  # Assuming reshaped_labels are long tensor containing the indices of classes
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

Epoch 1/5000, Loss: 0.6957759857177734
Epoch 2/5000, Loss: 0.6940717697143555
Epoch 3/5000, Loss: 0.6934428215026855
Epoch 4/5000, Loss: 0.6937252879142761
Epoch 5/5000, Loss: 0.6937609910964966
Epoch 6/5000, Loss: 0.6934735774993896
Epoch 7/5000, Loss: 0.6931800842285156
Epoch 8/5000, Loss: 0.6930227279663086
Epoch 9/5000, Loss: 0.6929871439933777
Epoch 10/5000, Loss: 0.6930117011070251
Epoch 11/5000, Loss: 0.6930342316627502
Epoch 12/5000, Loss: 0.693031907081604
Epoch 13/5000, Loss: 0.6929987072944641
Epoch 14/5000, Loss: 0.6929458975791931
Epoch 15/5000, Loss: 0.692891001701355
Epoch 16/5000, Loss: 0.6928473114967346
Epoch 17/5000, Loss: 0.6928202509880066
Epoch 18/5000, Loss: 0.6928073167800903
Epoch 19/5000, Loss: 0.6927987933158875
Epoch 20/5000, Loss: 0.6927856206893921
Epoch 21/5000, Loss: 0.6927633881568909
Epoch 22/5000, Loss: 0.6927372813224792
Epoch 23/5000, Loss: 0.6927076578140259
Epoch 24/5000, Loss: 0.6926801204681396
Epoch 25/5000, Loss: 0.6926566958427429
Epoch 26/50

KeyboardInterrupt: 

In [7]:
with torch.no_grad():
    model.eval()
    outputs = model(tensor_padded_texts)
    _, predicted_class = torch.max(outputs, 1)  # Argmax along dimension 1
    
    correct_count = (predicted_class == labels_tensor).sum().item()
    accuracy = correct_count / len(labels_tensor) * 100  # Calculate the accuracy percentage

print(f"Training complete. Accuracy: {accuracy}%")

Training complete. Accuracy: 53.21385662795355%


In [8]:
bad_sentence = "dead dead dead fuck fuck die die "
good_sentence = "lets plant tree everywhere with love"
good_sentence2 = "i want to hugging all of you with love."
sentence = preprocess_sentence(bad_sentence)
print(sentence)
with open('word_to_index.pkl', 'rb') as f:
    loaded_word_to_index = pickle.load(f)

tokenized_sentence = pad_tokenized_text(get_tokenized_sentence(sentence, loaded_word_to_index), max_length)

dead dead dead fuck fuck


In [9]:
single_sentence = np.array(tokenized_sentence).reshape(1, -1)
scaled_single_sentence = scaler.transform(single_sentence)
scaled_single_sentence = torch.tensor(scaled_single_sentence).float()

In [10]:
single_sentence = torch.tensor(scaled_single_sentence).float().unsqueeze(0)
model.eval()
# Tahmin yap
with torch.no_grad():
    outputs = model(single_sentence)
    
# Tahminin sonucunu ikili sınıflandırma için bir eşik değeriyle (threshold) karşılaştır
predicted_label = (outputs >= 0.5).int().item()

print(f"Predicted Label: {predicted_label}")

  single_sentence = torch.tensor(scaled_single_sentence).float().unsqueeze(0)


RuntimeError: a Tensor with 2 elements cannot be converted to Scalar

In [11]:
base_scaling_factor = 10**4
model_name='model_1'

model_data = {}

model_json = {}
layers = model.layers
num_layers = len(layers)
for i in range(num_layers):
    weight_key = f"layers.{i}.weight"
    bias_key = f"layers.{i}.bias"
    
    layer_weights = layers[i].weight
    layer_bias = layers[i].bias
    
    layer_weights = layer_weights.detach().numpy().flatten()
    layer_bias = layer_bias.detach().numpy().flatten()
    
    print(len(layer_weights))
    print(len(layer_bias))
    
    model_json[f"w{i+1}"] = (layer_weights * base_scaling_factor).round().astype(int).tolist()
    model_json[f"b{i+1}"] = (layer_bias * base_scaling_factor ** (i + 1)).round().astype(int).tolist()
# Save to JSON
with open(f"{model_name}_parameters.json", "w") as f:
    json.dump(model_json, f, indent=4)

print(f"Saved model parameters at {model_name}_parameters.json")

270
10
20
2
Saved model parameters at model_1_parameters.json


In [13]:
n_samples = 10

samples_json = {}

for (i, sample) in enumerate(zip(tensor_padded_texts[:n_samples], labels_tensor[:n_samples])):
    samples_json[f"in{i+1}"] = (sample[0]*(base_scaling_factor)).round().to(torch.int64).tolist()
    samples_json[f"out{i+1}"] = sample[1].tolist()
    
# Save to JSON
with open(f"{model_name}_samples.json", "w") as f:
    print(samples_json)
    json.dump(samples_json, f, indent=4)

print(f"Saved test samples at {model_name}_samples.json")

{'in1': [2880, 8656, 6076, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out1': 1, 'in2': [5200, 3232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out2': 1, 'in3': [5323, 7657, 1579, 8762, 3656, 2388, 1432, 3693, 7828, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out3': 1, 'in4': [8267, 1109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out4': 0, 'in5': [5523, 1394, 328, 3732, 2845, 5311, 2542, 4050, 3781, 5257, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out5': 1, 'in6': [4806, 1978, 8656, 2933, 3367, 5540, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out6': 1, 'in7': [1934, 335, 718, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out7': 1, 'in8': [2649, 4718, 952, 1824, 3485, 9184, 8222, 4005, 4606, 759, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'out8': 0, 'in9': [7939, 3921, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,