In [1]:
import numpy as np
from tqdm.notebook import tqdm

import gensim
from gensim.models import Word2Vec
import re

import threading

In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

### Load data

In [4]:
from datasets import load_dataset

data_files = {
    "train": "literotica-stories-train.jsonl", 
    "test": "literotica-stories-test.jsonl"
}

stories_dataset = load_dataset("json", data_files=data_files)
stories_dataset

  from pandas.core.computation.check import NUMEXPR_INSTALLED


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 98552
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 24638
    })
})

In [5]:
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')

def preprocess(raw_text):
    lower_text = raw_text.lower()
    words = re.sub(r'[^a-zA-Z]', ' ', lower_text)
    words_list = re.split('\s+', words)
    return [word for word in words_list if word not in stopwords.words('english') and word != '']

In [19]:
def get_train_data(split='train'):
    X, y = [], []
    
    for entry in tqdm(stories_dataset[split]):
        filtered_words = preprocess(entry['text'])
        vectors = np.fromiter(map(lambda word: wv[word] if word in wv else np.zeros(300), filtered_words), dtype=np.ndarray)
        story_embedding = vectors.mean(axis=0)
        X.append(story_embedding)
        y.append(entry['label'])
            
        
    return X, y

In [20]:
X_train, y_train = get_train_data(split='train')
X_test, y_test = get_train_data(split='test')

  0%|          | 0/98552 [00:00<?, ?it/s]

  0%|          | 0/24638 [00:00<?, ?it/s]

### Train baseline model

In [35]:
from sklearn.linear_model import LogisticRegression

X_train, y_train = np.array(X_train), np.array(y_train)

model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

### Evaluate model

In [41]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score

X_test, y_test = np.array(X_test), np.array(y_test)

y_pred = model.predict(X_test)

print('Precision: ', precision_score(y_pred, y_test, average='weighted'))
print('Recall: ', recall_score(y_pred, y_test, average='weighted'))
print('Accuracy: ', accuracy_score(y_pred, y_test))

Precision:  0.7167109704749532
Recall:  0.450487012987013
Accuracy:  0.450487012987013
