In [None]:
import nltk
import numpy as np

from nltk.corpus import movie_reviews

# Make sure the movie_reviews corpus is available locally before reading it.
nltk.download('movie_reviews')

NUM_SAMPLES = 1000
fileids = np.asarray(movie_reviews.fileids())

# Select NUM_SAMPLES // 2 positions from the start of the file list and the
# same number from its end.
# NOTE(review): presumably the two ends of the list hold different classes,
# which would make the sample balanced -- confirm with the label counts
# printed once the reviews are loaded.
idx = np.concatenate((
    np.arange(0, NUM_SAMPLES // 2),
    np.arange(fileids.shape[0] - NUM_SAMPLES // 2, fileids.shape[0]),
))

print(fileids[idx])

In [None]:
from collections import Counter
from nltk.corpus import movie_reviews

# Load the raw text of every selected review. The label is the part of the
# file id that precedes the first '/'.
reviews = [movie_reviews.raw(selected_id) for selected_id in fileids[idx]]
labels = [selected_id.split('/')[0] for selected_id in fileids[idx]]

# Spot-check one example and show how many reviews carry each label.
print(reviews[2])
print(labels[2])
print(Counter(labels))

In [None]:
import os

from openai import OpenAI

# The API key is read from ~/.keys/openai rather than being hard-coded in the
# notebook.
key_path = os.path.join(os.path.expanduser('~'), '.keys', 'openai')

with open(key_path, 'r') as f:
    # Strip surrounding whitespace: key files usually end with a newline, and
    # a newline inside the key would be sent as part of the Authorization
    # header and make the request fail.
    key = f.read().strip()

client = OpenAI(api_key=key)

In [None]:
from openai import OpenAI
from tqdm import tqdm



def embed(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced with single spaces before the request
    is sent. One request is issued per call, through the notebook-level
    ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# Embed every review in order; tqdm wraps the iterable to show progress.
# Each review triggers its own call to embed().
embeddings = list(map(embed, tqdm(reviews)))


In [None]:
# Seed NumPy's global random state so the shuffle below is repeatable.
np.random.seed(42)

X = np.asarray(embeddings)
y = np.asarray(labels)
print(X.shape)

# Apply one shared permutation to features and labels so rows stay aligned.
shuffled_idx = np.random.permutation(np.arange(len(X)))
X, y = X[shuffled_idx], y[shuffled_idx]

# Position of the split: the first 80% of the shuffled rows are used for
# training, the remainder for testing.
train_idx = int(len(X) * 0.8)

train_X, test_X = X[:train_idx], X[train_idx:]
train_y, test_y = y[:train_idx], y[train_idx:]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


# L1-penalised logistic regression fitted on the training embeddings.
model = LogisticRegression(penalty='l1', solver='liblinear').fit(train_X, train_y)

# Column 1 of predict_proba is the probability of the second entry in
# model.classes_ (presumably 'pos' -- verify against model.classes_).
train_predictions = model.predict_proba(train_X)[:, 1]
test_predictions = model.predict_proba(test_X)[:, 1]

train_auc = roc_auc_score(train_y, train_predictions)
test_auc = roc_auc_score(test_y, test_predictions)

print(f'Train AUC: {train_auc:.2f}')
print(f'Test AUC: {test_auc:.2f}')

In [None]:
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA


# Project the embeddings onto their first two principal components and draw
# one scatter series per label.
pca = PCA(n_components=2)
transformed = pca.fit_transform(X)

for label, color in [('neg', 'red'), ('pos', 'blue')]:
    # Use a separately named boolean mask. Rebinding `idx` here would
    # overwrite the sample-index array that the review-loading cell reads via
    # `fileids[idx]`, so re-running that cell after this one would break.
    label_mask = (y == label)
    plt.scatter(
        transformed[label_mask, 0],
        transformed[label_mask, 1],
        c=color,
        label=label,
    )

# Label the figure so it can be read on its own.
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('PCA of review embeddings')
plt.legend()
plt.show()