# Logistic Regression
For the maths and explanations, go check: https://web.stanford.edu/~jurafsky/slp3/5.pdf

In [8]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
ds = load_dataset("jniimi/tripadvisor-review-rating")
raw_data = pd.DataFrame(ds['train'])

text = 'review'
label = 'overall'

df = raw_data.drop(columns=['stay_year', 'post_date', 'freq', 'lang'])

# Drop the rows with missing data
df = df.dropna()

# Drop the duplicates
df = df.drop_duplicates()

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# Split the data into train and test
train_df, test_df = train_test_split(df, test_size=0.2)

train_df, val_df = train_test_split(train_df, test_size=0.2)

Generating train split: 100%|██████████| 201295/201295 [00:01<00:00, 125075.28 examples/s]


In [None]:
train_df


In [9]:
preprocess = ColumnTransformer(transformers=[
    ('prompt_embedding', TfidfVectorizer(tokenizer=word_tokenize), text),
    ('scaler', StandardScaler(), text)

])

pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('classifier', LogisticRegression())
])

X_train_injection = train_df[[text]].reset_index(drop=True)
y_train_injection = train_df[label]

X_test_injection = test_df[[text]].reset_index(drop=True)
y_test_injection = test_df[label]

pipeline.fit(X_train_injection, y_train_injection)



ValueError: could not convert string to float: "Needed a place near Convention Center\nAfter reading other reviews of this property I worried that I made a mistake making this reservation. Upon arrive, I decided it was the smartest decision I've made in a long time. VERY economical and a decent room. 15 minutes (and I'm slow) walk to the convention center and I was the envy of all of my fellow conference participants who were paying upwards of $250 per night at the fancy hotels. I stayed for 4 nights and will stay there again if I ever need a hotel in San Antonio. Thanks for a great stay!"

In [None]:
# accuracy = pipeline.score(test_df[['Degree', 'User Prompt']].reset_index(drop=True), test_df['Prompt injection'])
accuracy = pipeline.score(X_test_injection, y_test_injection)
print("Accuracy:", accuracy)

Accuracy: 0.672694304379145


In [None]:
injections_predicted = pipeline.predict(X_test_injection)
injections_predicted

array([1, 1, 1, ..., 0, 1, 1], shape=(3184,))