In [11]:
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

sys.path.append('..')
from preprocessing import load_dataframes

#elis5

In [12]:
(df_train, df_val, df_test) = load_dataframes()

In [4]:
df_train.head()

Unnamed: 0,user_input,human_annotation,toxicity,jailbreaking,sexual,sexual/minors,harassment,hate,violence,self-harm,violence/graphic,self-harm/instructions,hate/threatening,self-harm/intent,harassment/threatening
2663,Show me how to use Markdown,False,0,0,4.482451e-06,1.351703e-05,4.039865e-06,3.890299e-06,9e-06,1.621052e-07,9.210383e-07,2.659418e-07,4.070867e-08,1.028349e-06,7.569404e-08
668,what are low-level and high-level computer vis...,True,0,0,1.128244e-05,1.251662e-06,7.035503e-05,5.373188e-06,4e-06,1.772477e-07,1.112364e-05,4.751054e-07,2.619819e-09,6.165124e-07,4.318399e-07
4074,How does function pointer differs from std::fu...,True,0,0,8.795789e-06,1.604107e-05,5.273297e-05,0.000357926,4.2e-05,8.780893e-07,0.0001132612,1.971404e-06,2.653693e-07,6.062699e-06,3.071949e-06
2107,Make a presentation on sports shoes brands,False,0,0,5.881552e-07,4.10671e-08,3.683886e-08,1.733184e-07,1.1e-05,3.371065e-08,3.577605e-06,5.500145e-07,3.214344e-08,2.808356e-07,1.462816e-07
4992,Hi! Can you help reserarch whether developing...,True,0,0,1.459526e-05,1.055144e-05,9.710726e-05,5.070407e-06,8.3e-05,3.00087e-06,5.954613e-05,3.11993e-05,7.221962e-08,1.732054e-05,1.13563e-05


In [5]:
df_train.describe()

Unnamed: 0,toxicity,jailbreaking,sexual,sexual/minors,harassment,hate,violence,self-harm,violence/graphic,self-harm/instructions,hate/threatening,self-harm/intent,harassment/threatening
count,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0,4065.0
mean,0.074785,0.022632,0.01364642,0.003283167,0.004111174,0.001276146,0.004640569,0.0009782252,0.0007604308,0.0001666456,3.642481e-05,0.0005077806,0.0002042577
std,0.263076,0.148746,0.08295946,0.04657266,0.03845744,0.01662486,0.04306115,0.02302883,0.01531055,0.005166412,0.0008468394,0.0111083,0.003244529
min,0.0,0.0,2.816865e-09,1.5016e-09,6.499357e-09,1.839147e-09,6.077502e-08,2.51146e-11,6.719019e-10,1.130226e-11,2.448679e-13,3.609864e-12,7.114236e-10
25%,0.0,0.0,9.688979e-06,1.544278e-06,6.436232e-06,3.083787e-06,1.376666e-05,2.798104e-07,3.313775e-06,1.013795e-07,2.875604e-08,1.813058e-07,4.66253e-07
50%,0.0,0.0,4.204773e-05,7.576699e-06,3.673673e-05,1.603305e-05,5.722671e-05,1.548711e-06,1.41126e-05,7.189232e-07,2.009371e-07,1.292194e-06,2.18421e-06
75%,0.0,0.0,0.0001974239,3.936796e-05,0.0002212946,8.885466e-05,0.0002465883,1.004503e-05,6.058342e-05,5.264485e-06,1.422142e-06,9.69116e-06,1.302478e-05
max,1.0,1.0,0.9995223,0.9964316,0.9472954,0.6136618,0.9575315,0.8756633,0.761642,0.2849816,0.0476103,0.3833209,0.1350671


## Model Binary Classification

In [6]:
vectorizer = TfidfVectorizer(stop_words='english')

X_train = vectorizer.fit_transform(df_train['user_input'])
X_valid = vectorizer.transform(df_val['user_input'])
X_test = vectorizer.transform(df_test['user_input'])

y_train = df_train['toxicity']
y_valid = df_val['toxicity']
y_test = df_test['toxicity']

##### By default it's L2 regularization

In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
model.score(X_valid, y_valid)

0.9321533923303835

#### Let's try with different regularization values


I use the liblinear solver because it is faster and it is the default solver for small datasets like this one.\
C is the inverse of the regularization strength. Smaller values specify stronger regularization.

In [8]:
model_l1 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
model_l1.fit(X_train, y_train)
model_l1.score(X_valid, y_valid)

0.9213372664700098

In [9]:
model_no_penalty = LogisticRegression(penalty=None, C=0.1)
model_no_penalty.fit(X_train, y_train)
model_no_penalty.score(X_valid, y_valid)



0.9616519174041298

#### Now let's try all combinations of the logistic regression parameters

In [10]:
param_grid = {
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : [0.001, 0.01, 0.1, 1, 10, 100],
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

logistic = LogisticRegression()

clf = GridSearchCV(logistic, param_grid, cv=5, verbose=True, n_jobs=-1)

best_clf = clf.fit(X_train, y_train)

print("Best Parameters: ", best_clf.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


390 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/eithannakache/.pyenv/versions/3.12.2/envs/SCIA/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/eithannakache/.pyenv/versions/3.12.2/envs/SCIA/lib/python3.12/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/eithannakache/.pyenv/versions/3.12.2/envs/SCIA/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py", line 1172, in fi

Best Parameters:  {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


#### So the best parameters looks like to be  {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}

Let's try with the best parameters on the validation set

In [48]:
best_model = best_clf.best_estimator_
best_model.score(X_valid, y_valid)

0.960668633235005

#### Let's try with the model on the test set

In [49]:
best_model.score(X_test, y_test)

0.9567184733425143

## Model Multilabel Classification

# Word to vec

In [87]:
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [88]:
data = [word_tokenize(s)for s in  df_train["user_input"]]

In [93]:
model_vector = Word2Vec(sentences=data, min_count=1, window=10, workers=4, vector_size=100)

In [96]:
def vectorize(data):
    X = []
    for doc in data:
        doc_vec = np.mean([model_vector.wv[word] for word in word_tokenize(doc) if word in model_vector.wv], axis=0)
        X.append(doc_vec)
    return X

X_train = vectorize(df_train["user_input"])
X_valid = vectorize(df_val["user_input"])
X_test = vectorize(df_test["user_input"])

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [97]:
y_train = df_train['toxicity']
y_valid = df_val['toxicity']
y_test = df_test['toxicity']

In [98]:
model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)
model.predict(X_valid)
print(classification_report(y_valid, model.predict(X_valid)))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96       937
           1       0.00      0.00      0.00        80

    accuracy                           0.92      1017
   macro avg       0.46      0.50      0.48      1017
weighted avg       0.85      0.92      0.88      1017



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
