In [33]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [34]:
dataset = pd.read_csv('../../data/fraction_preprocessed_data.csv',encoding='ISO-8859-1')
dataset = dataset.drop(['Unnamed: 0'], axis=1)
# dataset = dataset.dropna(subset=['Text'])
# dataset = dataset.sample(frac=0.1, random_state=42)
# dataset.to_csv('../../data/fraction_preprocessed_data.csv')
dataset

Unnamed: 0,Text,Source,Human
287452,ING AsiaPacific Companys Problems Research Pap...,Human,1
222199,Crisis Love Inquiry Essay Critical Writing fol...,Human,1
453600,Sure sex segregation makes lot sense many spor...,Human,1
276338,Christianity Islam Values Essay Christianity f...,Human,1
78217,Becca liked swim practiced everyday hours ente...,GLM-130B,0
...,...,...,...
663613,Mass Eoghan Chada 10 brother Ruairi 5 said St ...,OPT-30B,0
285976,Asian Teachers Polish Lesson Perfection Stigle...,Human,1
679335,Move knife slowly avoid slipping accidentally ...,OPT-6.7B,0
775773,Good dreams likely occur person feeling relaxe...,Text-Davinci-003,0


In [35]:
# sanity null/NaN check
print(f'Checking Null Values')
print(dataset.isnull().sum())
print()
print(f'Checking NaN values')
print(dataset.isna().sum())
# drop is any exist
dataset.dropna(inplace=True)

Checking Null Values
Text      0
Source    0
Human     0
dtype: int64

Checking NaN values
Text      0
Source    0
Human     0
dtype: int64


In [36]:
# load the pre-fitted vectorizer
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))
X = tfidf.fit_transform(dataset['Text'])

In [37]:
y = dataset['Human']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# initializing scaler to adjust features to scale evenly among each other
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
joblib.dump(X_train, '../../models/Supervised/Variables/X_train.pkl')
joblib.dump(X_test, '../../models/Supervised/Variables/X_test.pkl')
joblib.dump(y_train, '../../models/Supervised/Variables/y_train.pkl')
joblib.dump(y_test, '../../models/Supervised/Variables/y_test.pkl')
joblib.dump(tfidf, '../../models/Supervised/Variables/log_tfidf_vectorizer.pkl')
joblib.dump(scaler, '../../models/Supervised/Variables/log_scaler.pkl')

['../../models/Supervised/Variables/log_scaler.pkl']

In [39]:
log = LogisticRegression(C=1, penalty='l2', solver='liblinear', class_weight='balanced', random_state=42)
log.fit(X_train_scaled, y_train)

In [40]:
# predictions
preds_log = log.predict(X_test_scaled)
pred_proba = log.predict_proba(X_test_scaled)[:, 1]

In [41]:
# performance checks
print("Accuracy:", accuracy_score(y_test, preds_log))
print("ROC AUC Score:", roc_auc_score(y_test, preds_log))
print("Classification Report:\n", classification_report(y_test, preds_log))

Accuracy: 0.812979276253248
ROC AUC Score: 0.8122206982889376
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83      8818
           1       0.78      0.81      0.79      6961

    accuracy                           0.81     15779
   macro avg       0.81      0.81      0.81     15779
weighted avg       0.81      0.81      0.81     15779



In [10]:
# Perform cross-validation
cv_scores = cross_val_score(log, X_train_scaled, y_train, cv=5, scoring='accuracy')

# Display cross-validation scores
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

Cross-Validation Accuracy Scores: [0.79695793 0.80250337 0.79830468 0.80557756 0.79694185]
Mean Cross-Validation Accuracy: 0.800057078656588


In [42]:
joblib.dump(log, '../../models/Supervised/logistic_regression_model.pkl')

['../../models/Supervised/logistic_regression_model.pkl']

In [11]:
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

In [12]:
grid_log = LogisticRegression(C=1, penalty='l2', solver='liblinear', random_state=42)

In [13]:
# search grid for optimal hyperparameters
grid_search = GridSearchCV(grid_log, parameters, cv=5, scoring='accuracy', verbose=10)

In [14]:
grid_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5; 1/12] START C=0.1, penalty=l1, solver=liblinear........................
[CV 1/5; 1/12] END C=0.1, penalty=l1, solver=liblinear;, score=0.818 total time=   1.7s
[CV 2/5; 1/12] START C=0.1, penalty=l1, solver=liblinear........................
[CV 2/5; 1/12] END C=0.1, penalty=l1, solver=liblinear;, score=0.827 total time=   1.8s
[CV 3/5; 1/12] START C=0.1, penalty=l1, solver=liblinear........................
[CV 3/5; 1/12] END C=0.1, penalty=l1, solver=liblinear;, score=0.823 total time=   1.8s
[CV 4/5; 1/12] START C=0.1, penalty=l1, solver=liblinear........................
[CV 4/5; 1/12] END C=0.1, penalty=l1, solver=liblinear;, score=0.829 total time=   2.0s
[CV 5/5; 1/12] START C=0.1, penalty=l1, solver=liblinear........................
[CV 5/5; 1/12] END C=0.1, penalty=l1, solver=liblinear;, score=0.821 total time=   2.5s
[CV 1/5; 2/12] START C=0.1, penalty=l1, solver=saga.............................




[CV 1/5; 2/12] END C=0.1, penalty=l1, solver=saga;, score=0.815 total time= 1.2min
[CV 2/5; 2/12] START C=0.1, penalty=l1, solver=saga.............................




[CV 2/5; 2/12] END C=0.1, penalty=l1, solver=saga;, score=0.821 total time= 1.3min
[CV 3/5; 2/12] START C=0.1, penalty=l1, solver=saga.............................




[CV 3/5; 2/12] END C=0.1, penalty=l1, solver=saga;, score=0.821 total time= 1.3min
[CV 4/5; 2/12] START C=0.1, penalty=l1, solver=saga.............................




[CV 4/5; 2/12] END C=0.1, penalty=l1, solver=saga;, score=0.829 total time= 1.3min
[CV 5/5; 2/12] START C=0.1, penalty=l1, solver=saga.............................




[CV 5/5; 2/12] END C=0.1, penalty=l1, solver=saga;, score=0.819 total time= 1.3min
[CV 1/5; 3/12] START C=0.1, penalty=l2, solver=liblinear........................
[CV 1/5; 3/12] END C=0.1, penalty=l2, solver=liblinear;, score=0.803 total time=  11.8s
[CV 2/5; 3/12] START C=0.1, penalty=l2, solver=liblinear........................
[CV 2/5; 3/12] END C=0.1, penalty=l2, solver=liblinear;, score=0.809 total time=   9.7s
[CV 3/5; 3/12] START C=0.1, penalty=l2, solver=liblinear........................
[CV 3/5; 3/12] END C=0.1, penalty=l2, solver=liblinear;, score=0.807 total time=  12.0s
[CV 4/5; 3/12] START C=0.1, penalty=l2, solver=liblinear........................
[CV 4/5; 3/12] END C=0.1, penalty=l2, solver=liblinear;, score=0.813 total time=   9.2s
[CV 5/5; 3/12] START C=0.1, penalty=l2, solver=liblinear........................
[CV 5/5; 3/12] END C=0.1, penalty=l2, solver=liblinear;, score=0.802 total time=  11.3s
[CV 1/5; 4/12] START C=0.1, penalty=l2, solver=saga.....................



[CV 1/5; 4/12] END C=0.1, penalty=l2, solver=saga;, score=0.810 total time=   5.3s
[CV 2/5; 4/12] START C=0.1, penalty=l2, solver=saga.............................




[CV 2/5; 4/12] END C=0.1, penalty=l2, solver=saga;, score=0.818 total time=   5.4s
[CV 3/5; 4/12] START C=0.1, penalty=l2, solver=saga.............................




[CV 3/5; 4/12] END C=0.1, penalty=l2, solver=saga;, score=0.817 total time=   5.4s
[CV 4/5; 4/12] START C=0.1, penalty=l2, solver=saga.............................




[CV 4/5; 4/12] END C=0.1, penalty=l2, solver=saga;, score=0.825 total time=   5.4s
[CV 5/5; 4/12] START C=0.1, penalty=l2, solver=saga.............................




[CV 5/5; 4/12] END C=0.1, penalty=l2, solver=saga;, score=0.814 total time=   5.4s
[CV 1/5; 5/12] START C=1, penalty=l1, solver=liblinear..........................
[CV 1/5; 5/12] END C=1, penalty=l1, solver=liblinear;, score=0.800 total time=  10.0s
[CV 2/5; 5/12] START C=1, penalty=l1, solver=liblinear..........................
[CV 2/5; 5/12] END C=1, penalty=l1, solver=liblinear;, score=0.807 total time=  10.1s
[CV 3/5; 5/12] START C=1, penalty=l1, solver=liblinear..........................
[CV 3/5; 5/12] END C=1, penalty=l1, solver=liblinear;, score=0.804 total time=  10.1s
[CV 4/5; 5/12] START C=1, penalty=l1, solver=liblinear..........................
[CV 4/5; 5/12] END C=1, penalty=l1, solver=liblinear;, score=0.811 total time=   9.8s
[CV 5/5; 5/12] START C=1, penalty=l1, solver=liblinear..........................
[CV 5/5; 5/12] END C=1, penalty=l1, solver=liblinear;, score=0.803 total time=   7.5s
[CV 1/5; 6/12] START C=1, penalty=l1, solver=saga...............................




[CV 1/5; 6/12] END C=1, penalty=l1, solver=saga;, score=0.811 total time= 1.5min
[CV 2/5; 6/12] START C=1, penalty=l1, solver=saga...............................




[CV 2/5; 6/12] END C=1, penalty=l1, solver=saga;, score=0.818 total time= 1.5min
[CV 3/5; 6/12] START C=1, penalty=l1, solver=saga...............................




[CV 3/5; 6/12] END C=1, penalty=l1, solver=saga;, score=0.817 total time= 1.5min
[CV 4/5; 6/12] START C=1, penalty=l1, solver=saga...............................




[CV 4/5; 6/12] END C=1, penalty=l1, solver=saga;, score=0.826 total time= 1.5min
[CV 5/5; 6/12] START C=1, penalty=l1, solver=saga...............................




[CV 5/5; 6/12] END C=1, penalty=l1, solver=saga;, score=0.814 total time= 1.5min
[CV 1/5; 7/12] START C=1, penalty=l2, solver=liblinear..........................
[CV 1/5; 7/12] END C=1, penalty=l2, solver=liblinear;, score=0.798 total time=  49.2s
[CV 2/5; 7/12] START C=1, penalty=l2, solver=liblinear..........................
[CV 2/5; 7/12] END C=1, penalty=l2, solver=liblinear;, score=0.804 total time=  31.0s
[CV 3/5; 7/12] START C=1, penalty=l2, solver=liblinear..........................
[CV 3/5; 7/12] END C=1, penalty=l2, solver=liblinear;, score=0.801 total time=  27.8s
[CV 4/5; 7/12] START C=1, penalty=l2, solver=liblinear..........................
[CV 4/5; 7/12] END C=1, penalty=l2, solver=liblinear;, score=0.807 total time=  56.8s
[CV 5/5; 7/12] START C=1, penalty=l2, solver=liblinear..........................
[CV 5/5; 7/12] END C=1, penalty=l2, solver=liblinear;, score=0.798 total time=  31.7s
[CV 1/5; 8/12] START C=1, penalty=l2, solver=saga...............................




[CV 1/5; 8/12] END C=1, penalty=l2, solver=saga;, score=0.810 total time=   5.3s
[CV 2/5; 8/12] START C=1, penalty=l2, solver=saga...............................




[CV 2/5; 8/12] END C=1, penalty=l2, solver=saga;, score=0.818 total time=   5.4s
[CV 3/5; 8/12] START C=1, penalty=l2, solver=saga...............................




[CV 3/5; 8/12] END C=1, penalty=l2, solver=saga;, score=0.817 total time=   5.4s
[CV 4/5; 8/12] START C=1, penalty=l2, solver=saga...............................




[CV 4/5; 8/12] END C=1, penalty=l2, solver=saga;, score=0.825 total time=   5.3s
[CV 5/5; 8/12] START C=1, penalty=l2, solver=saga...............................




[CV 5/5; 8/12] END C=1, penalty=l2, solver=saga;, score=0.814 total time=   5.3s
[CV 1/5; 9/12] START C=10, penalty=l1, solver=liblinear.........................




[CV 1/5; 9/12] END C=10, penalty=l1, solver=liblinear;, score=0.796 total time= 8.7min
[CV 2/5; 9/12] START C=10, penalty=l1, solver=liblinear.........................




[CV 2/5; 9/12] END C=10, penalty=l1, solver=liblinear;, score=0.803 total time=22.1min
[CV 3/5; 9/12] START C=10, penalty=l1, solver=liblinear.........................
[CV 3/5; 9/12] END C=10, penalty=l1, solver=liblinear;, score=0.800 total time=  22.8s
[CV 4/5; 9/12] START C=10, penalty=l1, solver=liblinear.........................




[CV 4/5; 9/12] END C=10, penalty=l1, solver=liblinear;, score=0.806 total time=18.3min
[CV 5/5; 9/12] START C=10, penalty=l1, solver=liblinear.........................




[CV 5/5; 9/12] END C=10, penalty=l1, solver=liblinear;, score=0.797 total time= 5.6min
[CV 1/5; 10/12] START C=10, penalty=l1, solver=saga.............................




[CV 1/5; 10/12] END C=10, penalty=l1, solver=saga;, score=0.810 total time= 1.6min
[CV 2/5; 10/12] START C=10, penalty=l1, solver=saga.............................




[CV 2/5; 10/12] END C=10, penalty=l1, solver=saga;, score=0.818 total time= 1.6min
[CV 3/5; 10/12] START C=10, penalty=l1, solver=saga.............................




[CV 3/5; 10/12] END C=10, penalty=l1, solver=saga;, score=0.817 total time= 1.6min
[CV 4/5; 10/12] START C=10, penalty=l1, solver=saga.............................




[CV 4/5; 10/12] END C=10, penalty=l1, solver=saga;, score=0.825 total time= 1.6min
[CV 5/5; 10/12] START C=10, penalty=l1, solver=saga.............................




[CV 5/5; 10/12] END C=10, penalty=l1, solver=saga;, score=0.814 total time= 1.6min
[CV 1/5; 11/12] START C=10, penalty=l2, solver=liblinear........................
[CV 1/5; 11/12] END C=10, penalty=l2, solver=liblinear;, score=0.795 total time= 1.1min
[CV 2/5; 11/12] START C=10, penalty=l2, solver=liblinear........................
[CV 2/5; 11/12] END C=10, penalty=l2, solver=liblinear;, score=0.803 total time=  56.5s
[CV 3/5; 11/12] START C=10, penalty=l2, solver=liblinear........................
[CV 3/5; 11/12] END C=10, penalty=l2, solver=liblinear;, score=0.799 total time=  53.7s
[CV 4/5; 11/12] START C=10, penalty=l2, solver=liblinear........................
[CV 4/5; 11/12] END C=10, penalty=l2, solver=liblinear;, score=0.805 total time= 1.2min
[CV 5/5; 11/12] START C=10, penalty=l2, solver=liblinear........................
[CV 5/5; 11/12] END C=10, penalty=l2, solver=liblinear;, score=0.796 total time= 1.1min
[CV 1/5; 12/12] START C=10, penalty=l2, solver=saga.....................



[CV 1/5; 12/12] END C=10, penalty=l2, solver=saga;, score=0.810 total time=   5.5s
[CV 2/5; 12/12] START C=10, penalty=l2, solver=saga.............................




[CV 2/5; 12/12] END C=10, penalty=l2, solver=saga;, score=0.818 total time=   5.3s
[CV 3/5; 12/12] START C=10, penalty=l2, solver=saga.............................




[CV 3/5; 12/12] END C=10, penalty=l2, solver=saga;, score=0.817 total time=   5.3s
[CV 4/5; 12/12] START C=10, penalty=l2, solver=saga.............................




[CV 4/5; 12/12] END C=10, penalty=l2, solver=saga;, score=0.825 total time=   5.4s
[CV 5/5; 12/12] START C=10, penalty=l2, solver=saga.............................




[CV 5/5; 12/12] END C=10, penalty=l2, solver=saga;, score=0.814 total time=   5.3s


In [15]:
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_}")

Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.823538776788485


In [16]:
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)
print("GS Accuracy Test:", accuracy_score(y_test, y_test_pred))
print("GS ROC AUC Score:", roc_auc_score(y_test, preds_log))
print("GS Classification Report:\n", classification_report(y_test, preds_log))

GS Accuracy Test: 0.5588440332086951
GS ROC AUC Score: 0.8122206982889376
GS Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83      8818
           1       0.78      0.81      0.79      6961

    accuracy                           0.81     15779
   macro avg       0.81      0.81      0.81     15779
weighted avg       0.81      0.81      0.81     15779



In [17]:
# Perform cross-validation
cv_scores = cross_val_score(grid_log, X_train_scaled, y_train, cv=5, scoring='accuracy')

# Display cross-validation scores
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

Cross-Validation Accuracy Scores: [0.7979878  0.80392934 0.8014735  0.80676596 0.79781334]
Mean Cross-Validation Accuracy: 0.8015939884228152


In [21]:
joblib.dump(grid_log, '../../models/Supervised/grid_logistic_regression_model.pkl')

['../../models/Supervised/grid_logistic_regression_model.pkl']