# IA DETECT - Supervised Machine Learning
## Model saved in pickle file

In [1]:
#!pip install joblib --user
#!pip install spacy
#!pip install nltk
#!pip install xgboost
#!pip install textblob
#!python -m spacy download en_core_web_sm
#import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('brown')

import numpy as np
import pandas as pd
import re
import spacy
import nltk

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc
import joblib
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

from textblob import TextBlob 
import statistics
from spacy.symbols import PUNCT
from collections import Counter
nlp = spacy.load("en_core_web_sm")

## .csv already processed with feature engineering

In [2]:
train_full = pd.read_csv("train_full.csv", low_memory=False)
train_full.shape[0]

94890

## subsampling

In [3]:
no = train_full[train_full["generated"]==0]
yes = train_full[train_full["generated"]==1]
print(no.shape[0])
print(yes.shape[0])
train_full_x = pd.concat([no[:43000], yes], axis=0)
print(train_full_x.shape[0])

50115
44775
87775


## Feature engineering for actual input data

In [4]:
def sentiment(text1):
    blob = TextBlob(text1)
    blob.tags
    blob.noun_phrases
    list_Sentiment = []
    for sentence in blob.sentences:
        list_Sentiment.append(sentence.sentiment.polarity)        
    st_dev = statistics.pstdev(list_Sentiment)
    mean = statistics.mean(list_Sentiment)    
    return mean, st_dev
    
def find_dif(text1, text2):            
    blob1 = TextBlob(text1)
    blob2 = TextBlob(text2)
    tokens_text1 = blob1.words
    tokens_text2 = blob2.words
    diferency = set(tokens_text1) ^ set(tokens_text2)    
    return diferency

def process_blob(strText):    
    strText_Blob = TextBlob(strText) 
    correct_Text = str(strText_Blob.correct())
    diferency = find_dif(strText, correct_Text) 
    
    _mean, _stddev = sentiment(correct_Text)
    
    if diferency:
        return len(diferency)/2, _mean, _stddev        
    else:
        return 0, _mean, _stddev
    

def numPunct(text_objective):
    doc = nlp(text_objective)
    try:
        num_words = len(doc)
        stopwords = [token for token in doc if token.is_stop]
        number_of_stopwords = len(stopwords)    
        punctuation_tokens = [token for token in doc if token.pos == PUNCT]
        number_of_punctuation_marks = len(punctuation_tokens)
        
        unique_words = set(token.text for token in doc)
        count_unique_words = len(unique_words)
        
        words_doc = [token.text.lower() for token in doc if token.is_alpha]
        frecuency_words = Counter(words_doc)
        
        most_word, frec_word = frecuency_words.most_common(1)[0]
            
        val_numwords = num_words/100
       
        porc_sw = number_of_stopwords/val_numwords
        porc_punt = number_of_punctuation_marks/val_numwords
        porc_unique_words = count_unique_words/val_numwords
    
        repet_it = sum(1 for token in doc if token.text.lower() == "it")
        repet_is = sum(1 for token in doc if token.text.lower() == "is")
      
    except IndexError:      
        print("An exception occurred") 
        return 0, 0, 0, 0, 0, 0, "0", 0, 0, 0, 0
    
    print(f"Number of words: {num_words}")
    print(f"Number of Stop Words: {number_of_stopwords}")
    print(f"Number of punctuation marks: {number_of_punctuation_marks}")
    print(f"Number of unique words: {count_unique_words}")
    print(f"Number of  it: {repet_it}")
    print(f"Number of  is: {repet_is}")
    print(f"Most repeated word: {most_word}")
    print(f"Repeated word frequency: {frec_word}")
    
    print(f"Percentage of stopwords: {porc_sw}")
    print(f"Percentage of punctuation marks: {porc_punt}")
    print(f"Percentage of unique words: {porc_unique_words}")
    
    return num_words, number_of_stopwords, number_of_punctuation_marks, count_unique_words, repet_it, repet_is, most_word, frec_word, porc_sw, porc_punt, porc_unique_words

## Train and save, load the model saved in the pickle file and use it for prediction!!!
### The search for better hyperparameters and the importance of characteristics has already been done.

In [5]:
scaler = StandardScaler()
numeric_cols = ["bad_words", "w_mean",  "w_std", "num_words", "num_stop_words", "num_signs", "num_u_words", "it", "is", "frec_r_word", "per_sw", "per_sign", "perc_u_w"]

X = train_full_x[numeric_cols]
y = train_full_x["generated"]

array_X = np.array(X)
array_y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(array_X, array_y, test_size=0.2, random_state=42)

xgb_classifier = XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=300)
xgb_classifier.fit(X_train, y_train)
    
y_prob = xgb_classifier.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
    
joblib.dump(xgb_classifier, 'xg_model.pkl')
xg_model = joblib.load('xg_model.pkl')
score = xg_model.score(X_train, y_train)

print("Test score: {0:.2f} %".format(100 * score))
print("Curva ROC: ", roc_auc)

Test score: 87.91 %
Curva ROC:  0.9227015554823788


## We use the complete csv that only has the generated columns and text
### The test_texts.csv file contains various data that we did not use for training, so we use it for testing.

In [6]:
test_text = pd.read_csv("test_texts.csv", low_memory=False)
test_text.shape[0]

94890

## Test ---> We try with the texts of the CSV

In [19]:
row_index =89562
t1 = test_text["text"][row_index]
bad_words, w_mean, w_std = process_blob(t1)
num_words, num_stop_words, num_signs, num_u_words, it, is_, repeat_word, frec_r_word, per_sw, per_sign, perc_u_w = numPunct(t1)

Number of words: 256
Number of Stop Words: 113
Number of punctuation marks: 29
Number of unique words: 133
Number of  it: 4
Number of  is: 5
Most repeated word: the
Repeated word frequency: 12
Percentage of stopwords: 44.140625
Percentage of punctuation marks: 11.328125
Percentage of unique words: 51.953125


## Shows the probability of being AI-generated text.
## The closer it is to 1 the more likely it is to be AI-generated.

In [20]:
list_values = [bad_words, w_mean, w_std, num_words, num_stop_words, num_signs, num_u_words, it, is_, frec_r_word, per_sw, per_sign, perc_u_w]
array_values = np.array(list_values)
y_prob = xg_model.predict_proba([array_values])[:, 1]
print("Probability IA generated: ", y_prob[0])
print("Value of csv: ", test_text["generated"][row_index])

Probability IA generated:  0.9478052
Value of csv:  1


## We show the text used and the value of "generated" in the csv

In [21]:
print(test_text["text"][row_index])
print("--------------------->Value of generated in csv: ", test_text["generated"][row_index])

['Computer simulation is a technique for using a computer to model the behavior of a system or process. It involves creating a mathematical model of the system or process, and then using a computer to run simulations based on that model. The goal of a simulation is to mimic the behavior of the system or process as closely as possible, in order to gain insights, make predictions, or test hypotheses.\n\n\n\nSimulations can be used in a wide range of fields, including engineering, science, business, and social science. For example, simulations can be used to study the behavior of aircraft, predict the spread of diseases, or analyze economic trends. Simulations are often used in situations where it is not practical or possible to conduct experiments in the real world, or when it is more cost-effective to perform simulations rather than build and test physical prototypes.\n\n\n\nThere are many different types of computer simulations, ranging from simple models with a few variables to comple