# Training the Model

In [22]:
import re

# Function to clean the data
def clean_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        documents = file.readlines()


    cleaned_documents = []
    for doc in documents:
        doc = doc.lower()
        # Remove all punctuation except ", . ?!"
        doc = re.sub(r"[^\w\s,.?!]", "", doc)
        # Convert to lower case
        # Add <START> and <END> tokens
        doc = f"START {doc.strip()} END"
        cleaned_documents.append(doc)
    
    return cleaned_documents


## Reading the data

In [23]:
# Read the first text file
gpt_path = '/Users/dehaay/Desktop/humvgpt 2/gpt.txt' 
gpt_text = clean_data(gpt_path)

# Read the second text file
human_path = '/Users/dehaay/Desktop/humvgpt 2/hum.txt'
human_text = clean_data(human_path)

In [24]:
def data_parti(text,train_ratio):
    split_index = int(len(text) * train_ratio)
    train = text[:split_index]
    test = text[split_index:]
    print(len(train),len(test))
    return train,test


## Data partition

In [25]:
gpt_train,gpt_test = data_parti(gpt_text,0.9)

human_train,human_test = data_parti(human_text,0.9)

18358 2040
35741 3972


## Generate n-grams

In [26]:

# Converts them [[word_array_of_paragrapgh1],[word_array_of_paragrapgh2]]
from collections import Counter

def tokenize_paragraph(data):
    
    tokenized_paragraphs = []

    # Iterate over each paragraph in data
    for paragraph in data:
        # Tokenize the paragraph by words using re.findall
        # The pattern \b\w+\b matches sequences of word characters that are bounded by word boundaries
        tokens = re.findall(r'\w+|[.]', paragraph)

        # Append the list of tokens (words) for the current paragraph to tokenized_paragraphs
        tokenized_paragraphs.append(tokens)
        
    return tokenized_paragraphs

# Function to generate n-grams from tokenized paragraphs
def generate_n_grams_from_paragraphs(tokenized_paragraphs, n):
    n_grams_list = []
    for paragraph_tokens in tokenized_paragraphs:
        n_grams_list.extend(list(zip(*[paragraph_tokens[i:] for i in range(n)])))
    return n_grams_list

In [27]:
gpt_train_tokenized = tokenize_paragraph(gpt_train)
gpt_test_tokenized = tokenize_paragraph(gpt_test)

human_train_tokenized = tokenize_paragraph(human_train)
human_test_tokenized = tokenize_paragraph(human_test)

# Generate bigrams and trigrams for the training set
gpt_bigrams_train = Counter(generate_n_grams_from_paragraphs(gpt_train_tokenized, 2))
gpt_trigrams_train = Counter(generate_n_grams_from_paragraphs(gpt_train_tokenized, 3))

# Generate bigrams and trigrams for the test set
gpt_bigrams_test = Counter(generate_n_grams_from_paragraphs(gpt_test_tokenized, 2))
gpt_trigrams_test = Counter(generate_n_grams_from_paragraphs(gpt_test_tokenized, 3))

# Generate bigrams and trigrams for the training set
human_bigrams_train = Counter(generate_n_grams_from_paragraphs(human_train_tokenized, 2))
human_trigrams_train = Counter(generate_n_grams_from_paragraphs(human_train_tokenized, 3))

# Generate bigrams and trigrams for the test set
human_bigrams_test = Counter(generate_n_grams_from_paragraphs(human_test_tokenized, 2))
human_trigrams_test = Counter(generate_n_grams_from_paragraphs(human_test_tokenized, 3))

In [15]:
combined_biagram_train = gpt_bigrams_train + human_bigrams_train
combined_biagram_test = gpt_bigrams_test + human_bigrams_test

combined_trigram_train = gpt_trigrams_train + human_trigrams_train
combined_trigram_test = gpt_trigrams_test + human_trigrams_test

## OOV Rates

In [16]:
# Total number of bigrams in the test set (counting repeats)
total_bigrams_test = sum(combined_biagram_test.values())

# Bigrams in the test set that are not in the training set
oov_bigrams_test = {bigram: count for bigram, count in combined_biagram_test.items() if bigram not in combined_biagram_train}

# Number of OOV bigrams in the test set (counting repeats)
oov_bigrams_count = sum(oov_bigrams_test.values())

# Calculate the OOV rate for bigrams
oov_rate_bigrams = (oov_bigrams_count / total_bigrams_test) * 100


# Total number of trigrams in the test set (counting repeats)
total_trigrams_test = sum(combined_trigram_test.values())

# Trigrams in the test set that are not in the training set
oov_trigrams_test = {trigram: count for trigram, count in combined_trigram_test.items() if trigram not in combined_trigram_train}

# Number of OOV trigrams in the test set (counting repeats)
oov_trigrams_count = sum(oov_trigrams_test.values())

# Calculate the OOV rate for trigrams
oov_rate_trigrams = (oov_trigrams_count / total_trigrams_test) * 100



In [17]:
print(f"OOV Rate for Bigrams: {oov_rate_bigrams:.2f}%")
print(f"OOV Rate for Trigrams: {oov_rate_trigrams:.2f}%")

OOV Rate for Bigrams: 10.72%
OOV Rate for Trigrams: 37.89%


# Assess Accuracy

In [30]:
# Combine n-gram frequencies into a structured format
n_gram_models = {
    'bigram': {
        'human': human_bigrams_train,
        'gpt': gpt_bigrams_train,
    },
    'trigram': {
        'human': human_trigrams_train,
        'gpt': gpt_trigrams_train,
    }
}


# Assume vocab_size is known (total unique bigrams and trigrams across both human and GPT training sets)
vocab_size_bigrams = len(set(n_gram_models['bigram']['human']) | set(n_gram_models['bigram']['gpt']))
vocab_size_trigrams = len(set(n_gram_models['trigram']['human']) | set(n_gram_models['trigram']['gpt']))

vocab_size = {
    'bigram': vocab_size_bigrams,
    'trigram': vocab_size_trigrams
}

vocab_size

{'bigram': 1116390, 'trigram': 3240641}

In [28]:
from collections import defaultdict

# Pre-compute unigram and bigram counts for each label
def precompute_counts(model_counts):
    precomputed = {'bigram': {}, 'trigram': {}}
    
    for type_of_gram in model_counts:
        for label in model_counts[type_of_gram]:
            if type_of_gram == 'bigram':
                # Precompute sum of counts for each unigram as the first word in a bigram
                unigram_counts = defaultdict(int)
                for bigram, count in model_counts[type_of_gram][label].items():
                    unigram_counts[bigram[0]] += count
                precomputed[type_of_gram][label] = unigram_counts
            elif type_of_gram == 'trigram':
                # Precompute sum of counts for each bigram as the first two words in a trigram
                bigram_counts = defaultdict(int)
                for trigram, count in model_counts[type_of_gram][label].items():
                    bigram_counts[trigram[:2]] += count
                precomputed[type_of_gram][label] = bigram_counts
    
    return precomputed

# Modified function using precomputed sums
def laplace_smoothed_probability(n_gram, label, type_of_gram, model_counts, precomputed_counts, vocab_size):
    count_of_ngram = model_counts[type_of_gram][label].get(n_gram, 0)
    
    if type_of_gram == 'bigram':
        unigram_count = precomputed_counts[type_of_gram][label].get(n_gram[0], 0)
        denominator = unigram_count + vocab_size[type_of_gram]
    elif type_of_gram == 'trigram':
        bigram_part = n_gram[:2]
        bigram_count = precomputed_counts[type_of_gram][label].get(bigram_part, 0)
        denominator = bigram_count + vocab_size[type_of_gram]
    else:
        return "wrong type"
    
    return (count_of_ngram + 1) / denominator

# Precompute counts before using the function
precomputed_counts = precompute_counts(n_gram_models)



NameError: name 'n_gram_models' is not defined

In [23]:
total_bigrams_human = sum(human_bigrams_train.values())
total_bigrams_gpt = sum(gpt_bigrams_train.values())

# Calculate the overall total of bigrams
total_bigrams = total_bigrams_human + total_bigrams_gpt

# Calculate P(y) for each class
P_y_human = total_bigrams_human / total_bigrams
P_y_gpt = total_bigrams_gpt / total_bigrams

def classify_document(document_n_grams, n_gram_type, n_gram_models, vocab_size):
    """
    Classify a document based on its n-grams.

    Parameters:
    - document_n_grams: Counter object containing the document's n-grams.
    - n_gram_type: 'bigram' or 'trigram' specifying the type of n-gram.
    - n_gram_models: The structured n-gram models containing counts for human and GPT.
    - vocab_size: The size of the vocabulary for the n-gram type.
    - total_counts: Total counts of n-grams in the training data.

    Returns:
    - Classification label: 'human' or 'gpt'.
    """
    # Initialize log probabilities to 0 (since we'll be summing logs)
    log_prob_human = 0
    log_prob_gpt = 0
    

    # Calculate log probabilities for the document under each model
    for n_gram, count in document_n_grams.items():
        prob_human = laplace_smoothed_probability(n_gram,'human',n_gram_type,n_gram_models,precomputed_counts,vocab_size)

        prob_gpt =  laplace_smoothed_probability(n_gram,'gpt', n_gram_type,n_gram_models,precomputed_counts,vocab_size)
        # Use log probabilities to avoid underflow
        log_prob_human += math.log(prob_human) * count
        log_prob_gpt += math.log(prob_gpt) * count

    log_prob_human += math.log(P_y_human)
    log_prob_gpt += math.log(P_y_gpt)
    
    return 'human' if log_prob_human > log_prob_gpt else 'gpt'


In [24]:
import math

def evaluate_model(test_data, n_gram_type, n_gram_models, vocab_size):
    correct = 0
    #document : each tokenized paragrapgh
    #true_label: classification of the document
    for document, true_label in test_data:
        
        # Generate n-grams for the document
        document_n_grams = Counter(generate_n_grams_from_paragraphs([document], 2 if n_gram_type == 'bigram' else 3))
        
        # Classify the document
        predicted_label = classify_document(document_n_grams, n_gram_type, n_gram_models, vocab_size)
        
        # Check if the prediction is correct
        if predicted_label == true_label:
            correct += 1
    
    # Calculate accuracy
    accuracy = correct / len(test_data)
    return accuracy

# Prepare the test data (assuming you have a way to combine human and GPT test sets with labels)
test_data = [(doc, 'human') for doc in human_test_tokenized] + [(doc, 'gpt') for doc in gpt_test_tokenized]

# Vocabulary sizes and total counts need to be dictionaries for each n-gram type
vocab_size = {'bigram': vocab_size_bigrams, 'trigram': vocab_size_trigrams}
#total_counts = {'bigram': total_bigrams_counts, 'trigram': total_trigrams_counts}

# Evaluate models
bigram_accuracy = evaluate_model(test_data, 'bigram', n_gram_models, vocab_size)
trigram_accuracy = evaluate_model(test_data, 'trigram', n_gram_models, vocab_size)

print(f"Bigram Model Accuracy: {bigram_accuracy * 100:.2f}%")
print(f"Trigram Model Accuracy: {trigram_accuracy * 100:.2f}%")


Bigram Model Accuracy: 95.64%
Trigram Model Accuracy: 93.00%


# Sentence generation

In [17]:
def get_counts(sentence,original_dict,n):
    word_list = []
    frequencies = []
    for words,frequency in original_dict.items():

        #If the first n-1 words of the n_gram == last n-1 words of our sentence
        if list(words)[:(n-1)] == sentence[-(n-1):]:

            word_list.append(list(words)[-1])
            frequencies.append(frequency)
        else:
            word_list.append(list(words)[-1])
            frequencies.append(0)
    return word_list,frequencies



def get_distribution(freq,T = 50):
    counts_values = np.array(freq)
    exp_counts = np.exp(counts_values / T)
    probabilities = exp_counts / np.sum(exp_counts)
    return probabilities

In [18]:
def generate_sentence(original_dict,word_lim = 20,n = 2, T = 50):
    sentence = ["START"] * (n-1)
    index = 0

    while index < word_lim + (n-2):
        if sentence[index] == "END":
            break

        word_list,freq = get_counts(sentence,original_dict,n)
        probs = get_distribution(freq,T = T)

        next_word = np.random.choice(word_list, p=probs)
        sentence.append(next_word)
        index += 1

    if sentence[-1] != "END":
        sentence.append("END")

    return ' '.join(sentence)

In [19]:
print("------Human Bigram------")
for i in range(5):
    print(generate_sentence(human_bigrams_train,n=2,T = 50), end= "\n")
print("------Gpt Bigram------")
for i in range(5):
    print(generate_sentence(gpt_bigrams_train,n=2,T = 50), end= "\n")

------Human Bigram------
START the same thing END
START the same light divisions . END
START the same you can be a lot of the same numerals magnetism area 6 regulatory owned x who was a END
START the same and the same will be a lot of the same is a lot of the same speakers public END
START the same theoretical conflicts the same lhc is a lot of the same builders on the same photo reduce conversion END
------Gpt Bigram------
START the same want to the same the same deed . END
START the same bear applying due to the same paparazzo if you are a good idea to the same raised there END
START the same otheraspartame simply true magnetic and the same you are a good idea to the same is a good END
START the same bureaux ships stroked would be a good idea to the same contexts very at the same ads or END
START the united states they are a good idea to the same when you are a good idea to the same END


In [20]:
print("------Human Trigram------")
for i in range(5):
    print(generate_sentence(human_trigrams_train,n=3,T = 50), end= "\n")
print("------Gpt Trigram------")
for i in range(5):
    print(generate_sentence(gpt_trigrams_train,n=3,T = 50), end= "\n")

------Human Trigram------
START START increased machine video . because make or system x86 rumelhart at rotary processes in told depends surveys and than chains only END
START START it s a more larger load and sponsored the they what funny curvy ball prepared ate this traction . would device END
START START linemen laws lemons recently is and information and eat of has do remaining hairless pop and fractional fmri . security go END
START START crash s in or well had modify the or a a the and you you starts lines tv way organisms steal END
START START and lungs any sky free an can evo 2 that asked lots later 4 how with miss to institutions the home END
------Gpt Trigram------
START START caused right it specialized time definite rap these number use value include part blood which ingredients using a through without everyone END
START START if giftgiving if with if standoffs how language pe tensions it peoples END you END
START START it is important to note that income which accepting fa