In [None]:
%tensorflow_version 1.x

from google.colab import drive

drive.mount("/content/drive", force_remount=True)

%cd 'drive/MyDrive/'

!export BERT_BASE_DIR=Internship/BERT/BERT-mini
!export DATA_DIR=Internship/Data/binary

!python Internship/BERT/run_classifier.py \
  --task_name=GPT_Classifier \
  --do_train=true \
  --do_eval=true \
  --data_dir=$DATA_DIR/\
  --vocab_file=Internship/BERT/BERT-mini/vocab.txt \
  --bert_config_file=Internship/BERT/BERT-mini/bert_config.json \
  --init_checkpoint=Internship/BERT/BERT-mini/bert_model.ckpt \
  --max_seq_length=512 \
  --train_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=15 \
  --output_dir=$DATA_DIR/output

In [None]:
!pip3 install transformers numpy torch sklearn

# this cell imports the data, trains the BERT model and saves it

import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score
import random

from google.colab import drive

# Mount Google Drive (forcing a remount) so the project files are reachable.
drive.mount("/content/drive", force_remount=True)

# Relative change of directory: every relative path used later in this cell
# ("train_data/...", "test_data/...", model_path) is resolved from here.
%cd 'drive/MyDrive/Internship/Data'

# Keys of the pickled test set as used by read_data() below:
# (file_name, temp) for GPT samples and file_name alone for human samples.
temps = ["0_5", "0_75", "1_0", "1_25", "1_5"]
file_names = ["CNNArticles", "fb_edit", "ireland_headlines", "tweets_edited"]


# Checkpoint name handed to BertForSequenceClassification.from_pretrained
# further down, and the truncation limit passed to the tokenizer.
model_name = "prajjwal1/bert-medium"
max_length = 512

# Disabled: fetches the tokenizer by model_name instead of from model_path.
#tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True, use_fast=False)

# Directory the fine-tuned model and tokenizer are saved to at the end of this cell.
model_path = "gpt_classifier"
# Disabled: stores the tokenizer fetched by the disabled line above into model_path.
#tokenizer.save_pretrained(model_path)

# NOTE(review): loading from model_path presumably requires that a tokenizer was
# saved there earlier (e.g. by the two disabled lines above) - confirm on a fresh Drive.
tokenizer = AutoTokenizer.from_pretrained(model_path)

def compute_acc(pred):
  """Metric callback for the Trainer.

  Reads ``pred.label_ids`` as the reference labels and takes the argmax over
  the last axis of ``pred.predictions`` as the predicted class.

  Returns a dict with the single key ``'accuracy'``.
  """
  reference = pred.label_ids
  predicted = pred.predictions.argmax(-1)

  return {'accuracy': accuracy_score(reference, predicted)}

def read_data():
    """Load training and test text together with their labels.

    Training text is the first column of train_data/gpt_text.csv (label 1)
    followed by the first column of train_data/human_text.csv (label 0).
    Test text comes from test_data/test_data.pkl: one entry per
    (file_name, temp) pair (label 1), then one entry per file_name (label 0).
    Uses the module-level ``file_names`` and ``temps`` lists and prints a few
    size diagnostics along the way.

    Returns:
        (train_text, test_text, train_labels, test_labels) as lists.
    """
    gpt_frame = pd.read_csv("train_data/gpt_text.csv")
    human_frame = pd.read_csv("train_data/human_text.csv")

    gpt_text = list(gpt_frame.iloc[:, 0])
    human_text = list(human_frame.iloc[:, 0])

    # NOTE: pickle is only appropriate here because the file is produced by this notebook.
    with open("test_data/test_data.pkl", "rb") as handle:
        test_data = pickle.load(handle)

    print(test_data[("CNNArticles", "0_5")])

    test_text, test_labels = [], []
    test_gpt_len = 0
    test_human_len = 0

    # GPT-generated test samples, keyed by (corpus, temperature) -> label 1
    for name in file_names:
        for temp in temps:
            frame = test_data[(name, temp)]
            texts = list(frame.iloc[:, 0])
            test_text.extend(texts)
            test_labels.extend(np.ones(len(frame), dtype=int))
            test_gpt_len += len(texts)

    # human-written test samples, keyed by corpus -> label 0
    for name in file_names:
        frame = test_data[name]
        texts = list(frame.iloc[:, 0])
        test_text.extend(texts)
        test_labels.extend(np.zeros(len(frame), dtype=int))
        test_human_len += len(texts)

    train_text = gpt_text + human_text

    for size in (len(gpt_text), len(human_text), test_human_len, test_gpt_len):
        print(size)

    train_labels = list(np.ones(len(gpt_text), dtype=int))
    train_labels += list(np.zeros(len(human_text), dtype=int))

    return train_text, test_text, train_labels, test_labels


# Load the raw splits and report their sizes.
train_text, test_text, train_labels, test_labels = read_data()

for split in (train_text, test_text, train_labels, test_labels):
  print(len(split))

# Entries that are not str are replaced by a single space before tokenising.
train_text = [entry if type(entry) == str else " " for entry in train_text]
test_text = [entry if type(entry) == str else " " for entry in test_text]

# Shuffle text and labels together so the pairs stay aligned.
shuffling = list(zip(train_text, train_labels))
random.shuffle(shuffling)
train_text, train_labels = map(list, zip(*shuffling))

print(train_text[1:10])
print(train_labels[1:10])

# Tokenise both splits, truncating to max_length and padding within each split.
encode_options = dict(truncation=True, padding=True, max_length=max_length)
train_encoding = tokenizer(text=train_text, **encode_options)
test_encoding = tokenizer(text=test_text, **encode_options)

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to an indexable of per-example
    values; ``labels`` is an indexable of per-example labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under "labels"."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        # The label is wrapped in a one-element list, so the tensor has shape (1,).
        item["labels"] = torch.tensor([self.labels[idx]])
        return item


# Wrap the encodings so the Trainer can index them.
train_dataset = TextDataset(train_encoding, train_labels)
test_dataset = TextDataset(test_encoding, test_labels)

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to("cuda")

# One interval for logging, evaluation and checkpointing. With
# load_best_model_at_end=True a checkpoint has to exist for every evaluation,
# so save_steps is set explicitly instead of relying on its default.
EVAL_EVERY_STEPS = 300

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=5e-5,
    logging_dir='./logs',
    load_best_model_at_end=True,
    logging_steps=EVAL_EVERY_STEPS,
    evaluation_strategy="steps",
    eval_steps=EVAL_EVERY_STEPS,
    save_steps=EVAL_EVERY_STEPS,
    gradient_accumulation_steps=16
)

# NOTE(review): the test split doubles as the evaluation set used to pick the
# best checkpoint, so the reported test accuracy is presumably optimistic -
# consider holding out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_acc,
)

trainer.train()

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(trainer.evaluate())


# Persist the fine-tuned model and its tokenizer side by side.
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


In [None]:
# evaluate BERT model on subsets of data

!pip3 install transformers numpy torch sklearn

from transformers import BertConfig, BertModel
from google.colab import drive
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score
import numpy as np
import pickle

# Mount Google Drive (forcing a remount) so the saved model and test data are reachable.
drive.mount("/content/drive", force_remount=True)

# Relative change of directory; the relative paths below are resolved from here.
%cd 'drive/MyDrive/Internship/Data'

# Directory the classifier and tokenizer are loaded from, and the truncation
# limit passed to the tokenizer in get_prediction().
model_path = "gpt_classifier"
max_length = 512

# Keys of the pickled test set: (file_name, temp) entries are scored as label 1
# and file_name entries as label 0 in the loop at the end of this cell.
temps = ["0_5", "0_75", "1_0", "1_25", "1_5"]
file_names = ["CNNArticles", "fb_edit", "ireland_headlines", "tweets_edited"]

# Hard-coded "cuda": this line needs a GPU runtime.
model = BertForSequenceClassification.from_pretrained(model_path).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# NOTE: pickle.load executes arbitrary code from the file; only load files this project wrote.
with open("test_data/test_data.pkl", "rb") as f:

    test_data = pickle.load(f)

def get_prediction(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenised (padded, truncated to ``max_length``), moved to the
    GPU and passed through the model.

    Returns:
        A 0-dim tensor with the argmax of the softmax over the model's first
        output; callers in this notebook convert it with ``int()``.

    Raises:
        Whatever the tokenizer raises. The offending text is printed first.
    """
    try:

      inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")

    except Exception:

      # Show the input that could not be tokenised, then surface the real
      # error. Falling through would only fail one line later, with
      # `inputs` unbound and the original cause hidden.
      print(text)
      raise

    outputs = model(**inputs)

    probs = outputs[0].softmax(1)

    return probs.argmax()


# Accuracy per corpus: all GPT temperatures (label 1) plus the human text (label 0).
for f in file_names:

    gpt_part = []
    for t in temps:
        # rows without text are dropped before scoring
        gpt_part.extend(test_data[(f, t)].iloc[:, 0].dropna())

    human_part = list(test_data[f].iloc[:, 0].dropna())

    x = gpt_part + human_part
    y = list(np.ones(len(gpt_part), dtype=int)) + list(np.zeros(len(human_part), dtype=int))

    y_hat = [int(get_prediction(sample)) for sample in x]

    print(y_hat[1:10])
    print(f)
    print(accuracy_score(y, y_hat))
    print("\n")









In [None]:
# Scratch check: `==` on two plain lists yields one bool for the whole list, not a per-element result.
[1,2,3] == [1,0,1]

False

In [None]:
# Scratch inspection of one GPT test frame. Relies on `test_data` already being
# defined in the kernel; the recorded output for this cell is a NameError.
print(test_data[("CNNArticles", "0_5")])

NameError: ignored

In [None]:
# Scratch check: print the type of every training entry that is not a plain str.
non_str_types = [type(entry) for entry in train_text if type(entry) != str]

for kind in non_str_types:

  print(kind)

In [None]:
import pandas as pd
from google.colab import drive
import numpy as np


import pickle

drive.mount("/content/drive", force_remount=True)

%cd 'drive/MyDrive/Internship/Data'

temps = ["0_5", "0_75", "1_0", "1_25", "1_5"]
file_names = ["CNNArticles", "fb_edit", "ireland_headlines", "tweets_edited"]

gpt_train_list = []
human_train_list = []

test_data = {}

# training data and test data (divided by temperature and corpus)

for f in file_names:

  for t in temps:

    df = pd.read_csv("gpt/" + f + t + ".csv", engine='python', error_bad_lines=False)

    if t == "1_0":

        indices = np.random.choice(range(df.shape[0]), 10000, replace=False)

    else:

        indices = np.random.choice(range(df.shape[0]), min(df.shape[0],2000), replace=False)

    df = df.iloc[indices, :]

    for i in range(df.shape[0]):

      df.iloc[i, 0] = df.iloc[i, 0].replace("<|startoftext|>", "")

    test_indices = np.random.choice(range(df.shape[0]), int(df.shape[0]*0.2), replace=False)
    train_indices = list(set(range(df.shape[0])) - set(test_indices))


    gpt_train_list.append(df.iloc[train_indices, :])

    test_data[(f, t)] = df.iloc[test_indices, :]

for f in file_names:


    df = pd.read_csv("human/" + f + ".csv")

    indices = np.random.choice(range(df.shape[0]), 18000, replace=False)

    df = df.iloc[indices, :]

    test_indices = np.random.choice(range(df.shape[0]), int(df.shape[0]*0.2), replace=False)
    train_indices = list(set(range(df.shape[0])) - set(test_indices))

    human_train_list.append(df.iloc[train_indices, :])

    test_data[f] = df.iloc[test_indices, :]


gpt_df = pd.concat(gpt_train_list)
human_df = pd.concat(human_train_list)

gpt_df.to_csv("train_data/gpt_text.csv", index = False)
human_df.to_csv("train_data/human_text.csv", index = False)

a = gpt_df.shape[0]
b = human_df.shape[0]

indices = np.random.choice(min(a,b), size=1000, replace=False)

gpt_df.iloc[indices, :].to_csv("test_gpt.csv", index=False)
human_df.iloc[indices, :].to_csv("test_human.csv", index=False)


with open("test_data/test_data.pkl", "wb") as f:

    pickle.dump(test_data, f)




In [None]:
# testing that pickled test data saved correctly

import pickle
import pandas as pd
from google.colab import drive

# Mount Google Drive (forcing a remount) so the pickled test data is reachable.
drive.mount("/content/drive", force_remount=True)

# Relative change of directory; "test_data/test_data.pkl" is resolved from here.
%cd 'drive/MyDrive/Internship/Data'

# NOTE: pickle.load executes arbitrary code from the file; only load files this project wrote.
with open("test_data/test_data.pkl", "rb") as f:

    test_data = pickle.load(f)

# The keys mix plain file names and (file_name, temp) tuples.
print(test_data.keys())

# One entry of each key kind, printed in full.
print(test_data["ireland_headlines"])
print(test_data[("CNNArticles", "0_75")])

# Shape of every stored entry.
for i in test_data.keys():

  print(test_data[i].shape)



In [None]:
import numpy as np

# Scratch check of np.random.choice: draw 10 values from range(1000) and print them.
print(np.random.choice(range(1000), 10))

[767 364 630 279 976 506 114 781   9  23]


In [None]:
!pip3 install transformers numpy torch sklearn

import pickle
import pandas as pd
import nltk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer

from google.colab import drive

drive.mount("/content/drive", force_remount=True)

%cd 'drive/MyDrive/Internship/Data'

file_names = ["CNNArticles", "fb_edit", "ireland_headlines", "tweets_edited"]
temps = ["0_5", "0_75", "1_0", "1_25", "1_5"]
model_path = "gpt_classifier"
max_length = 512


test_data = pickle.load(open("test_data/new_test_data.pkl", "rb"))

model = BertForSequenceClassification.from_pretrained("results/checkpoint-4200").to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)

def get_prediction(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenised (padded, truncated to ``max_length``), moved to the
    GPU and passed through the model.

    Returns:
        A 0-dim tensor with the argmax of the softmax over the model's first
        output; callers in this notebook convert it with ``int()``.

    Raises:
        Whatever the tokenizer raises. The offending text is printed first.
    """
    try:

      inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")

    except Exception:

      # Show the input that could not be tokenised, then surface the real
      # error. Falling through would only fail one line later, with
      # `inputs` unbound and the original cause hidden.
      print(text)
      raise

    outputs = model(**inputs)

    probs = outputs[0].softmax(1)

    return probs.argmax()

# Accuracy on GPT text only (every label is 1), first per (temperature, corpus) pair ...
for t in temps:

  for f in file_names:

      x = test_data[(f,t)][0]
      y = list(np.ones(len(x)))
      y_hat = [int(get_prediction(sample)) for sample in x]

      # one value per line, followed by the same blank lines as separate print calls
      print(accuracy_score(y, y_hat), f, t, "\n", sep="\n")


# ... then per temperature with all corpora pooled.
for t in temps:

  x = [sample for name in file_names for sample in test_data[(name,t)][0]]
  y = list(np.ones(len(x)))

  y_hat = [int(get_prediction(sample)) for sample in x]

  print(accuracy_score(y, y_hat), t, "\n", sep="\n")



# Pool, per corpus, the human texts (label 0) with the GPT texts of every temperature (label 1).
# NOTE(review): x and y are rebuilt on each iteration and never scored or stored
# here, and the remainder of this cell does not appear to read them - presumably
# an evaluation step is missing, or this loop is leftover. Confirm.
for f in file_names:

    x = test_data[f][0]
    y = list(np.zeros(len(test_data[f][0])))

    for t in temps:

        x = x + test_data[(f,t)][0]
        y = y + list(np.ones(len(test_data[(f, t)][0])))


# get accuracy by text-length quartile, reported separately for each corpus

for f in file_names:

    # Start from scratch for every corpus. Carrying the lists over from one
    # iteration to the next would mix earlier corpora into the figures
    # printed under this corpus' name.
    test_x = list(test_data[f][0])
    x_length = list(test_data[f][1])
    test_y = list(np.zeros(len(test_data[f][0])))   # human text -> label 0

    for t in temps:

        test_x = test_x + test_data[(f,t)][0]
        x_length = x_length + test_data[(f,t)][1]

        test_y = test_y + list(np.ones(len(test_data[(f,t)][0])))   # GPT text -> label 1

    # now we have x, y and lengths of x

    q1, q2, q3 = np.percentile(x_length, [25, 50, 75])

    # Half-open (lower, upper] length ranges; the infinities make the outer
    # buckets behave like "<= q1" and "> q3".
    bounds = [(-np.inf, q1), (q1, q2), (q2, q3), (q3, np.inf)]

    accuracies = []

    for lower, upper in bounds:

        picked = [i for i, n in enumerate(x_length) if lower < n <= upper]

        if not picked:
            # Possible when quartile boundaries coincide; nothing to score.
            accuracies.append(float("nan"))
            continue

        bucket_y = [test_y[i] for i in picked]
        bucket_y_hat = [int(get_prediction(test_x[i])) for i in picked]

        accuracies.append(accuracy_score(bucket_y, bucket_y_hat))

    # Output order: accuracy of a bucket, then the boundary that closes it.
    print(accuracies[0])
    print(q1)
    print(accuracies[1])
    print(q2)
    print(accuracies[2])
    print(q3)
    print(accuracies[3])

    print(f)
    print("\n")






In [None]:
!pip3 install transformers numpy torch sklearn

import pickle
import pandas as pd
import nltk
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification, AutoTokenizer

from google.colab import drive

drive.mount("/content/drive", force_remount=True)

%cd 'drive/MyDrive/Internship/Data'

file_names = ["CNNArticles", "fb_edit", "ireland_headlines", "tweets_edited"]
temps = ["0_5", "0_75", "1_0", "1_25", "1_5"]
model_path = "gpt_classifier"
max_length = 512


test_data = pickle.load(open("test_data/new_test_data.pkl", "rb"))

model = BertForSequenceClassification.from_pretrained("results/checkpoint-4200").to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)

def get_prediction(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenised (padded, truncated to ``max_length``), moved to the
    GPU and passed through the model.

    Returns:
        A 0-dim tensor with the argmax of the softmax over the model's first
        output; callers in this notebook convert it with ``int()``.

    Raises:
        Whatever the tokenizer raises. The offending text is printed first.
    """
    try:

      inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")

    except Exception:

      # Show the input that could not be tokenised, then surface the real
      # error. Falling through would only fail one line later, with
      # `inputs` unbound and the original cause hidden.
      print(text)
      raise

    outputs = model(**inputs)

    probs = outputs[0].softmax(1)

    return probs.argmax()

# get accuracy by text-length decile, separately for each temperature
# (GPT text only, so every label is 1)

test_x = {}
test_y = {}
x_length = {}

q = {}



for t in temps:

    test_x[t] = []
    x_length[t] = []
    test_y[t] = []

    for f in file_names:

        test_x[t] = test_x[t] + test_data[(f,t)][0]
        x_length[t] = x_length[t] + test_data[(f,t)][1]

        test_y[t] = test_y[t] + list(np.ones(len(test_data[(f,t)][0])))

    # Eleven boundaries (0th, 10th, ..., 100th percentile) delimit ten buckets.
    # Stopping at the 90th percentile would leave the longest tenth unscored.
    q[t] = [np.percentile(x_length[t], i) for i in range(0, 101, 10)]

print(q)
for t in temps:

  print(t)

  for i in range(1, len(q[t])):

    lower, upper = q[t][i-1], q[t][i]

    # Buckets are (lower, upper], except the first, which also includes its
    # lower bound so texts of minimum length are counted.
    keep = [(lower <= n <= upper) if i == 1 else (lower < n <= upper) for n in x_length[t]]

    x_ = [x for x, k in zip(test_x[t], keep) if k]
    y_ = [y for y, k in zip(test_y[t], keep) if k]

    if not x_:
      # Possible when neighbouring percentiles coincide; nothing to score.
      print(float("nan"))
      continue

    y_hat = [int(get_prediction(x)) for x in x_]

    print(accuracy_score(y_, y_hat))

  print("\n")


In [None]:
import pickle
import pandas as pd
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

%cd 'drive/MyDrive/Internship/Data'

# convert test data to form with length attached

with open("test_data/test_data.pkl", "rb") as f:

  test_data = pickle.load(f)

new_test_data = {}

for k in test_data.keys():

   text = list(test_data[k].iloc[:, 0])
   text = [str(x) for x in text]
   length = [len(x) for x in text]
   new_test_data[k] = [text, length]

   print(type(new_test_data[k][0]))

   print(new_test_data[k][0][0:10])
   print(new_test_data[k][1][0:10])

with open("test_data/new_test_data.pkl", "wb") as f:

  pickle.dump(new_test_data, f)
