In [2]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

import re
import pandas as pd
from collections import Counter

In [3]:
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

set_seed(1)

In [4]:
# the model we're going to train, base uncased BERT
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample
max_length = 512

In [5]:
# load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [6]:
def read_and_clean_lines(data):   
    NeuScores = []
    Statuses = []

   
    for line in data:
        
        NeuScore = line[1]
        if NeuScore == 'y':
            NeuScore = 0
        else:
            NeuScore = 1
        
        Status = line[2]
        
        clean_text = re.sub(r"\s+"," ",Status)
        Statuses.append(clean_text)
        NeuScores.append(NeuScore)
        
    print("Read {} documents".format(len(Statuses)))
    print("Read {} labels".format(len(NeuScores)))
    return Statuses,NeuScores

# Call sklearn's train_test_split function to split the dataset into training items/labels and test items/labels.  

def split_training_set(lines, labels, test_size, random_seed=42):
    X_train, X_test, y_train, y_test = train_test_split(lines, labels, test_size=test_size, random_state=random_seed, stratify=labels)
    print("Training set label counts: {}".format(Counter(y_train)))
    print("Test set     label counts: {}".format(Counter(y_test)))
    return X_train, X_test, y_train, y_test


#format the MyPersonality data

df = pd.read_csv ('MyPersonalityData.csv')
df = df.groupby('#AUTHID').agg({'cNEU':'first', 
                             'STATUS': ', '.join }).reset_index()
data = df.values.tolist()

#clean the data
X, y = read_and_clean_lines(data)

#split the data
train_texts, valid_texts, train_labels, valid_labels = split_training_set(X, y, test_size = 0.1)
target_names = ['Neurotic','Not_Neurotic']


Read 250 documents
Read 250 labels
Training set label counts: Counter({1: 136, 0: 89})
Test set     label counts: Counter({1: 15, 0: 10})


In [7]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=max_length)

In [8]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = TweetDataset(train_encodings, train_labels)
valid_dataset = TweetDataset(valid_encodings, valid_labels)

In [9]:
# load the model and pass to CUDA
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=200,               # log & save weights each logging_steps
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [12]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [13]:
# train the model
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=45, training_loss=0.6650547875298394, metrics={'train_runtime': 3654.7524, 'train_samples_per_second': 0.012, 'total_flos': 227025562060800, 'epoch': 3.0})

In [14]:
#evaluate the model
load_best_model_at_end = True
trainer.evaluate()

{'eval_loss': 0.6851962208747864,
 'eval_accuracy': 0.6,
 'eval_runtime': 41.921,
 'eval_samples_per_second': 0.596,
 'epoch': 3.0}

In [15]:
model_path = "mypersonality-bert-base-uncased"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('mypersonality-bert-base-uncased\\tokenizer_config.json',
 'mypersonality-bert-base-uncased\\special_tokens_map.json',
 'mypersonality-bert-base-uncased\\vocab.txt',
 'mypersonality-bert-base-uncased\\added_tokens.json')

In [16]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")#.to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

In [62]:
labels_list = []
for post in train_texts:
    text = post
    #print(get_prediction(text))
    labels_list.append(get_prediction(text))

In [63]:
print(len(train_labels))
print(len(labels_list))
print(train_labels)
#print(valid_dataset.labels)
#print(target_names[valid_labels])

225
225
[0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]


In [59]:
#format the Essays data

df1 = pd.read_csv ('Essays.csv')
df1 = df1.groupby('#AUTHID').agg({'cNEU':'first', 
                             'TEXT': ', '.join }).reset_index()
data1 = df1.values.tolist()

#clean the data
X1, y1 = read_and_clean_lines(data1)
print(X1[0])
print(y1)
print(len(y1))

#split the data
train_texts1, valid_texts1, train_labels1, valid_labels1 = split_training_set(X1, y1, test_size = 0.1)
target_names = ['Neurotic','Not_Neurotic']

train_encodings1 = tokenizer(train_texts1, truncation=True, padding=True, max_length=max_length)
valid_encodings1 = tokenizer(valid_texts1, truncation=True, padding=True, max_length=max_length)

train_dataset1 = TweetDataset(train_encodings1, train_labels1)
valid_dataset1 = TweetDataset(valid_encodings1, valid_labels1)

Read 2468 documents
Read 2468 labels
The lights are all out here in Hardin House. Thank goodness Amy has this lap top so I can do this assignment. I still have to do the other one too. They both are due Friday. The lights just came on. I'm so relieved because now I can feel the air conditioning. I'm a little bit off the assignment because Cara is watching me. Cara says hi. she is really sick right now and her nose is always runny. she snotted on my bed. or maybe she just drooled. I don't really know. Tonight I need to go to the theta house and study. We have to get certain amount of hours done . I don't really want to read anymore of the Great Plains by Webb. It is like a history book. It makes me want to fall asleep sometimes when I read it. The most interesting section was about the animals on the Plains. The little jackrabbit has a white patch on its butt so that whenever it is in danger or something it flares it up to communicate with it's own kind. I guess that's how it works. Rig

In [60]:
labels_list1 = []
for post in X1:
    text = post
    #print(get_prediction(text))
    labels_list1.append(get_prediction(text))

In [65]:
print(labels_list)
for i in range(len(labels_list)):
    if labels_list[i] == 'Neurotic':
        print(labels_list[i], i)
#print(valid_dataset1.labels)
#print(len(valid_labels1))
for i in range(len(train_labels)):
    if train_labels[i] == 1:
        train_labels[i] = 'Not_Neurotic'
    if train_labels[i] == 0:
        train_labels[i] = 'Neurotic'
#print(y1)
#print(valid_labels1)
#print(valid_labels1[157])
correct = 0
for i in range(len(labels_list)):
    if labels_list[i] == train_labels[i]:
        correct = correct + 1
accuracy = correct/len(labels_list)
print(accuracy)

['Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Neurotic', 'Not_Ne

In [41]:
print(valid_texts1)



In [42]:
print(len(valid_texts1))

247


In [44]:
print(valid_labels1)

I really wish that I could get on to the psychology pretesting page to get it over with but I've tried three times and it won't let me on. It's really driving me crazy. I feel like I have so much stuff that I need to get done right know, but if I space it out I can get it done. I need to learn to manage my time a little better. It was easy to do everything at the last minute in high school, but I'm in college know and I need to keep up with my assignments. I'm sure other people are feeling the same as me. If I could get everything done in one day I would feel so relieved, but I have to much to do in one day. My two other friends, live in an apartment, and there so boring. All they do is sit at home all day and watch TV. I wish that they would go and do something, but if I asked them to do something they would probably do it. But, I just haven't asked. I'm still adjusting to moving. I come from a really small town, of about 2600 people, and I am overwhelmed at the amount of people that 