In [1]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import Trainer, AutoConfig
from datasets import load_dataset, load_metric
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score, roc_curve, auc
import torch.nn as nn

In [2]:
# Sys setting
device = torch.device('cuda:0')
checkpoint = 'facebook/bart-large-mnli'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
nli_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
metric = load_metric("accuracy")

In [3]:
raw_datasets = load_dataset('csv', data_files={'train': 'train_DF.csv',
                                            #    'validation': 'dataset/all_data_val/all_data_val.csv',
                                               'test': 'test_DF.csv'})
print(raw_datasets)

raw_datasets = raw_datasets.shuffle()

Using custom data configuration default
Reusing dataset csv (/home/hdu5/.cache/huggingface/datasets/csv/default-b94c376aa2f3b39d/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2)
Loading cached shuffled indices for dataset at /home/hdu5/.cache/huggingface/datasets/csv/default-b94c376aa2f3b39d/0.0.0/2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2/cache-5cb669ffb0595772.arrow


DatasetDict({
    train: Dataset({
        features: ['pid', 'document', 'summary', 'classification'],
        num_rows: 387
    })
    test: Dataset({
        features: ['pid', 'document', 'summary', 'classification'],
        num_rows: 97
    })
})


In [4]:
def train_preprocess_function(examples):
    model_inputs = tokenizer(examples['document'], ['This text is about problems']*len(examples['document']),
                             padding='max_length', truncation_strategy='only_first')

    model_inputs['label'] = examples['classification']

    return model_inputs

'''
def train_preprocess_function(examples):

    model_inputs = tokenizer(examples['REVIEW']*2, ['This review is positive']*len(examples['REVIEW'])+
                                    ['This review is negative']*len(examples['REVIEW']),
                                    padding='max_length', truncation_strategy='only_first')

    model_inputs['label'] = examples["TAG_POSITIVE_TONE"] + [1-e for e in examples["TAG_POSITIVE_TONE"]]

    model_inputs['REVIEW'] = examples['REVIEW']*2
    model_inputs['TAG_SUGGESTION'] = examples['TAG_SUGGESTION']*2
    model_inputs['TAG_PROBLEM'] = examples['TAG_PROBLEM']*2
    model_inputs['TAG_POSITIVE_TONE'] = examples['TAG_POSITIVE_TONE']*2

    return model_inputs
'''

# This text is expresses positive emotions
# This text is about problems
# This text is about suggestions
def test_preprocess_function(examples):
    model_inputs = tokenizer(examples['document'], ['This text is about problems'] * len(examples['document']),
                             padding='max_length', truncation_strategy='only_first')

    model_inputs['label'] = examples['classification']

    return model_inputs


In [18]:
train_tokenized_dataset = raw_datasets['train'].map(train_preprocess_function, batched=True)
test_tokenized_dataset = raw_datasets['test'].map(test_preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

AttributeError: 'list' object has no attribute 'map'

In [26]:
train_tokenized_dataset['input_ids']

[[0,
  9089,
  11497,
  10071,
  41552,
  13043,
  766,
  40635,
  3463,
  11497,
  10071,
  15698,
  26721,
  10071,
  28696,
  12139,
  15698,
  49703,
  13043,
  15698,
  16,
  41,
  490,
  1300,
  3748,
  16698,
  13,
  5,
  304,
  9,
  258,
  521,
  8,
  18341,
  4,
  85,
  16,
  2226,
  15,
  19642,
  15,
  45347,
  1761,
  4,
  901,
  335,
  15,
  26721,
  10071,
  64,
  28,
  303,
  28696,
  12139,
  15698,
  479,
  20,
  1300,
  3260,
  34,
  57,
  13,
  10916,
  8,
  3741,
  11469,
  13,
  442,
  24785,
  7,
  5,
  7,
  2658,
  8823,
  17554,
  4,
  152,
  47068,
  1639,
  41,
  8339,
  88,
  84,
  5694,
  7,
  5,
  384,
  8108,
  26721,
  10071,
  2502,
  6,
  5650,
  15,
  8526,
  7257,
  5137,
  5,
  11624,
  44036,
  1790,
  293,
  33737,
  4,
  28696,
  20094,
  15698,
  28696,
  14595,
  15698,
  36422,
  112,
  4,
  134,
  4,
  28696,
  12139,
  15698,
  112,
  4,
  134,
  4,
  134,
  4,
  28696,
  12139,
  15698,
  112,
  4,
  134,
  4,
  176,
  4,
  28696,
  12139,
 

In [19]:
from transformers import BartTokenizer, BartForSequenceClassification
import torch

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large')

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
loss, logits = outputs[:2]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
loss

tensor(1.0627, grad_fn=<NllLossBackward0>)

In [21]:
logits

tensor([[0.0666, 0.0932, 0.0102]], grad_fn=<AddmmBackward0>)

In [22]:
inputs

{'input_ids': tensor([[    0, 31414,     6,   127,  2335,    16, 11962,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')

def extract_keywords(feedback, num_keywords=10):
    # Preprocessing
    feedback = feedback.lower()
    
    # Tokenization
    words = word_tokenize(feedback)
    
    # Stopword Removal
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    
    # Frequency Analysis
    word_freq = Counter(stemmed_words)
    keywords = [item[0] for item in word_freq.most_common(num_keywords)]
    
    return keywords

feedback = "The doc could be significantly improved by providing code examples of how to use the SSO service on each use case. e.g., what did you change on the rainbow service to use the SSO?"
print(extract_keywords(feedback))

['use', 'sso', 'servic', 'doc', 'could', 'significantli', 'improv', 'provid', 'code', 'exampl']


[nltk_data] Downloading package punkt to /home/hdu5/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hdu5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [1]:
import spacy

2023-09-27 21:49:59.323749: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-27 21:49:59.324023: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-09-27 21:49:59.324137: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
import spacy
from collections import Counter

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

def extract_keywords_spacy(feedback, num_keywords=10):
    # Preprocessing
    doc = nlp(feedback.lower())
    
    # Token Filtering (Considering nouns, adjectives, and verbs as potential keywords)
    filtered_tokens = [token.text for token in doc if token.is_alpha and token.pos_ in ["NOUN", "ADJ", "VERB"] and not token.is_stop]
    
    # Frequency Analysis
    token_freq = Counter(filtered_tokens)
    keywords = [item[0] for item in token_freq.most_common(num_keywords)]
    
    return keywords

feedback = "The student demonstrates a clear understanding of the subject. However, there's room for improvement in presenting complex ideas."

print(extract_keywords_spacy(feedback))




['student', 'demonstrates', 'clear', 'understanding', 'subject', 'room', 'improvement', 'presenting', 'complex', 'ideas']


In [3]:
import nltk
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample instructor feedback (you can replace this with your dataset)
instructor_feedbacks = [
    "The wiki was to the point and explains all the code changes they have made. Why they touched some parts of code and why they didn't touch the others. Their test plan mentions a list of all the test cases that are currently covered. It's not clear how that list is organized, and given that there are dozens of items, there needs to be a logical ordering to them. Also, they did not mention how they have tested it manually. Some reviewers complained that they did not know how to test it properly since their test plan did not cover it. ",
    "The doc could be significantly improved by providing code examples of how to use the SSO service on each use case. e.g., what did you change on the rainbow service to use the SSO?",
    "Design doc is quite readable.  Would have helped if there was a subheading for each change.  Other than that, it seems fine.",
    "The document describes the project well in narrative fashion.  It seems to suggest that revisions can be solicited after any round, not just after Round 1.  I don't think that's consistent with the code.  There are at least two points where notes about the project are in brackets in the design doc.  These should have been removed for the final version.  I don't think that the code modifications, especially the ones to controllers, are adequately described.  They are just listed, with no explanation of why the change was needed.  Similarly, the testing plan should explain what is tested.",
    "The document has everything that we asked for. LGTM!",
    "About half of the report is screenshots of existing functionality.  Only a few stylistic changes seem to have been made.  Style is now better, but these are not real refactorings.",
    "Would have been helpful to describe why those mock instances were created (why 2 participants, for example).  The individual get_permission and get_authorization contexts should be explained.  Otherwise, the descriptions are quite good.",
    "Good description of the changes made, and the rationale for them.  You attempted to summarize how many issues were resolved.  A textual summary of the most common would also have been helpful.",
    "This design doc does not really say much about how the proposed solution is implemented.  There are no references to the code.  The need tor a new grading_history_controller is not established.  It would be better to include references to the changed code on Github and describe the changes that were made.  Also, the screenshots should be smaller, so that one does not need to zoom way out.",
    "Writeup clearly explains what was changed, but not how it was changed.  For example a number of instance variables were removed and replaced by a call to get_assigned_surveys, but get_assigned_surveys is not described.  Design principles are mentioned, but no mention of design patterns."
]

# Tokenize and preprocess the feedbacks
stop_words = set(stopwords.words('english'))
tokenized_feedbacks = [word_tokenize(feedback.lower()) for feedback in instructor_feedbacks]
filtered_feedbacks = [[word for word in tokens if word.isalnum() and word not in stop_words] for tokens in tokenized_feedbacks]

# Create a Gensim dictionary and corpus
dictionary = gensim.corpora.Dictionary(filtered_feedbacks)
corpus = [dictionary.doc2bow(tokens) for tokens in filtered_feedbacks]

# Build the LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Get the top words for each topic
topics = lda_model.print_topics(num_words=5)

# Extract rubric-related terms from the topics
rubric_terms = []

for topic in topics:
    terms = topic[1].split("+")
    for term in terms:
        term = term.strip().split("*")[1].strip(' "')
        rubric_terms.append(term)

# Print the rubric-related terms
print("Generated Rubric Terms:")
print(rubric_terms)


Generated Rubric Terms:
['code', 'test', 'plan', 'also', 'changes', 'design', 'would', 'doc', 'changed', 'use']
