## INSTALLING THE NECESSARY DEPENDENCIES:

In [1]:
##(install and restart runtime)
#!pip install --upgrade accelerate
#!pip install transformers[sentencepiece]

In [2]:
import torch
import numpy as np
import pandas as pd

!pip install transformers datasets huggingface_hub tensorboard==2.11
!sudo apt-get install git-lfs --yes
import os

from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

torch.__version__


from huggingface_hub import HfFolder, notebook_login

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 15 not upgraded.


In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
repository_id = "Geee05/overlap"

In [5]:
dataset = pd.read_csv('/content/All_transcript_train.csv')
tdataset= pd.read_csv('/content/All_transcript_test_new.csv')

## DATA PRE-PROCESSING

In [6]:
dataset.rename(columns = {'Class':'labels'}, inplace = True)
tdataset.rename(columns = {'Group':'labels'}, inplace = True)

In [7]:
dataset.replace({'labels':{"HC":0,"DP":1}},inplace=True)
tdataset.replace({'labels':{"HC":0,"DP":1}},inplace=True)

In [8]:
dataset['labels'].value_counts()

0    126
1     37
Name: labels, dtype: int64

In [9]:
#resample data to deal with class imbalance
from sklearn.utils import resample
df_1=dataset[dataset['labels']==1]
df_2=dataset[dataset['labels']!=1]
df_upsampled=resample(df_1,random_state=42,n_samples=126,replace=True)
df=pd.concat([df_upsampled,df_2])
df.reset_index(inplace=True)

In [23]:
#creating Datasets
train = Dataset.from_pandas(df, preserve_index=False)
test = Dataset.from_pandas(tdataset, preserve_index=False)

print(train)
print(test)

Dataset({
    features: ['index', 'Unnamed: 0', 'File', 'Text', 'labels'],
    num_rows: 252
})
Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'File', 'Text', 'labels'],
    num_rows: 56
})


In [24]:
leng=[]
for i in range (0,len(train['Text'])):
  leng.append(len(train['Text'][i]))

print(min(leng))
print(max(leng))

1362
21064


### Function Definition:

In [31]:
#SEGMENT TEXTS
def segment_function(example):
  seg=example['Text']
  res=[]
  n=15000

  for iter in range (0,len(seg)):
    for i in range (0,len(seg[iter])):
      j=i*7500

      if len(seg[iter])<15000:
          part=(seg[iter])
          res.append(part)
          break
      elif j<=len(seg[iter]):
          part=(seg[iter][j:j+n])
          res.append(part)
      else:
        break

  return res

#TOKENIZE TEXTS
tokenizer = AutoTokenizer.from_pretrained('gooohjy/suicidal-electra')
def tokenize_function(example):
  return tokenizer(example["Text"], padding='max_length',max_length=512,truncation=True,add_special_tokens = True)

In [13]:
#LABELS CORR. TO SEGMENTS
label=[]
for i in range (0,len(train)):
  if (len(train['Text'][i])) <15000:
    label.append(train['labels'][i])
  else:
    x = ((len(train['Text'][i]))//7500) + 1
    for j in range (0,x):
      label.append(train['labels'][i])



label_t=[]
for i in range (0,len(test)):
  if (len(test['Text'][i])) <15000:
    label_t.append(test['labels'][i])
  else:
    x = ((len(test['Text'][i]))//7500) + 1
    for j in range (0,x):
      label_t.append(test['labels'][i])

#### Segmenting Train Data

In [32]:
ip_seg=segment_function(train)

In [35]:
len(ip_seg)

276

In [36]:
#preparing dataframe for segmented texts & corresponding labels
train_data=pd.DataFrame(ip_seg,columns=['Text'])
train_data['labels']=label
train_data.head(2)

Unnamed: 0,Text,labels
0,okay hi I'm Ellie thanks for coming in today ...,1
1,and please are you okay yes I'm alright with ...,1


#### Tokenizing Train Data

In [37]:
trainset = Dataset.from_pandas(train_data, preserve_index=False)
trainset

Dataset({
    features: ['Text', 'labels'],
    num_rows: 276
})

In [38]:
tokenized_train_dataset = trainset.map(tokenize_function, batched=True)
tokenized_train_dataset

Map:   0%|          | 0/276 [00:00<?, ? examples/s]

Dataset({
    features: ['Text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 276
})

#### Segmenting Test Data

In [39]:
ip_seg_t=segment_function(test)

test_data=pd.DataFrame(ip_seg_t,columns=['Text'])
test_data['labels']=label_t
test_data.head(2)

Unnamed: 0,Text,labels
0,okay there she is coming to go ahead and Shrin...,0
1,this is super need I like this me either at a...,1


#### Tokenizing Test Data

In [40]:
testset = Dataset.from_pandas(test_data, preserve_index=False)
testset

Dataset({
    features: ['Text', 'labels'],
    num_rows: 58
})

In [41]:
tokenized_test_dataset = testset.map(tokenize_function, batched=True)
tokenized_test_dataset

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Dataset({
    features: ['Text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 58
})

### Review

In [42]:
#tokenizer.model_max_length =512

In [43]:
print(len(label))
print(len(label_t))


276
58


In [44]:
print(len(ip_seg))
print(len(ip_seg_t))


276
58


In [45]:
type(tokenized_train_dataset)

datasets.arrow_dataset.Dataset

## TRAINING

In [46]:
# Set dataset format using set_format() function to specify the dataset format, making it compatible with PyTorch.
tokenized_train_dataset.set_format("torch", columns=['input_ids','attention_mask','labels'])
tokenized_test_dataset.set_format("torch", columns=['input_ids','attention_mask','labels'])

In [47]:
os.environ["WANDB_DISABLED"] = "true"

In [53]:
from sklearn.metrics import classification_report

training_args = TrainingArguments(evaluation_strategy="epoch",num_train_epochs=35,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token())   # default arguments for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained('gooohjy/suicidal-electra', num_labels=2) #,ignore_mismatched_sizes=True
def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    #probabilities = softmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}
    print((classification_report(labels,predictions)))

trainer = Trainer(   # specifying trainer class
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    #data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


/content/Geee05/overlap is already a clone of https://huggingface.co/Geee05/overlap. Make sure you pull the latest changes with `repo.git_pull()`.


In [54]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.5963,1.004066,0.741379,0.482759
2,1.1164,1.35289,0.586207,0.5
3,0.9122,1.659758,0.465517,0.491803
4,0.5366,0.934164,0.603448,0.510638
5,0.3298,1.138318,0.603448,0.510638
6,0.2497,1.174823,0.689655,0.526316
7,0.0676,1.485095,0.724138,0.529412
8,0.0011,1.706962,0.724138,0.529412
9,0.0003,1.793563,0.741379,0.545455
10,0.0002,2.027387,0.724138,0.529412


TrainOutput(global_step=1225, training_loss=0.1781041769952423, metrics={'train_runtime': 2800.6691, 'train_samples_per_second': 3.449, 'train_steps_per_second': 0.437, 'total_flos': 2541652794777600.0, 'train_loss': 0.1781041769952423, 'epoch': 35.0})

In [55]:
trainer.state.best_model_checkpoint

'Geee05/overlap/checkpoint-315'

In [57]:
# Save our tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

Upload file logs/events.out.tfevents.1689075518.e36198f2fb2b.5406.3:   0%|          | 1.00/35.7k [00:00<?, ?B/…

To https://huggingface.co/Geee05/overlap
   1a179d1..b5617d0  main -> main

   1a179d1..b5617d0  main -> main



'https://huggingface.co/Geee05/overlap/commit/b5617d0bd962c4b305279a5a85774bdf69ecdd2b'

## VALIDATION

In [58]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Geee05/overlap")
model_electra = AutoModelForSequenceClassification.from_pretrained("Geee05/overlap")

In [59]:
vdataset= pd.read_csv('/content/All_transcript_val_new.csv')

In [60]:
vdataset.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,File,Text,Group
0,0,0,600_AUDIO,okay there she is coming to go ahead and Shrin...,HC
1,1,1,602_AUDIO,this is super need I like this me either at a...,DP


In [61]:
vdataset.rename(columns = {'Group':'labels'}, inplace = True)
vdataset.replace({'labels':{"HC":0,"DP":1}},inplace=True)
vdataset['labels'].value_counts()

0    39
1    17
Name: labels, dtype: int64

In [62]:
val = Dataset.from_pandas(vdataset, preserve_index=False)
print(val)

Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'File', 'Text', 'labels'],
    num_rows: 56
})


In [70]:
#LABELS CORR. TO SEGMENTS

label_v=[]
for i in range (0,len(val)):
  if (len(val['Text'][i])) <15000:
    label_v.append(val['labels'][i])
  else:
    x = ((len(val['Text'][i]))//7500) + 1
    for j in range (0,x):
      label_v.append(val['labels'][i])

In [71]:
ip_seg_v=segment_function(val)

In [72]:
print(len(label_v))
print(len(ip_seg_v))

58
58


In [73]:
ip_seg_v=segment_function(val)

val_data=pd.DataFrame(ip_seg_v,columns=['Text'])
val_data['labels']=label_v
val_data.head(2)

Unnamed: 0,Text,labels
0,okay there she is coming to go ahead and Shrin...,0
1,this is super need I like this me either at a...,1


In [74]:
valset = Dataset.from_pandas(val_data, preserve_index=False)
valset

Dataset({
    features: ['Text', 'labels'],
    num_rows: 58
})

In [75]:
tokenized_val_dataset = valset.map(tokenize_function, batched=True)
tokenized_val_dataset

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Dataset({
    features: ['Text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 58
})

In [76]:
tokenized_val_dataset.set_format("torch", columns=['input_ids','attention_mask','labels'])

In [77]:
tokenized_val_dataset

Dataset({
    features: ['Text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 58
})

In [None]:
tokenized_val_dataset['input_ids'].shape

torch.Size([71, 512])

In [78]:
logits=[]
labels=[]
for i in range (0,len(tokenized_val_dataset['Text'])):

  inputs = tokenizer(tokenized_val_dataset['Text'][i],padding='max_length',max_length=512,truncation=True,add_special_tokens = True, return_tensors="pt")
  with torch.no_grad():
    outputs = model_electra(**inputs)
    logits_l = model_electra(**inputs).logits
    #print(logits)
    logits.append(logits_l)
    labels_i = logits_l.argmax().item()
  labels.append(labels_i)

  #print(labels)

    #print(outputs)
  #print(np.array(inputs['input_ids']).shape)
  #print(np.array(inputs['token_type_ids']).shape)
  #print(np.array(inputs['attention_mask']).shape)
  #print(**inputs.shape)

  #last_hidden_states = outputs.last_hidden_state
  #result.append(outputs)

In [79]:
len(labels)

58

In [80]:
len(logits)

58

In [81]:
logits

[tensor([[-0.2904,  0.2225]]),
 tensor([[ 4.6095, -4.6444]]),
 tensor([[ 4.7588, -4.8537]]),
 tensor([[ 4.7731, -4.8482]]),
 tensor([[ 5.0388, -5.1447]]),
 tensor([[ 3.5424, -3.5500]]),
 tensor([[ 5.0549, -5.1543]]),
 tensor([[ 4.9254, -5.0034]]),
 tensor([[ 4.3699, -4.4225]]),
 tensor([[ 3.4817, -3.4906]]),
 tensor([[ 4.6348, -4.6966]]),
 tensor([[ 0.4240, -0.5027]]),
 tensor([[ 4.6962, -4.7387]]),
 tensor([[-4.9565,  4.4561]]),
 tensor([[ 4.6513, -4.7107]]),
 tensor([[ 4.0101, -4.0449]]),
 tensor([[ 4.1006, -4.0973]]),
 tensor([[-4.1458,  3.7066]]),
 tensor([[ 4.6119, -4.6484]]),
 tensor([[ 4.8490, -4.9112]]),
 tensor([[ 4.0216, -4.0048]]),
 tensor([[-4.3153,  3.9245]]),
 tensor([[ 4.6414, -4.7118]]),
 tensor([[ 3.9913, -4.0199]]),
 tensor([[-2.9366,  2.6586]]),
 tensor([[ 4.7997, -4.8814]]),
 tensor([[ 3.9065, -3.9110]]),
 tensor([[ 3.2822, -3.2865]]),
 tensor([[ 4.3038, -4.3698]]),
 tensor([[ 4.0772, -4.1131]]),
 tensor([[-4.0504,  3.6831]]),
 tensor([[ 3.7329, -3.7316]]),
 tensor(

In [82]:
true_labels=[]
for i in range (0,len(tokenized_val_dataset['labels'])):
  true_labels.append(tokenized_val_dataset['labels'][i])


In [85]:
len(true_labels)

58

In [84]:
from sklearn.metrics import classification_report

print(classification_report(true_labels,labels))

              precision    recall  f1-score   support

           0       0.77      0.87      0.82        39
           1       0.64      0.47      0.55        19

    accuracy                           0.74        58
   macro avg       0.71      0.67      0.68        58
weighted avg       0.73      0.74      0.73        58

