## Google Drive Mount

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files stored on Drive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Import

In [2]:
import os
import torch
import random
import shutil
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd

from collections import Counter

## Load Dataset

In [3]:
# Dataset directory under the mounted Drive; train.csv is read from here.
data_path = '/content/drive/MyDrive/dacon_sentiment_analysis/dataset'

In [8]:
# Read the training split from the dataset directory and preview its first rows.
train_csv_file = os.path.join(data_path, 'train.csv')
data_csv = pd.read_csv(train_csv_file)
data_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,You must’ve had your hands full.,The Interviewer,0,neutral
2,TRAIN_0002,That I did. That I did.,Chandler,0,neutral
3,TRAIN_0003,So let’s talk a little bit about your duties.,The Interviewer,0,neutral
4,TRAIN_0004,My duties? All right.,Chandler,0,surprise


## Unbalanced Labels

In [7]:
# Tally how many rows carry each Target label to expose the class imbalance.
Counter(data_csv['Target'].tolist())

Counter({'neutral': 4710,
         'surprise': 1205,
         'fear': 268,
         'sadness': 683,
         'joy': 1743,
         'disgust': 271,
         'anger': 1109})

## Accumulate Dialogues

In [14]:
def accumulate_dialogues(df, num=3):
  """Prefix each utterance with the preceding utterances of the same dialogue.

  Rows are processed in DataFrame order. For every row the result is the
  space-joined text of at most ``num`` utterances: the row's own utterance
  preceded by the ones immediately before it. The window is cleared whenever
  the ``Dialogue_ID`` value differs from the previous row's, so context never
  carries over from one dialogue into the next.

  Args:
    df: DataFrame with ``Dialogue_ID`` and ``Utterance`` columns. Rows that
      belong to one dialogue must be contiguous and in speaking order,
      because only a change of ID between consecutive rows resets the window.
    num: Maximum number of utterances (current one included) joined into
      each result string.

  Returns:
    A list of strings with one entry per row of ``df``, in row order.
  """
  new_utterances = []  # final output, one accumulated string per row
  # The starting ID is arbitrary: the window below is already empty, so a
  # mismatch on the first row merely re-creates an empty window.
  current_id = 0
  window = []  # sliding window over the current dialogue's utterances

  # Iterate the two needed columns directly; iterrows() would build a whole
  # Series per row just to read these two fields.
  for dialogue_id, utterance in zip(df['Dialogue_ID'], df['Utterance']):
    if dialogue_id != current_id:  # a new dialogue starts here
      current_id = dialogue_id
      window = []

    window.append(utterance)

    # Keep only the most recent `num` utterances.
    if len(window) > num:
      window.pop(0)

    new_utterances.append(' '.join(window))

  return new_utterances

In [15]:
# Build the context-augmented text for every row of the training frame.
new_utterances = accumulate_dialogues(data_csv)

# Spot-check rows 10-19, one accumulated utterance per output line.
print('\n'.join(new_utterances[i] for i in range(10, 20)))

Good to know. We can go into detail No don’t I beg of you!
We can go into detail No don’t I beg of you! All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here.
No don’t I beg of you! All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. Really?!
All right then, we’ll have a definite answer for you on Monday, but I think I can say with some confidence, you’ll fit in well here. Really?! Absolutely.  You can relax
But then who? The waitress I went out with last month?
But then who? The waitress I went out with last month? You know? Forget it!
But then who? The waitress I went out with last month? You know? Forget it! No-no-no-no, no! Who, who were you talking about?
You know? Forget it! No-no-no-no, no! Who, who were you talking about? No, I-I-I-I don't, I actually don't know
No-no-no-no, no! Who, who were you talking about? No, I-I-I-

In [16]:
# Overwrite the raw utterances with their context-augmented versions.
# NOTE: this is not idempotent. Once it has run, calling
# accumulate_dialogues(data_csv) again would stack context on top of text
# that is already accumulated; reload train.csv before re-running that cell.
data_csv['Utterance'] = new_utterances

In [17]:
# Preview the first rows to inspect the current contents of the Utterance column.
data_csv.head()

Unnamed: 0,ID,Utterance,Speaker,Dialogue_ID,Target
0,TRAIN_0000,also I was the point person on my company’s tr...,Chandler,0,neutral
1,TRAIN_0001,also I was the point person on my company’s tr...,The Interviewer,0,neutral
2,TRAIN_0002,also I was the point person on my company’s tr...,Chandler,0,neutral
3,TRAIN_0003,You must’ve had your hands full. That I did. T...,The Interviewer,0,neutral
4,TRAIN_0004,That I did. That I did. So let’s talk a little...,Chandler,0,surprise
