In [14]:
import json
from pathlib import Path
from sklearn.model_selection import train_test_split

def gather_dataset(folder_path : str) -> dict:
    """Load every ``*.json`` dialog file found directly inside ``folder_path``.

    Each file must contain a JSON array of objects that have string
    ``"speaker"`` and ``"text"`` entries; every object is rendered as
    ``"<speaker>: <text>"``.

    Args:
        folder_path: Directory to scan (not recursive). Sub-directories and
            files whose suffix is not ``.json`` are ignored.

    Returns:
        Dict mapping the file stem (used as dialog id) to the list of rendered
        utterances, in file order. Keys are inserted in sorted path order.
    """
    dataset = {}
    # iterdir() yields entries in an arbitrary, OS-dependent order; sorting
    # keeps the dict order (and every split derived from it) reproducible.
    for item in sorted(Path(folder_path).iterdir()):
        if not item.is_file() or item.suffix != ".json":
            continue

        # Read as UTF-8 explicitly instead of relying on the platform's
        # locale encoding, which may not be UTF-8.
        with item.open("r", encoding="utf-8") as json_file:
            data = json.load(json_file)

        dataset[item.stem] = [
            sentence["speaker"] + ": " + sentence["text"] for sentence in data
        ]
    return dataset

def tt_split(dialogs, labels, test_size=0.2, random_state=42):
    """Split each dialog's utterances into train/validation parts and pool them.

    The split is done inside every dialog separately and the per-dialog
    parts are then concatenated, so both sides contain utterances from
    every dialog that has at least two utterances.

    Args:
        dialogs: Mapping of dialog id to the list of its utterances.
        labels: Mapping of dialog id to the list of per-utterance labels,
            aligned with ``dialogs[dialog_id]``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; the same seed is
            reused for every dialog.

    Returns:
        Tuple ``(train_sentences, val_sentences, train_labels, val_labels)``
        of flat lists.

    Raises:
        KeyError: If a dialog id has no entry in ``labels``.
        ValueError: If a dialog's utterances and labels differ in length.
    """
    train_sentences, val_sentences = [], []
    train_labels, val_labels = [], []

    for dialog_id in dialogs:
        d_sentences = dialogs[dialog_id]
        d_labels = labels[dialog_id]

        if len(d_sentences) != len(d_labels):
            raise ValueError(
                f"dialog {dialog_id!r}: {len(d_sentences)} sentences "
                f"but {len(d_labels)} labels"
            )

        if len(d_sentences) < 2:
            # train_test_split raises when one side of the split would be
            # empty; keep such tiny dialogs entirely on the training side
            # instead of aborting the whole split.
            train_sentences.extend(d_sentences)
            train_labels.extend(d_labels)
            continue

        # train test split inside the dialog
        d_train_sentences, d_val_sentences, d_train_labels, d_val_labels = train_test_split(
            d_sentences, d_labels, test_size=test_size, random_state=random_state
        )

        # aggregate split
        train_sentences.extend(d_train_sentences)
        val_sentences.extend(d_val_sentences)
        train_labels.extend(d_train_labels)
        val_labels.extend(d_val_labels)

    return train_sentences, val_sentences, train_labels, val_labels


# dataset = gather_dataset("training")
# with open("test.json", "w") as json_file:
    # json.dump(dataset, json_file, indent=2)



In [15]:
import json

In [16]:
# Dialog utterances keyed by dialog id, read from the JSON files in ./training.
dataset = gather_dataset("training")
# Labels file; tt_split indexes it by the same dialog ids. Read it as UTF-8
# explicitly so decoding does not depend on the platform's locale encoding.
with open('training_labels.json', 'r', encoding='utf-8') as json_file:
    labels = json.load(json_file)

In [17]:
# NOTE(review): despite the name, `df` here is the 4-tuple returned by tt_split,
# not a DataFrame; the name is rebound to a pandas DataFrame in a later cell.
df = tt_split(dataset, labels)

In [18]:
# Unpack the tt_split result into the sentence lists and their label lists.
train_sentences, val_sentences, train_labels, val_labels = df

In [19]:
# Echo the pooled training labels.
# NOTE(review): this renders the entire list; a slice would keep the output short.
train_labels

[0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,


In [20]:
# Echo the pooled training utterances ("<speaker>: <text>" strings).
# NOTE(review): this renders the entire list; a slice would keep the output short.
train_sentences

['UI: and you could just click that to <disfmarker>',
 "PM: And uh I I think our discussion was around the fact that uh if we're gonna go for uh a long lasting power supply , then basically it's uh sealed for life",
 "ID: Uh y the fewer buttons you have , I guess the fewer internal connections and internal codes you're gonna need . Um",
 'PM: and find difficult to learn .',
 'UI: You know what I mean ?',
 'ID: that can kinda tie into our changing uh face things ,',
 "PM: um it's uh it's good for the supermarket",
 'UI: and especially for making them so like different and <disfmarker>',
 'ID: Yeah ,',
 "ID: I don't actually have any price information ,",
 'PM: <vocalsound> Mm yeah .',
 "ID: 'cause it <disfmarker> a mouse you're kinda just resting on it ,",
 'PM: Um <vocalsound> now the the the internal chip',
 'ID: Um p there we go .',
 'ME: Uh yeah , and uh',
 'PM: Sorry',
 'ID: yellow seems a bit of a strong colour',
 "ID: I think they're they're about the same cost really .",
 'PM: u

In [21]:
import pandas as pd

In [66]:
# One row per training utterance: 'phrase' is the "<speaker>: <text>" string,
# 'label' is its label. This rebinds `df`, which held the tt_split tuple before.
df = pd.DataFrame({'phrase' : train_sentences, 'label' : train_labels})

In [67]:
# Tokenise every utterance on single spaces, so each 'phrase' cell becomes a
# list of tokens. The commented-out variant keeps only the first token
# (e.g. the "PM:" speaker tag).
# NOTE(review): 'phrase' is overwritten in place, so re-running this cell on
# its own output fails (lists have no .split).
# df['phrase'] = df['phrase'].apply(lambda x : x.split(' ')[0])
df['phrase'] = df['phrase'].apply(lambda x : x.split(' '))

In [69]:
# Flatten the token lists: one row per token, with the utterance's label and
# original index value repeated for each of its tokens.
df = df.explode(column = 'phrase')

In [75]:
# Partition the token rows by class: rows labelled 1 and rows labelled 0.
df_one = df[df['label'].eq(1)]
df_zero = df[df['label'].eq(0)]

In [96]:
# Per-token frequency inside each class. DataFrame.value_counts() counts
# distinct (phrase, label) rows, so the result is indexed by both columns;
# the label level is constant within each frame and is dropped, leaving a
# Series indexed by token and named after its class.
df_one_freq = df_one.value_counts().droplevel(1, axis = 0).rename('ones')
df_zero_freq = df_zero.value_counts().droplevel(1, axis = 0).rename('zeros')

In [100]:
# Align the two count Series on the token index. The left join keeps only
# tokens that occur in label-1 rows; a token missing from the label-0 counts
# gets zeros=1 via fillna.
# NOTE(review): tokens seen only in label-0 rows are dropped here, and the
# fill value of 1 is applied to missing counts only — confirm this is the
# intended smoothing.
df_merged = pd.merge(left = df_one_freq, right =  df_zero_freq, how = 'left', left_index = True, right_index = True).fillna(1)

In [103]:
# Rank tokens by the ratio of their label-1 count to their label-0 count.
# NOTE(review): these are raw counts, not normalised by the number of tokens
# in each class — confirm that is intended.
(df_merged['ones'] / df_merged['zeros']).sort_values(ascending=False)

phrase
presented      15.000000
considering     9.000000
briefly         8.000000
U_I_D_          8.000000
lab             7.000000
                 ...    
Huh             0.013514
'kay            0.011905
Uh-huh          0.008065
Yep             0.006390
Mm-hmm          0.005747
Length: 5470, dtype: float64

In [88]:
# df_one_freq already had its label level dropped where it is defined, so a
# second droplevel(1) raises on a top-to-bottom run; display the Series as is.
df_one_freq

phrase
the            7233
,              6755
.              6209
PM:            3501
to             3478
               ... 
frustrates        1
frustration       1
fuct              1
fuel              1
zones             1
Length: 5470, dtype: int64

In [58]:
# NOTE(review): same statement as the earlier explode cell. Per the pandas
# docs, scalar values are returned unchanged by explode, so this looks like a
# leftover from out-of-order execution — confirm and remove.
df = df.explode(column = 'phrase')

In [60]:
# The 30 most frequent (phrase, label) pairs.
df.value_counts().head(30)

phrase        label
the           1        7233
,             1        6755
.             1        6209
PM:           1        3501
to            1        3478
uh            1        3431
a             1        3325
and           1        2742
we            1        2675
I             1        2644
ID:           1        2411
of            1        2379
ME:           1        2352
you           1        2300
UI:           1        2270
it            1        2123
that          1        2094
have          1        1673
is            1        1556
<disfmarker>  1        1372
be            1        1298
<vocalsound>  1        1220
in            1        1213
um            1        1211
on            1        1194
for           1        1186
think         1        1110
like          1        1044
can           1         941
remote        1         941
dtype: int64

In [42]:
# Counts of distinct (phrase, label) rows among the label-1 rows.
# NOTE(review): the stored output lists only speaker tags, which matches the
# commented-out first-token variant rather than the full tokenisation —
# presumably a stale output from an earlier run.
df.loc[df['label'] == 1].value_counts()

phrase  label
PM:     1        3501
ID:     1        2411
ME:     1        2352
UI:     1        2270
dtype: int64

In [34]:
# Number of distinct values per column among the label-1 rows.
df.loc[df['label'] == 1].nunique()

phrase    5470
label        1
dtype: int64

In [36]:
# Show the label-1 rows (pandas truncates the display of long frames).
df.loc[df['label'] == 1]

Unnamed: 0,phrase,label
1,PM:,1
1,And,1
1,uh,1
1,I,1
1,I,1
...,...,...
58058,final,1
58058,design,1
58058,as,1
58058,well,1


In [35]:
# Scratch arithmetic; the result is not assigned to anything.
# NOTE(review): the meaning of 9 and 60000 is not stated anywhere in view.
9 * 60000

540000

In [25]:
# Preview of explode without assigning the result.
# NOTE(review): the stored output shows whole utterances, so it was presumably
# captured before the tokenisation cell ran (execution count 25).
df.explode(column = 'phrase')

Unnamed: 0,phrase,label
0,UI: and you could just click that to <disfmarker>,0
1,PM: And uh I I think our discussion was around...,1
2,"ID: Uh y the fewer buttons you have , I guess ...",1
3,PM: and find difficult to learn .,0
4,UI: You know what I mean ?,0
...,...,...
58054,PM: You have to reach a little bit don't you .,0
58055,PM: Mm there's a risk of that .,0
58056,PM: Okay,0
58057,"PM: <vocalsound> Nah , I'm I'm not convinced o...",0
