In [4]:
from phonecodes import phonecodes
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from timeit import default_timer as timer
from torch.nn import Transformer
from torch import Tensor
from sklearn.model_selection import train_test_split
import tqdm
import librosa
import seaborn as sns
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import textgrid
from scipy.spatial.distance import euclidean

import jiwer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import AutoProcessor, AutoModelForCTC
from phonemizer.backend.espeak.wrapper import EspeakWrapper
import soundfile as sf

_ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
EspeakWrapper.set_library(_ESPEAK_LIBRARY)
processor_P = AutoProcessor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model_P = AutoModelForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")




Some weights of the model checkpoint at facebook/wav2vec2-lv-60-espeak-cv-ft were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-lv-60-espeak-cv-ft and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably

In [25]:
def get_pathset(paths):
    return [os.path.join(dir, each_file) for dir, mid, files in os.walk(paths) for each_file in files if each_file.endswith(".wav")]

def CTC_index(processor,outind):
    meaningful_ids = []
    meaningful_indices = []
    previous_id = -1  
    blank_token_id = processor.tokenizer.pad_token_id  
    for i, token_id in enumerate(outind[0]):  
        if token_id != previous_id and token_id != blank_token_id:
            meaningful_ids.append(token_id.item())  
            meaningful_indices.append(i)  
        previous_id = token_id
    
    return meaningful_indices

def get_set_diphone(paths,model,processor):
    out_dict=[]
    
    english_phonemes = ['<pad>', '<s>', '</s>', '<unk>','p', 'b', 't', 'd', 'k', 'ɡ','m', 'n', 'ŋ', 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'h', 'tʃ', 'dʒ', 'l', 'ɹ', 'w', 'j',"i","ɪ","ʊ","u","e","ɜ","æ","ʌ","ɑ","ɒ","eɪ","ɔɪ","oʊ","aɪ","aʊ"]
    english_phoneme_dict = {k: v for k, v in processor_P.tokenizer.get_vocab().items() if k in english_phonemes}
    #english_phoneme_dict.values()
    set1_list=[0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16]
    set2_list=[17,18,19,20,21,22,24,25,26,27,28,29,30,31,37,40]
    all_set=set1_list+set2_list
    for each_sentence in tqdm.tqdm(paths):
        tg = textgrid.TextGrid.fromFile(each_sentence[:-3]+"TextGrid")
        tg_sentence = [i for i in tg[0] if i.mark!=""]
        tg_sentence = [i for _,i in enumerate(tg_sentence) if  _ in all_set]
        #tg_word = [i for i in tg[1] if i.mark!="" and i.mark!="sp"]

        
        wave, sr = librosa.load(each_sentence)
        wave_res = librosa.resample(wave, orig_sr=sr, target_sr=16000)
        #wave_res = wave_res[:int(sentence16_end_time*16000)]
        for each_tg in tg_sentence:
            start=round(each_tg.minTime*16000)
            end=round(each_tg.maxTime*16000)
            input=processor(wave_res[start:end],sampling_rate=16000, return_tensors="pt").input_values
            input=input.to(device)
            model.to(device)
            with torch.no_grad():
                out_encoder1=model(input).logits
            
            selected=out_encoder1
            mask = np.ones(selected.shape[-1], dtype=bool)
            mask[list(english_phoneme_dict.values())] = False
            selected[:, :, mask] = 0
            outind=torch.argmax(selected,dim=-1).cpu().numpy()
            out_encoder1=out_encoder1.cpu().detach().numpy()
            #outind=torch.argmax(out_encoder1,dim=-1).cpu().numpy()
            transcription = processor.batch_decode(outind)[0].split(" ")
            phonemeindex = CTC_index(processor,outind)
            #out_FE=model.wav2vec2.feature_extractor(input)[0].transpose(1,0).cpu().detach().numpy()
            for i in range(len(transcription)-1):
                key = transcription[i] + transcription[i + 1]
                #if key not in out_dict:
                    #out_dict[key] = []
                if "CMN" in each_sentence:
                    out_dict.append(["L2",os.path.basename(each_sentence)[:-4],each_tg.mark.lower(),key,np.vstack((out_encoder1[0,phonemeindex[i]], out_encoder1[0,phonemeindex[i + 1]]))])
                else:
                    out_dict.append(["L1",os.path.basename(each_sentence)[:-4],each_tg.mark.lower(),key,np.vstack((out_encoder1[0,phonemeindex[i]], out_encoder1[0,phonemeindex[i + 1]]))])
            torch.cuda.empty_cache()
    torch.cuda.empty_cache()
    return out_dict



In [16]:
ALL_hint1_path=r"..\data\raw_hint1_L1_L2"
ALL_pathset=get_pathset(ALL_hint1_path)
os.path.basename(ALL_pathset[-1])[:-4]


'ALL_133_M_ENG_ENG_HT1'

In [26]:
ALL_ENG_ENG_dict = get_set_diphone(ALL_pathset, model_P, processor_P)

100%|██████████| 39/39 [00:26<00:00,  1.48it/s]


In [238]:
df = pd.DataFrame(ALL_ENG_ENG_dict, columns=['Category', 'ID', 'Sentence', 'Diphoneme', 'Vector'])

In [239]:
df.loc[df['Sentence'] == 'a boy fell from a window', 'Sentence'] = 'a boy fell from the window'

In [240]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

#vector_data = np.vstack(df['Vector'].values)
vector_data=np.array([i.ravel() for i in df['Vector'].values])
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vector_data)

df['TSNE-1'] = reduced_vectors[:, 0]
df['TSNE-2'] = reduced_vectors[:, 1]
df['TSNE-3'] = reduced_vectors[:, 2]
df['Label'] = df['ID'] + ' - ' + df['Sentence']


In [243]:
set(df['Sentence'])

{'a boy fell from the window',
 'big dogs can be dangerous',
 'father forgot the bread',
 'he grew lots of vegetables',
 'he hung up his raincoat',
 "it's time to go to bed",
 'mother read the instructions',
 'she argues with her sister',
 'she found her purse in the trash',
 "she's drinking from her own cup",
 'somebody stole the money',
 'the bananas are too ripe',
 'the car is going too fast',
 'the children waved at the train',
 'the dog is eating some meat',
 'the family bought a house',
 'the family likes fish',
 'the fire was very hot',
 'the girl is fixing her dress',
 'the house had nine bedrooms',
 'the kitchen window was clean',
 'the mailman brought a letter',
 'the mother heard the baby',
 'the paint dripped on the ground',
 'the painter uses a brush',
 'the picture came from a book',
 'the player lost a shoe',
 'the road goes up a hill',
 'the shoes were very dirty',
 'the table has three legs',
 'the wife helped her husband',
 'they had two empty bottles'}

## 1.L1 vs L2, 1v1 Trajectory comparison

for 005_M_CMN_ENG and 133_M_ENG_ENG, running on sentence 'a boy fell from the window'

In [241]:
import plotly.graph_objects as go


filtered_df = df[(df['ID'].isin(['ALL_005_M_CMN_ENG_HT1', 'ALL_133_M_ENG_ENG_HT1'])) & 
                 (df['Sentence'] == 'a boy fell from the window')]


talker_ids = filtered_df['ID'].unique()
talker_data_dict = {talker_id: filtered_df[filtered_df['ID'] == talker_id] for talker_id in talker_ids}


all_x = filtered_df['TSNE-1']
all_y = filtered_df['TSNE-2']
all_z = filtered_df['TSNE-3']

x_range = [all_x.min() - 5, all_x.max() + 5]
y_range = [all_y.min() - 5, all_y.max() + 5]
z_range = [all_z.min() - 5, all_z.max() + 5]


fig = go.Figure()
for talker_id in talker_ids:
    talker_data = talker_data_dict[talker_id]
    point_numbers = [str(i + 1) for i in range(len(talker_data))]
    fig.add_trace(go.Scatter3d(
        x=talker_data['TSNE-1'],
        y=talker_data['TSNE-2'],
        z=talker_data['TSNE-3'],
        mode='lines+markers+text',
        marker=dict(size=10 ,symbol="cross"),
        line=dict(width=2),
        name=list(talker_data['Category'])[0]+'-accented',
        text=[f"{idx+1}" for idx in range(len(talker_data))],
        textposition='top center',
        hoverinfo='text'
    ))


max_time_steps = max(len(talker_data_dict[talker_id]) for talker_id in talker_ids)
frames = []
for i in range(max_time_steps):
    frame_data = []
    for talker_id in talker_ids:
        talker_data = talker_data_dict[talker_id]
        if i < len(talker_data):  
            frame_data.append(go.Scatter3d(
                x=talker_data['TSNE-1'][:i+1],
                y=talker_data['TSNE-2'][:i+1],
                z=talker_data['TSNE-3'][:i+1],
                mode='lines+markers+text',
                marker=dict(size=10 ,symbol="cross"),
                line=dict(width=2),
                name=list(talker_data['Category'])[0]+'-accented',
                text=[f"{idx+1}" for idx in range(len(talker_data))],
                textposition='top center',
                hoverinfo='text'
            ))
    frames.append(go.Frame(data=frame_data, name=str(i+1)))
camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=1.25, y=1.25, z=1.25)
)

fig.update(frames=frames)
fig.update_layout(
    updatemenus=[dict(type='buttons', showactive=False,
                      buttons=[dict(label='Play',
                                    method='animate',
                                    args=[None, dict(frame=dict(duration=500, redraw=True),
                                                     fromcurrent=True, mode='immediate')])])],
    sliders=[dict(steps=[dict(label=str(i+1),
                              method='animate',
                              args=[[str(i+1)], dict(frame=dict(duration=500, redraw=True),
                                                     mode='immediate')]) for i in range(max_time_steps)],
                  active=0)],
    scene_camera=camera,
    scene=dict(
        xaxis=dict(title='TSNE-1', range=x_range),  
        yaxis=dict(title='TSNE-2', range=y_range),  
        zaxis=dict(title='TSNE-3', range=z_range)   
    )
)



fig.update_layout(
    scene_camera=camera,
    scene=dict(
        xaxis=dict(title='TSNE-1', range=x_range),  
        yaxis=dict(title='TSNE-2', range=y_range),  
        zaxis=dict(title='TSNE-3', range=z_range)   
    ),
    title='3D t-SNE Visualization with Animation(L1 and L2 speaker), for sentence "a boy fell from the window"',
    width=1000,
    height=600,
)
fig.show()


for 005_M_CMN_ENG and 133_M_ENG_ENG, running on sentence 'a boy fell from the window'

In [210]:
#set(df['Sentence'])

In [248]:
import plotly.graph_objects as go


filtered_df = df[(df['ID'].isin(['ALL_005_M_CMN_ENG_HT1', 'ALL_133_M_ENG_ENG_HT1'])) & 
                 (df['Sentence'] == 'big dogs can be dangerous')]


talker_ids = filtered_df['ID'].unique()
talker_data_dict = {talker_id: filtered_df[filtered_df['ID'] == talker_id] for talker_id in talker_ids}


all_x = filtered_df['TSNE-1']
all_y = filtered_df['TSNE-2']
all_z = filtered_df['TSNE-3']

x_range = [all_x.min() - 5, all_x.max() + 5]
y_range = [all_y.min() - 5, all_y.max() + 5]
z_range = [all_z.min() - 5, all_z.max() + 5]


fig = go.Figure()
for talker_id in talker_ids:
    talker_data = talker_data_dict[talker_id]
    point_numbers = [str(i + 1) for i in range(len(talker_data))]
    fig.add_trace(go.Scatter3d(
        x=talker_data['TSNE-1'],
        y=talker_data['TSNE-2'],
        z=talker_data['TSNE-3'],
        mode='lines+markers+text',
        marker=dict(size=10 ,symbol="cross"),
        line=dict(width=2),
        name=list(talker_data['Category'])[0]+'-accented',
        text=[f"{idx+1}" for idx in range(len(talker_data))],
        textposition='top center',
        hoverinfo='text'
    ))


max_time_steps = max(len(talker_data_dict[talker_id]) for talker_id in talker_ids)
frames = []
for i in range(max_time_steps):
    frame_data = []
    for talker_id in talker_ids:
        talker_data = talker_data_dict[talker_id]
        if i < len(talker_data):  
            frame_data.append(go.Scatter3d(
                x=talker_data['TSNE-1'][:i],
                y=talker_data['TSNE-2'][:i],
                z=talker_data['TSNE-3'][:i],
                mode='lines+markers+text',
                marker=dict(size=10 ,symbol="cross"),
                line=dict(width=2),
                name=list(talker_data['Category'])[0]+'-accented',
                text=[f"{idx+1}" for idx in range(len(talker_data))],
                textposition='top center',
                hoverinfo='text'
            ))
    frames.append(go.Frame(data=frame_data, name=str(i+1)))
camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=1.25, y=1.25, z=1.25)
)

fig.update(frames=frames)
fig.update_layout(
    updatemenus=[dict(type='buttons', showactive=False,
                      buttons=[dict(label='Play',
                                    method='animate',
                                    args=[None, dict(frame=dict(duration=500, redraw=True),
                                                     fromcurrent=True, mode='immediate')])])],
    sliders=[dict(steps=[dict(label=str(i+1),
                              method='animate',
                              args=[[str(i+1)], dict(frame=dict(duration=500, redraw=True),
                                                     mode='immediate')]) for i in range(max_time_steps)],
                  active=0)],
    scene_camera=camera,
    scene=dict(
        xaxis=dict(title='TSNE-1', range=x_range),  
        yaxis=dict(title='TSNE-2', range=y_range),  
        zaxis=dict(title='TSNE-3', range=z_range)   
    )
)



fig.update_layout(
    scene_camera=camera,
    scene=dict(
        xaxis=dict(title='TSNE-1', range=x_range),  
        yaxis=dict(title='TSNE-2', range=y_range),  
        zaxis=dict(title='TSNE-3', range=z_range)   
    ),
    title='3D t-SNE Visualization with Animation(L1 and L2 speaker), for sentence "big dogs can be dangerous"',
    width=1000,
    height=600,
)
fig.show()


In [250]:

import plotly.graph_objects as go


filtered_df = df[(df['ID'].isin(['ALL_005_M_CMN_ENG_HT1', 'ALL_133_M_ENG_ENG_HT1'])) & 
                 (df['Sentence'] == 'mother read the instructions')]


talker_ids = filtered_df['ID'].unique()
talker_data_dict = {talker_id: filtered_df[filtered_df['ID'] == talker_id] for talker_id in talker_ids}


all_x = filtered_df['TSNE-1']
all_y = filtered_df['TSNE-2']
all_z = filtered_df['TSNE-3']

x_range = [all_x.min() - 5, all_x.max() + 5]
y_range = [all_y.min() - 5, all_y.max() + 5]
z_range = [all_z.min() - 5, all_z.max() + 5]


fig = go.Figure()
for talker_id in talker_ids:
    talker_data = talker_data_dict[talker_id]
    point_numbers = [str(i + 1) for i in range(len(talker_data))]
    fig.add_trace(go.Scatter3d(
        x=talker_data['TSNE-1'],
        y=talker_data['TSNE-2'],
        z=talker_data['TSNE-3'],
        mode='lines+markers+text',
        marker=dict(size=10 ,symbol="cross"),
        line=dict(width=2),
        name=list(talker_data['Category'])[0]+'-accented',
        text=[f"{idx+1}" for idx in range(len(talker_data))],
        textposition='top center',
        hoverinfo='text'
    ))


max_time_steps = max(len(talker_data_dict[talker_id]) for talker_id in talker_ids)
frames = []
for i in range(max_time_steps):
    frame_data = []
    for talker_id in talker_ids:
        talker_data = talker_data_dict[talker_id]
        if i < len(talker_data):  
            frame_data.append(go.Scatter3d(
                x=talker_data['TSNE-1'][:i],
                y=talker_data['TSNE-2'][:i],
                z=talker_data['TSNE-3'][:i],
                mode='lines+markers+text',
                marker=dict(size=10 ,symbol="cross"),
                line=dict(width=2),
                name=list(talker_data['Category'])[0]+'-accented',
                text=[f"{idx+1}" for idx in range(len(talker_data))],
                textposition='top center',
                hoverinfo='text'
            ))
    frames.append(go.Frame(data=frame_data, name=str(i+1)))
camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=1.25, y=1.25, z=1.25)
)

fig.update(frames=frames)
fig.update_layout(
    updatemenus=[dict(type='buttons', showactive=False,
                      buttons=[dict(label='Play',
                                    method='animate',
                                    args=[None, dict(frame=dict(duration=500, redraw=True),
                                                     fromcurrent=True, mode='immediate')])])],
    sliders=[dict(steps=[dict(label=str(i+1),
                              method='animate',
                              args=[[str(i+1)], dict(frame=dict(duration=500, redraw=True),
                                                     mode='immediate')]) for i in range(max_time_steps)],
                  active=0)],
    scene_camera=camera,
    scene=dict(
        xaxis=dict(title='TSNE-1', range=x_range),  
        yaxis=dict(title='TSNE-2', range=y_range),  
        zaxis=dict(title='TSNE-3', range=z_range)   
    )
)



fig.update_layout(
    scene_camera=camera,
    scene=dict(
        xaxis=dict(title='TSNE-1', range=x_range),  
        yaxis=dict(title='TSNE-2', range=y_range),  
        zaxis=dict(title='TSNE-3', range=z_range)   
    ),
    title='3D t-SNE Visualization with Animation(L1 and L2 speaker), for sentence "mother read the instructions"',
    width=1000,
    height=600,
)
fig.show()


## 2. Trajectory distribution of all talkers under the same sentence.

In [246]:
import plotly.graph_objects as go
import plotly.express as px

filtered_df = df[df['Sentence'] == 'a boy fell from the window']

categories = filtered_df['Category'].unique()
colors = px.colors.qualitative.Plotly  
color_map = {category: colors[i % len(colors)] for i, category in enumerate(categories)}

fig = go.Figure()
for talker_id in filtered_df['ID'].unique():
    talker_data = filtered_df[filtered_df['ID'] == talker_id].sort_values(by='Diphoneme')
    category = talker_data['Category'].iloc[0]
    
    fig.add_trace(go.Scatter3d(
        x=talker_data['TSNE-1'],
        y=talker_data['TSNE-2'],
        z=talker_data['TSNE-3'],
        mode='lines+markers', 

        name=f"{talker_id} (Category: {category})",  
        line=dict(width=2, color=color_map[category]),  
        marker=dict(size=5),
        text=talker_data['Diphoneme'], 
        hoverinfo='text',
        opacity=0.3  
    ))



fig.update_layout(
    showlegend=False, 
    title="t-SNE 3D Visualization of 'a boy fell from the window' Grouped by accent",
    scene=dict(
        xaxis_title='TSNE-1',
        yaxis_title='TSNE-2',
        zaxis_title='TSNE-3'
    ),
    width=1000,  
    height=800   
)

fig.show()


In [251]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# 筛选数据
filtered_df = df[df['Sentence'] == 'a boy fell from the window']

# 获取唯一的Category，并为每个Category分配颜色
categories = filtered_df['Category'].unique()
colors = px.colors.qualitative.Plotly  
color_map = {category: colors[i % len(colors)] for i, category in enumerate(categories)}

# 创建子图：两个子图用于分别显示L1和L2
fig = make_subplots(
    rows=1, cols=2, 
    specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}]],  # 两个3D图
    subplot_titles=("L1 Trajectories", "L2 Trajectories")
)

# 遍历不同的talker，并根据Category绘制到不同的子图中，保持原始顺序并按diphone连线
for talker_id in filtered_df['ID'].unique():
    talker_data = filtered_df[filtered_df['ID'] == talker_id]  # 保持原始顺序
    category = talker_data['Category'].iloc[0]
    
    if category == 'L1':  
        fig.add_trace(go.Scatter3d(
            x=talker_data['TSNE-1'],
            y=talker_data['TSNE-2'],
            z=talker_data['TSNE-3'],
            mode='lines+markers', 
            line=dict(width=2, color=color_map[category]),  
            marker=dict(size=5),
            text=talker_data['Diphoneme'],  
            hoverinfo='text',
            opacity=0.4
        ), row=1, col=1)
    
    elif category == 'L2':
        fig.add_trace(go.Scatter3d(
            x=talker_data['TSNE-1'],
            y=talker_data['TSNE-2'],
            z=talker_data['TSNE-3'],
            mode='lines+markers', 
            line=dict(width=2, color=color_map[category]),  
            marker=dict(size=5),
            text=talker_data['Diphoneme'],  
            hoverinfo='text',
            opacity=0.4
        ), row=1, col=2)


fig.update_layout(
    showlegend=False, 
    scene=dict(
        xaxis=dict(title='TSNE-1', range=[-40, 40]),  
        yaxis=dict(title='TSNE-2', range=[-40, 40]),  
        zaxis=dict(title='TSNE-3', range=[-40, 40])   
    ),
    scene2=dict(
        xaxis=dict(title='TSNE-1', range=[-40, 40]),  
        yaxis=dict(title='TSNE-2', range=[-40, 40]),  
        zaxis=dict(title='TSNE-3', range=[-40, 40])   
    ),
    width=1400,
    height=800,
    title="L1 vs. L2 Trajectories in Separate Subplots (Diphones Connected in Sequence)"
)

fig.show()


In [217]:
sentence_data = filtered_df[filtered_df['Sentence'] == 'a boy fell from the window']
categories = sentence_data['Category'].unique()


mean_std_results = {}
for category in categories:
    category_data = sentence_data[sentence_data['Category'] == category]
    
    mean_tsne1 = []
    mean_tsne2 = []
    mean_tsne3 = []
    std_tsne1 = []
    std_tsne2 = []
    std_tsne3 = []
    
    
    n_points = category_data.groupby('ID').size().min()

 
    for i in range(n_points):
        tsne1_values = category_data.groupby('ID').nth(i)['TSNE-1'].values
        tsne2_values = category_data.groupby('ID').nth(i)['TSNE-2'].values
        tsne3_values = category_data.groupby('ID').nth(i)['TSNE-3'].values

        mean_tsne1.append(np.mean(tsne1_values))
        mean_tsne2.append(np.mean(tsne2_values))
        mean_tsne3.append(np.mean(tsne3_values))
        std_tsne1.append(np.std(tsne1_values))
        std_tsne2.append(np.std(tsne2_values))
        std_tsne3.append(np.std(tsne3_values))
    
    mean_std_results[category] = {
        'mean_TSNE1': mean_tsne1,
        'mean_TSNE2': mean_tsne2,
        'mean_TSNE3': mean_tsne3,
        'std_TSNE1': std_tsne1,
        'std_TSNE2': std_tsne2,
        'std_TSNE3': std_tsne3
    }

df_=[]
for category, values in mean_std_results.items():
    df1 = pd.DataFrame(values)
    df1.insert(0, 'Category', category)
    df_.append(df1)
    #print(f"\nCategory: {category}\n", df)

In [234]:
pd.DataFrame(np.vstack((df_[0],df_[1])))

Unnamed: 0,0,1,2,3,4,5,6
0,L2,-0.931693,7.5592,2.97348,10.199516,5.539025,7.431101
1,L2,12.516928,18.237545,-7.263641,8.578317,2.575008,11.46356
2,L2,1.082867,19.702959,15.447894,11.962669,6.45685,13.024559
3,L2,16.271391,9.084264,-7.889411,19.102985,10.763084,3.357185
4,L2,-17.997431,0.813379,-4.432423,18.787262,11.97265,6.76118
5,L2,14.885074,-15.770889,0.599916,9.501342,16.634596,2.916249
6,L2,-0.430478,8.798479,-2.916182,1.306006,2.93785,4.937639
7,L2,-0.461525,3.540794,-10.298547,2.25104,10.760588,2.755587
8,L2,-3.012528,-7.466353,-11.458876,5.798983,13.27246,4.326192
9,L2,0.983622,-21.223539,1.850889,6.509469,17.393917,15.59948


In [235]:
df_[0].iloc[:,1:]-df_[1].iloc[:,1:]
#diff matrix

Unnamed: 0,mean_TSNE1,mean_TSNE2,mean_TSNE3,std_TSNE1,std_TSNE2,std_TSNE3
0,4.1895,1.732384,-3.518801,5.29795,2.792089,3.813331
1,-4.560366,0.819248,2.469183,3.804309,-2.774352,4.985492
2,-1.193446,-3.465559,-3.838515,5.43695,3.437971,4.64754
3,-3.926203,-2.934626,1.085597,4.080443,8.255424,3.166446
4,8.596048,-0.599666,2.977685,4.744887,3.722342,3.959847
5,-4.350398,7.80101,-1.216316,3.797443,6.556499,2.257285
6,-0.0125,-0.915379,-2.558581,0.155521,1.673087,2.294041
7,-0.240484,-3.430434,-1.300916,0.846197,8.31311,1.294158
8,0.939642,-4.380399,1.678712,4.336018,4.866571,1.376774
9,-0.187886,8.168304,7.19794,4.118725,9.232501,6.776237


In [150]:
fig = px.scatter_3d(df, x='TSNE-1', y='TSNE-2', z='TSNE-3', color='Diphoneme', 
                    title='t-SNE 3D Visualization of Vectors',
                    labels={'Label': 'ID + Sentence'},width=1000,height=600)#Label
fig.show()