In [1]:
import json
import nltk
import statistics
import os
import logging
from os import listdir
from os.path import isfile, join
from nltk.util import ngrams, bigrams
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px #Graficos interactivos

mypath = 'Voicebot'

In [2]:
handler = logging.handlers.WatchedFileHandler(os.environ.get("LOGFILE", f"{mypath}/debug.log"))
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root = logging.getLogger()
root.setLevel(os.environ.get("LOGLEVEL", "INFO"))
root.addHandler(handler)
logging.info(f"INICIO PROCESO '2.speech_processing.ipynb'")

In [3]:
onlyfiles = [f for f in listdir(f'{mypath}/transcripts/json/') if isfile(join(f'{mypath}/transcripts/json/', f))]
logging.info(f"{len(onlyfiles)} files inside '{mypath}/transcripts/json'")
jsonfiles = [os.path.splitext(filename)[0] for filename in onlyfiles if os.path.splitext(filename)[1]=='.json']
logging.info(f"{len(onlyfiles)} jsonfiles inside '{mypath}/transcripts/json'")

In [4]:
# load .json
rawdata = []
rawinfo = []

for jsonfile in jsonfiles:
  with open(f"{mypath}/transcripts/json/{jsonfile}.json", encoding='utf8') as json_file:
    rawdata.append(json.load(json_file)) # Transcripts and data

with open(f"{mypath}/transcripts/info/times.json", encoding='utf8') as json_file:
  for times in json.load(json_file):
    rawinfo.append(times) # Time of processing transcriptions
  
# data = rawdata[0]
logging.info(f"{len(rawdata)} files append to rawdata")
print(f"{len(rawdata)} files appended to rawdata")
logging.info(f"{len(rawdata)} times extracted from '{mypath}/transcripts/info/times.json'")
print(f"{len(rawdata)} times extracted from '{mypath}/transcripts/info/times.json'")

100 files appended to rawdata
100 times extracted from 'Voicebot/transcripts/info/times.json'


# Info data

In [None]:
# data.keys()                       # dict_keys(['audio_metrics', 'results', 'result_index', 'speaker_labels'])
## data['audio_metrics'].keys()     # dict_keys(['sampling_interval', 'accumulated'])
## data['results'][0].keys()        # dict_keys(['alternatives', 'final'])
## data['result_index']             # 0
## data['speaker_labels'][0].keys() # dict_keys(['from', 'to', 'speaker', 'confidence', 'final'])

## Info data: audio metrics

In [None]:
##  data['audio_metrics'].keys()                                    # dict_keys(['sampling_interval', 'accumulated'])
###                      # 0.1 [seconds] *
### data['audio_metrics']['accumulated'].keys()                     # dict_keys(['non_speech_level', 'clipping_rate', 'high_frequency_loss', 'end_time', 'speech_ratio',                                                                                      'direct_current_offset','final', 'signal_to_noise_ratio', 'speech_level'])
### data['audio_metrics']['accumulated']['non_speech_level']        # Non-Speech levels *                
### data['audio_metrics']['accumulated']['clipping_rate']           # Clipping rates
### data['audio_metrics']['accumulated']['high_frequency_loss']     # 0.0
### data['audio_metrics']['accumulated']['end_time']                # 147.6 [seconds] *
### data['audio_metrics']['accumulated']['speech_ratio']            # 0.732 [speech_time/end_time] *
### data['audio_metrics']['accumulated']['direct_current_offset']   # Offset levels
### data['audio_metrics']['accumulated']['final']                   # True
### data['audio_metrics']['accumulated']['signal_to_noise_ratio']   # 12.8 *
### data['audio_metrics']['accumulated']['speech_level']            # Speech levels *

In [5]:
# Check Bads Requests...
bads = []
for i,data in enumerate(rawdata):
    try: 
        data['error']
        bads.append(i)
    except:
        pass
if bads == []:
    logging.info(f"No bad requests transcripts found")
else: logging.warning(f"Bad requests found: {bads}")
bads

[]

In [6]:
audio_metrics = []
for i, data in enumerate(rawdata):
    sampling_interval       = data['audio_metrics']['sampling_interval']
    non_speech_level_freq   = [non_speech_level['count'] for non_speech_level in data['audio_metrics']['accumulated']['non_speech_level']]
    non_speech_level_mean   = sum(freq*(i + 0.5)/10 for i, freq in enumerate(non_speech_level_freq))/sum(non_speech_level_freq) if sum(non_speech_level_freq) != 0 else 0
    non_speech_level_var    = sum(freq*((i + 0.5)/10 - non_speech_level_mean)**2 for i, freq in enumerate(non_speech_level_freq))/sum(non_speech_level_freq) if sum(non_speech_level_freq) != 0 else 0
    end_time                = data['audio_metrics']['accumulated']['end_time']
    speech_ratio            = data['audio_metrics']['accumulated']['speech_ratio']
    final                   = data['audio_metrics']['accumulated']['final']
    # signal_to_noise_ratio   = data['audio_metrics']['accumulated']['signal_to_noise_ratio']
    speech_level_freq       = [speech_level['count'] for speech_level in data['audio_metrics']['accumulated']['speech_level']]
    speech_level_mean       = sum(freq*(i + 0.5)/10 for i, freq in enumerate(speech_level_freq))/sum(speech_level_freq) if sum(speech_level_freq) != 0 else 0
    speech_level_var        = sum(freq*((i + 0.5)/10 - speech_level_mean)**2 for i, freq in enumerate(speech_level_freq))/sum(speech_level_freq) if sum(speech_level_freq) != 0 else 0
    audio_metrics.append({  'file_id'               : i + 1,
                            'path'                  : r"\\192.9.100.44\grabaciones\CAT\20210427",
                            'file_name'             : f"{jsonfiles[i]}.ogg",
                            # 'time_transcription'    : rawinfo[i],                      
                            'sampling_interval'     : sampling_interval,
                            'has_ending'            : final,
                            'speech_level_mean'     : speech_level_mean,
                            'speech_level_var'      : speech_level_var,
                            'non_speech_level_mean' : non_speech_level_mean,
                            'non_speech_level_var'  : non_speech_level_var,
                            # 'signal_to_noise_ratio' : signal_to_noise_ratio,
                            'speech_ratio'          : speech_ratio,
                            'time_call'             : end_time,})

## Info data: results & speaker labels

In [None]:
##  data['results'][0]                                          # List of 'transcripts' divided according to pauses *
##  data['results'][0].keys()                                   # dict_keys(['alternatives', 'final'])
### data['results'][0]['alternatives'][0]                       # List of 'alternatives' to a specific 'transcript' *
### data['results'][0]['alternatives'][0].keys()                # dict_keys(['timestamps', 'confidence', 'transcript', 'word_confidence']) 
### data['results'][0]['alternatives'][0]['timestamps'][0]      # List of 'timestamps' for each 'word' in 'transcript' with 'begin' and 'end'
### data['results'][0]['alternatives'][0]['confidence']         # Value of 'transcript' 'confidence'
### data['results'][0]['alternatives'][0]['transcript']         # Value of 'transcript'
### data['results'][0]['alternatives'][0]['word_confidence'][0] # List of 'word_confidence' for each 'word' in 'transcript' with confidence value
### data['speaker_labels'][0]                                   # List of 'speaker_labels' divided for timestamps
### data['speaker_labels'][0].keys()                            # dict_keys(['from', 'to', 'speaker', 'confidence', 'final'])
### data['speaker_labels'][0]['from']                           # Value of 'from' timestamp
### data['speaker_labels'][0]['to']                             # Value of 'to' timestamp
### data['speaker_labels'][0]['speaker']                        # Value of 'speaker' number
### data['speaker_labels'][0]['confidence']                     # Value of 'confidence' number
### data['speaker_labels'][0]['final']                          # True/False

### Corporea y Corpus

In [7]:
cmkeys = ['word','begin','end','word_confidence','speaker','speaker_confidence','file_id'] # corpus_multi_keys
corporea = []               # List of corpus (list of 'sentences' containing separated 'words' in a phone conversation)
corporea_multi = []         # List of corpus_multi (list of 'sentences' containing separated 'words', 'timestamps', 'confidence', and 'speaker' in a phone conversation)
corporea_multi_dict = []    # List of corpus_multi dictionaries (with labels)
for n, data in enumerate(rawdata):
    corpus = [[word[0] for word in alternative['timestamps']] for result in data['results'] for alternative in result['alternatives'][:1]]
    corpus_flat = [group for sentence in corpus for group in sentence]
    corpus_timestamps = [[group_times for group_times in alternative['timestamps']] for result in data['results'] for alternative in result['alternatives'][:1]]
    corpus_confidence = [[group_confidence for group_confidence in alternative['word_confidence']] for result in data['results'] for alternative in result        ['alternatives'][:1]]
    corpus_speaker = [data['speaker_labels'][sum(len(corpus[i]) for i in range(j)):sum(len(corpus[i]) for i in range(j+1))] for j, sentence in enumerate(corpus)]
    corpus_extra = [[(group + [corpus_confidence[i][j][-1]]) for j, group in enumerate(corpus_timestamps[i])] for i, sentence in enumerate(corpus_timestamps)]
    corpus_multi = [[(group + [corpus_speaker[i][j]['speaker']] + [corpus_speaker[i][j]['confidence']] + [n+1]) for j, group in enumerate(corpus_extra[i])] for i, sentence in enumerate(corpus_extra)]
    corpus_multi_flat = [group for sentence in corpus_multi for group in sentence]
    corpus_multi_dict = [[dict(zip(cmkeys, word_group)) for word_group in sentence] for sentence in corpus_multi]
    corpus_multi_dict_flat = [group for sentence in corpus_multi_dict for group in sentence]
    corporea.append(corpus_flat)
    corporea_multi.append(corpus_multi_flat)
    corporea_multi_dict.append(corpus_multi_dict_flat)
# corporea_multi_dict[0][:2]

In [8]:
total_speakers = []
for corpus_multi_dict in corporea_multi_dict:
    for word in corpus_multi_dict:
        if word['speaker'] not in total_speakers: total_speakers.append(word['speaker'])
speaker_label = ["Speaker_f", "Speaker_s"]
for i in range(len(total_speakers)): 
    speaker_label.append(f'Speaker_{i}')
speaker_label

['Speaker_f', 'Speaker_s', 'Speaker_0', 'Speaker_1', 'Speaker_2', 'Speaker_3']

In [9]:
pd.set_option('display.max_columns', None)

df = pd.DataFrame(columns=speaker_label)

for i,corpus_multi_dict in enumerate(corporea_multi_dict):
    try:
        words_counter = [0]*len(total_speakers)
        first = corpus_multi_dict[0]['speaker']
        second = first
        for word in corpus_multi_dict:
            if first == second:
                if first != word['speaker']: 
                    second = word['speaker']
            for id in range(len(words_counter)):
                if word['speaker']==id: 
                    words_counter[id] += 1
                    break
                    
        aux = {speaker_label[0]: first, speaker_label[1]: second}
        for j in range(len(total_speakers)): 
            aux[speaker_label[j+2]] = words_counter[j]
        
        df = df.append(aux, ignore_index=True)
    except:
        df = df.append(pd.Series(), ignore_index=True)
        pass
        
df.index += 1
dft = df.T

  df = df.append(pd.Series(), ignore_index=True)


In [10]:
jsonfiles[0].split('_')

['CAT', '009810255-8', '20210427', '101607', 'taisagap']

In [11]:
# INFO Extractions from filename
# agent_result = [filename.split('_')[-1] for filename in jsonfiles] # The result is extracted from the filename if the file has it
agent_name = [filename.split('_')[4] for filename in jsonfiles] # The result is extracted from the filename
audio_cartera = [filename.split('_')[0] for filename in jsonfiles]
audio_fecha = [filename.split('_')[2] for filename in jsonfiles]
audio_hora = [filename.split('_')[3] for filename in jsonfiles]
rut = [filename.split('_')[1] for filename in jsonfiles]

In [12]:
# Select "speaker" who is "agent" and "speaker" who is "client"
# agent_speaker = [corpus_multi_dict[0]['speaker'] for corpus_multi_dict in corporea_multi_dict] # The first in talk is considered the 'agent_speaker'
agent_speaker = [list(dft[column])[2:].index(sorted(list(dft[column])[2:])[-1]) for column in dft.columns] # The 1st one that talk most is the 'agent_speaker'
client_speaker = [list(dft[column])[2:].index(sorted(list(dft[column])[2:])[-2]) for column in dft.columns] # The 2nd one that talk most is the 'client_speaker'
print(f'agent_speaker: {agent_speaker}')
print(f'client_speaker: {client_speaker}')

agent_speaker: [0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 2, 0, 0, 2, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0]
client_speaker: [2, 0, 0, 0, 0, 1, 0, 1, 0, 2, 0, 2, 0, 0, 0, 2, 1, 0, 0, 1, 1, 2, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 1, 2, 0, 0, 0, 2, 3, 0, 0, 0, 2, 1, 1, 3, 0, 0, 1, 1, 2, 2, 2, 0, 1, 1, 1, 0, 1, 2, 2, 0, 1, 0, 0, 0, 0, 3, 0, 1, 0, 2, 0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 0, 0, 0, 1, 0, 2, 1, 2, 0]


In [13]:
for i in range(len(df['Speaker_f'])):
    if agent_speaker[i] != client_speaker[i]:
        df['Speaker_f'][i+1] = agent_speaker[i]
        df['Speaker_s'][i+1] = client_speaker[i]
df = df.rename(columns={'Speaker_f':'agent_speaker', 'Speaker_s':'client_speaker'})
dft = df.T
dft

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
agent_speaker,0,,1,1,1,0,1,0,,1,,0,1,1,1,1,0,,,0,0,1,1,1,1,1,0,0,2,1,1,,0,1,2,2,,,,,0,1,0,1,,0,0,,1,1,1,0,0,1,,1,0,0,1,0,1,,2,0,0,2,0,0,0,1,0,,,,1,1,,0,1,1,,0,1,0,0,2,0,0,2,1,1,,1,1,0,1,1,0,1,
client_speaker,2,,0,0,0,1,0,1,,2,,2,0,0,0,2,1,,,1,1,2,0,2,0,2,1,1,1,0,0,,1,0,0,3,,,,,1,2,1,0,,2,3,,0,0,2,1,1,3,,0,1,1,2,2,2,,1,1,1,0,1,2,2,0,1,,,,0,3,,1,0,2,,2,2,1,1,0,1,1,0,2,0,,0,0,1,0,2,1,2,
Speaker_0,26,,1,3,2,3,1,5,,1,,41,11,3,8,0,34,,,46,46,0,1,2,2,1,19,40,2,1,14,,11,3,3,48,,,,,1,0,6,3,,45,27,,8,1,6,9,7,0,,6,9,11,0,280,48,,1,6,65,2,9,27,38,2,9,,,,10,0,,5,5,0,,8,0,44,29,5,47,26,3,1,5,,3,5,40,2,1,7,1,
Speaker_1,0,,4,59,9,2,8,4,,7,,0,23,6,24,44,1,,,9,7,11,2,28,12,6,4,4,3,2,21,,6,5,2,0,,,,,2,48,6,66,,0,0,,50,43,26,5,2,82,,9,2,1,65,0,103,,4,2,15,2,1,2,0,9,4,,,,44,62,,1,10,46,,0,84,11,12,2,6,6,2,22,10,,4,10,22,7,8,2,6,
Speaker_2,3,,1,0,0,0,1,0,,2,,16,0,1,0,22,0,,,0,0,4,0,5,0,2,0,0,19,0,0,,0,0,6,139,,,,,2,5,1,0,,6,0,,0,0,14,0,1,0,,0,0,0,17,149,65,,6,2,0,3,0,14,3,0,0,,,,0,0,,0,4,9,,8,55,0,0,10,3,0,4,3,2,,2,0,0,1,2,0,2,
Speaker_3,0,,0,0,0,0,0,0,,0,,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,0,0,0,81,,,,,0,0,0,0,,0,10,,0,0,0,0,0,23,,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,,,,0,15,,0,0,0,,0,0,0,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,


In [None]:
# # Swap manually the speakers
# agent_swap = []
# agent_swap = [5,12,14,21,22,24,26,29,32,34,35,41] # agents need to be corrected manually
# agent_speaker = list(df['agent_speaker'])
# client_speaker = list(df['client_speaker'])
# for id in agent_swap:
#     aux = agent_speaker[id-1]
#     agent_speaker[id-1] = client_speaker[id-1]
#     client_speaker[id-1] = aux
# df['agent_speaker'] = agent_speaker
# df['client_speaker'] = client_speaker
# dft = df.T
# dft

In [None]:
# # MANUAL EXTRACT DATA
# agent_type = ['outbound', 'outbound', 'outbound', 'outbound', 'outbound', 'outbound', 'inbound', 'outbound', 'outbound', 'outbound', 'outbound', 'outbound']
# # agent_speaker = [0 for i in range(len(agent_result))]
# agent_speaker = [0,0,0,0,0,0,0,1,0,1,0,0]
# # agent_result = ['compromiso', 'no_contacto', 'no_contacto', 'compromiso', 'no_contacto', 'reclamo_facturación', 'compromiso', 'reclamo_baja', 'reclamo_baja', 'reclamo_facturación', 'no_contacto', 'compromiso']

In [None]:
# # Dictionary of speaker n in multi_dic[i]
# n = 1
# i = 12
# [{word["word"],word["word_confidence"]} for word in corporea_multi_dict[i-1] if word['speaker']==n]

## Medida de riqueza lexica en un texto: 
$$ R_l = \frac{\text{total de palabras únicas}}{\text{total de palabras}} = \frac{\text{longitud del vocabulario}}{\text{longitud del texto}}$$ 

In [142]:
for i, corpus_multi_dict in enumerate(corporea_multi_dict):
    audio_metrics[i]['time_voices']             = sum((word['end']-word['begin']) for word in corpus_multi_dict)
    audio_metrics[i]['time_speaker_0']          = sum((word['end']-word['begin']) for word in corpus_multi_dict if word['speaker']==agent_speaker[i])
    audio_metrics[i]['time_speaker_1']          = sum((word['end']-word['begin']) for word in corpus_multi_dict if word['speaker']==client_speaker[i])
    audio_metrics[i]['voices_call_ratio']       = audio_metrics[i]['time_voices']/audio_metrics[i]['time_call'] if audio_metrics[i]['time_call'] != 0 else 0
    audio_metrics[i]['agent_call_ratio']        = audio_metrics[i]['time_speaker_0'] /audio_metrics[i]['time_call'] if audio_metrics[i]['time_call'] != 0 else 0
    audio_metrics[i]['agent_dominance']         = audio_metrics[i]['time_speaker_0'] /audio_metrics[i]['time_voices'] if audio_metrics[i]['time_voices'] != 0 else 0
    audio_metrics[i]['agent_words_sum']         = sum( 1 for word in corpus_multi_dict if word['speaker']==agent_speaker[i])
    audio_metrics[i]['client_words_sum']        = sum( 1 for word in corpus_multi_dict if word['speaker']==client_speaker[i])
    audio_metrics[i]['agent_words_per_second']  = audio_metrics[i]['agent_words_sum']/audio_metrics[i]['time_speaker_0'] if audio_metrics[i]['time_speaker_0'] != 0 else 0
    audio_metrics[i]['client_words_per_second'] = audio_metrics[i]['client_words_sum']/audio_metrics[i]['time_speaker_1'] if audio_metrics[i]['time_speaker_1'] != 0 else 0
    audio_metrics[i]['agent_speaker']           = agent_speaker[i]
    audio_metrics[i]['client_speaker']          = client_speaker[i]
    audio_metrics[i]['agent_name']              = agent_name[i]
    audio_metrics[i]['cartera']                 = audio_cartera[i]
    audio_metrics[i]['audio_fecha']             = audio_fecha[i]
    audio_metrics[i]['audio_hora']              = audio_hora[i]
    audio_metrics[i]['rut']                     = rut[i]
    audio_metrics[i]['nombre']                  = nombres[i]
    audio_metrics[i]['calidad']                 = calidad[i]
    # audio_metrics[i]['lexical_wealth ']         = len(set([word['word'] for word in corpus_multi_dict if word['speaker']==agent_speaker[i]]))/len([word['word'] for word in corpus_multi_dict if word['speaker']==agent_speaker[i]]) if len([word['word'] for word in corpus_multi_dict if word['speaker']==agent_speaker[i]]) != 0 else 0
    audio_metrics[i]['conversation']            = documents[i]
# audio_metrics[0]

In [106]:
# reconstruct dialogs new
timings = []
dialogs = []
conversations = []
speakers = []
for i, corpus_multi_dict in enumerate(corporea_multi_dict):
    begin = 0
    end = 0
    sentence = ''        
    timing = []
    dialog = []
    conversation = []
    speak = []
    agent = agent_speaker[i]
    client = client_speaker[i]
    try:
        speaker = corpus_multi_dict[0]['speaker']
        for j, word in enumerate(corpus_multi_dict):
            if word['speaker'] == speaker:
                sentence += f"{word['word']} "
            else:
                end = corpus_multi_dict[j-1]['end']
                timing.append([begin,end])
                if speaker == agent: label = 'agent'
                elif speaker == client: label = 'client'
                else: label = 'none'
                dialog.append({'file': i+1, 'pos':len(dialog)+1, 'speaker': speaker, 'label': label, 'sentence':sentence[:-1]})
                conversation.append(sentence[:-1])
                speak.append(speaker)
                sentence = f"{word['word']} "
                speaker = word['speaker']
                begin = word['begin']
        end = word['end']
    except:
        pass
    timing.append([begin,end])
    dialog.append({'file': i+1, 'pos':len(dialog)+1, 'speaker': speaker, 'label': label, 'sentence':sentence[:-1]})
    conversation.append(sentence[:-1])
    speak.append(speaker)
    timings.append(timing)
    dialogs.append(dialog)
    conversations.append(conversation)
    speakers.append(speak)

In [16]:
# dialogs_trans = {key:[] for key, value in dialogs[0][0].items()}
dialogs_trans = {'file':[], 'pos':[], 'agent':[], 'sentence':[]}
for dialog in dialogs:
    for sentence in dialog:
        for key, value in sentence.items():
            if key == 'label':
                if value == 'agent': dialogs_trans['agent'].append(1)
                else: dialogs_trans['agent'].append(0)
            elif key != 'speaker': dialogs_trans[key].append(value)

dialogs_df = pd.DataFrame()
for key, value in dialogs_trans.items():
    dialogs_df[key] = value
dialogs_df.head()

Unnamed: 0,file,pos,agent,sentence
0,1,1,1,hola buenos días es usted nelson habite adis
1,1,2,0,sí aló sí
2,1,3,0,le damos la bienvenida armó un tenemos atracti...
3,2,1,0,
4,3,1,0,hola


In [108]:
documents = []
for conversation in conversations:
    document = ""
    for sentence in conversation:
        document = f"{document}{sentence}. "
    document = document[:-1]
    documents.append(document)
len(documents)

100

In [143]:
# save metrics data
dialogs_df.to_csv(f"{mypath}/transcripts/results/dialogs.csv", index=False, encoding='utf8')
tosave = {  'audio_metrics'         : audio_metrics,
            'corporea'              : corporea,
            'corporea_multi'        : corporea_multi,
            'corporea_multi_dict'   : corporea_multi_dict,
            'dialogs'               : dialogs,
            'speakers'              : speakers}
for key, data in tosave.items():
    with open(f"{mypath}/transcripts/results/{key}.json", 'w', encoding='utf8') as json_file:
        json.dump(data, json_file, ensure_ascii=False)
        
# save audio_metrics to excel file
with open(f"{mypath}/transcripts/results/audio_metrics.json", 'r', encoding='utf8') as json_file:
    data = json.loads(json_file.read())
df_audio_metrics = pd.json_normalize(data)
df_audio_metrics.to_excel(f"{mypath}/transcripts/results/audio_metrics.xlsx",index=False)

In [None]:
# # individual save
# onlysave = {'audio_metrics': audio_metrics,}
# for key, data in onlysave.items():
#     with open(f"{mypath}/transcripts/results/{key}.json", 'w', encoding='utf8') as json_file:
#         json.dump(data, json_file, ensure_ascii=False)

# Text Analytics

In [None]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
import os

key = os.environ['AZURE_KEY']
endpoint = os.environ['AZURE_ENDPOINT']


In [None]:
def authenticate_client():
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential)
    return text_analytics_client
client = authenticate_client()

In [None]:
## SENTIMENT
def sentiment_analysis(client, documents=["Tuve el mejor día de mi vida. Desearía que hubieras estado ahí."]):
    sentiments = []
    for document in documents:
        response = client.analyze_sentiment([document])[0] #Llamada al servicio
        sentiments.append(response)
    return sentiments

In [None]:
## MODIFICAR
def sentiment_analysis_example(client, documents=["Tuve el mejor día de mi vida. Desearía que hubieras estado ahí."]):
    response = client.analyze_sentiment(documents) #Llamada al servicio
    print("Document Sentiment: {}".format(response.sentiment))
    print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
        response.confidence_scores.positive,
        response.confidence_scores.neutral,
        response.confidence_scores.negative,
    ))
    for idx, sentence in enumerate(response.sentences):
        print("Sentence: {}".format(sentence.text))
        print("Sentence {} sentiment: {}".format(idx+1, sentence.sentiment))
        print("Sentence score:\nPositive={0:.2f}\nNeutral={1:.2f}\nNegative={2:.2f}\n".format(
            sentence.confidence_scores.positive,
            sentence.confidence_scores.neutral,
            sentence.confidence_scores.negative,
        ))

In [None]:
sentiments = sentiment_analysis(client, documents)

In [None]:
numbers = []
for i in range(len(sentiments)):
    numbers.append(len(sentiments[i].sentences))

# API Datos Cliente

In [104]:
# traer_deuda(rut = rut[90],codemp = audio_cartera[90],pais = 152)
# traer_deuda(rut = '003421657-6',codemp = 'CAT',pais = 152)

In [90]:
import requests
import time
def traer_deuda(rut,codemp,pais):
    URL = 'https://api-recsaone.recsa.cl/api/malena/cliente-acciones'
    PARAMS = {'accion':'traer_deudas',
                'rut':rut,
                'cod_emp':codemp,
                'usuario':'voicebot',
                'cod_cli':'',
                'pais':pais}
    HEADERS = {'api-key':os.environ['API_RECSAONE'],
                'Content-Type':'application/json',
                'Accept':'application/json'}
    r = requests.post(url=URL, params=PARAMS, headers=HEADERS)
    data = r.json()
    return data

nombres = []
for i,r in enumerate(rut):
    data = traer_deuda(rut = r,codemp = audio_cartera[i],pais = 152)
    if i==50: time.sleep(40)
    try:
        nombres.append(data['Nombre'])
    except:
        nombres.append(data)

In [129]:
import re
from unicodedata import normalize

calidad = []
for i,document in enumerate(documents):
    if document == '.': calidad.append('Vacio')
    elif isinstance(nombres[i],dict): calidad.append('Error API')
    else:
        mencionado = False
        document_normalized = normalize('NFC', re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", normalize( "NFD", document), 0, re.I))
        for nombre in nombres[i].split():
            if nombre.lower() in document_normalized: mencionado = True
        if mencionado: calidad.append('Mencionado')
        else: calidad.append('No Mencionado')
len(calidad)


100

In [130]:
# NoMencionado_index = [i for i,cal in enumerate(calidad) if cal == 'No Mencionado']
# Mencionado_index = [i for i,cal in enumerate(calidad) if cal == 'Mencionado']
# print(NoMencionado_index)
# print(Mencionado_index)

In [131]:
# NoMencionado = [corporea[i] for i in NoMencionado_index]
# Mencionado = [corporea[i] for i in Mencionado_index]
# NoMencionado

# Docx dialogs

In [21]:
import docx
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING
from docx.shared import RGBColor

indent = 1.5
for n, conversation in enumerate(conversations):
    timing = timings[n]
    # sentences = sentiments[n].sentences
    document = Document()
    docx_name = audio_metrics[n]['file_name']
    metric = []
    metric.append(f"rut: {audio_metrics[n]['rut']}")
    metric.append(f"time_call: {audio_metrics[n]['time_call']:.2f} s")
    metric.append(f"agent_name: {audio_metrics[n]['agent_name']}")
    metric.append(f"cartera: {audio_metrics[n]['cartera']}")
    metric.append(f"agent_dominance: {audio_metrics[n]['agent_dominance']*100:.0f}%")
    metric.append(f"agent_words_per_second: {audio_metrics[n]['agent_words_per_second']:.2f} wps")
    metric.append(f"client_words_per_second: {audio_metrics[n]['client_words_per_second']:.2f} wps")
    # metric.append(f"conversation sentiment: {sentiments[n].sentiment}")
    # metric.append(f"overall scores: positive={sentiments[n].confidence_scores.positive:.2f}; neutral={sentiments[n].confidence_scores.neutral:.2f}; negative={sentiments[n].confidence_scores.negative:.2f}")

    paragraph = document.add_heading(level=0)
    paragraph_format = paragraph.paragraph_format
    paragraph_format.space_after = Pt(3)
    run = paragraph.add_run(f'{docx_name}')
    run.font.size = Pt(16)

    for m in metric:
        paragraph = document.add_paragraph()
        paragraph_format = paragraph.paragraph_format
        paragraph.add_run(str(m)).font.size = Pt(9)
        paragraph_format.space_after = Pt(0)
    paragraph_format.space_after = Pt(9)

    begin = 0
    left = True
    for s in speakers[n]:
        if s == agent_speaker[n] or s == client_speaker[n]:
            if s == agent_speaker[n]:
                left = False
            break
    for i, sentence in enumerate(conversation):
        if speakers[n][i] == agent_speaker[n] or speakers[n][i] == client_speaker[n]:
            paragraph = document.add_paragraph()
            # timestamp = f"{sentences[i].sentiment} | pos:{sentences[i].confidence_scores.positive:.2f} | neu:{sentences[i].confidence_scores.neutral:.2f} | neg:{sentences[i].confidence_scores.negative:.2f} | [{timing[i][0]:.1f}, {timing[i][1]:.1f}]s"
            timestamp = f"[{timing[i][0]:.1f}, {timing[i][1]:.1f}]s"
            if left:
                paragraph.add_run(f'{sentence}').font.color.rgb = RGBColor(0,176,80)
                paragraph.paragraph_format.right_indent = Inches(indent)
                paragraph.paragraph_format.space_after = Pt(0)
                paragraph = document.add_paragraph()
                paragraph.add_run(f'{i+1}. {timestamp}').font.size = Pt(7)
                paragraph.paragraph_format.right_indent = Inches(indent)
            else:
                paragraph.add_run(f'{sentence}').font.color.rgb = RGBColor(0,112,192)
                paragraph.paragraph_format.left_indent = Inches(indent)
                paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                paragraph.paragraph_format.space_after = Pt(0)
                paragraph = document.add_paragraph()
                paragraph.add_run(f'{i+1}. {timestamp}').font.size = Pt(7)
                paragraph.paragraph_format.left_indent = Inches(indent)
                paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            paragraph.paragraph_format.space_after = Pt(6)
            left = not left
    document.add_page_break()
    document.save(f"{mypath}/transcripts/docx/{docx_name}.docx")