# read files

In [1]:
import pymssql
import pymysql as mysql
import os
import re
import pandas as pd
import datetime

In [2]:
class stt_server:

    def __init__(self, cpu_id):

        # settings ++
        self.cpu_id = cpu_id
        self.cpu_cores = [i for i in range(0,15)]

        # ms sql
        self.sql_name = 'voice_ai'
        self.sql_server = '10.2.4.124'
        self.sql_login = 'ICECORP\\1c_sql'

        # mysql
        self.mysql_name = {
            1: 'MICO_96',
            2: 'asterisk',
        }
        self.mysql_server = '10.2.4.146'
        self.mysql_login = 'asterisk'

        self.script_path = '/home/alex/projects/call_centre_stt_server/'
        self.model_path = '/home/alex/projects/vosk-api/python/example/model'
        self.source_id = 0
        self.sources = {
            'call': 1,
            'master': 2,
        }
        self.original_storage_path = {
            1: '/mnt/share/audio_call/',
            2: '/mnt/share/audio_master/REC_IN_OUT/',
        }
        self.original_storage_prefix = {
            1: 'RXTX_',
            2: '',
        }
        self.temp_file_path = self.script_path+'files/'
        # settings --

        self.temp_file_name = ''
        self.original_file_path = ''
        self.original_file_name = ''
        self.original_file_duration	= 0
        self.date_y = ''
        self.date_m = ''
        self.date_d = ''
        self.rec_date = ''

        #store pass in file, to prevent pass publication on gitdelete_current_queue
        with open(self.script_path+'sql.pass','r') as file:
            self.sql_pass = file.read().replace('\n', '')
            file.close()

        with open(self.script_path+'mysql.pass','r') as file:
            self.mysql_pass = file.read().replace('\n', '')
            file.close()

        self.conn = self.connect_sql()
        self.mysql_conn = {
            1: self.connect_mysql(1),
            2: self.connect_mysql(2),
        }

    def connect_sql(self):

        return pymssql.connect(
            server = self.sql_server,
            user = self.sql_login,
            password = self.sql_pass,
            database = self.sql_name,
            #autocommit=True
        )

    def connect_mysql(self, source_id):

        return mysql.connect(
            host = self.mysql_server, 
            user = self.mysql_login, 
            passwd = self.mysql_pass,
            db = self.mysql_name[source_id],
            #autocommit = True
        )

    def get_fs_files_list(self):

        self.original_file_path = self.original_storage_path[self.source_id]
        self.original_file_path += self.original_storage_prefix[self.source_id]
        if self.source_id == self.sources['call']:
            self.original_file_path += self.date_y + '-' + self.date_m + '/' + self.date_d + '/'
        files_list = []
        for (dirpath, dirnames, filenames) in os.walk(self.original_file_path):
            files_list.extend(filenames)
            break

        # get record date
        fd_list = []
        for filename in files_list:

            rec_date = 'Null'

            if self.source_id == self.sources['call']:
                rec_source_date = re.findall(r'\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}', filename)[0]
                if len(rec_source_date):
                    rec_date = rec_source_date[:10] + ' ' + rec_source_date[11:].replace('-', ':')
                if len(re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', rec_date)) == 0:
                    rec_date = 'Null'
                    print('Unable to extract date from filename', filename)

            elif self.source_id == self.sources['master']:
                uniqueid = re.findall(r'^\d*.\d*', filename)[0]
                cursor = self.mysql_conn[self.source_id].cursor()
                query = "select calldate from cdr where uniqueid = '" + uniqueid + "' limit 1;"
                cursor.execute(query)
                for row in cursor.fetchall():
                    rec_date = str(row[0])

            fd_list.append({
                'filename': filename,
                'rec_date': rec_date
            })

            #break  # todo: REMOVE

        df = pd.DataFrame(fd_list)
        df.sort_values(['rec_date', 'filename'], ascending=True, inplace=True)

        return df.values
    
    def get_sql_complete_files(self):

        cursor = self.conn.cursor()
        
        if self.source_id == self.sources['call']:
            sql_query = "select distinct filename from queue where "
            sql_query += "source_id='"+str(self.source_id)+"' and "
            sql_query += "date_y='"+self.date_y+"' and "
            sql_query += "date_m='"+self.date_m+"' and "
            sql_query += "date_d='"+self.date_d+"' "
            sql_query += "union all "
            sql_query += "select distinct audio_file_name from transcribations where "
            sql_query += "date_y='"+self.date_y+"' and "
            sql_query += "date_m='"+self.date_m+"' and "
            sql_query += "date_d='"+self.date_d+"' "
            sql_query += "order by filename;"
        
        elif self.source_id == self.sources['master']:
            sql_query = "select distinct filename from queue "
            sql_query += "where source_id='"+str(self.source_id)+"' "
            sql_query += "order by filename;"
            
        cursor.execute(sql_query)
        complete_files = []
        for row in cursor.fetchall():
            complete_files.append(row[0])

        return complete_files
    
    def get_source_id(self, source_name):
        for source in self.sources.items():
            if source[0] == source_name:
                return source[1]
        return 0
    
    def set_today_ymd(self):
        self.date_y	= datetime.datetime.today().strftime('%Y')
        self.date_m	= datetime.datetime.today().strftime('%m')
        self.date_d	= datetime.datetime.today().strftime('%d')

In [135]:
server_object = stt_server(0)
server_object.set_today_ymd()
for source_id in server_object.sources: # ['call', 'master']
    #server_object.source_id = server_object.sources['call']
    server_object.source_id = server_object.get_source_id(source_id)
    complete_files	= server_object.get_sql_complete_files()
    incomplete_count = 0
    complete_count = 0
    print('server_object.source_id', server_object.source_id)
    for filename, rec_date in server_object.get_fs_files_list():
        print(filename, rec_date)
    break

server_object.source_id 1
in_74954833140_2021-02-10-07-01-45rxtx.wav 2021-02-10 07:01:45
in_74957237230_2021-02-10-07-03-10rxtx.wav 2021-02-10 07:03:10
in_74957237230_2021-02-10-07-03-19rxtx.wav 2021-02-10 07:03:19
in_74957237230_2021-02-10-07-03-27rxtx.wav 2021-02-10 07:03:27
in_74957237230_2021-02-10-07-03-51rxtx.wav 2021-02-10 07:03:51
in_74957237230_2021-02-10-07-04-11rxtx.wav 2021-02-10 07:04:11
in_79267167834_2021-02-10-07-04-11rxtx.wav 2021-02-10 07:04:11
in_4954833140_2021-02-10-07-05-22rxtx.wav 2021-02-10 07:05:22
in_4957237230_2021-02-10-07-08-02rxtx.wav 2021-02-10 07:08:02
in_4957237230_2021-02-10-07-08-09rxtx.wav 2021-02-10 07:08:09
in_4957237230_2021-02-10-07-08-20rxtx.wav 2021-02-10 07:08:20
in_4957237230_2021-02-10-07-08-28rxtx.wav 2021-02-10 07:08:28
in_4957237230_2021-02-10-07-08-38rxtx.wav 2021-02-10 07:08:38
in_4957237230_2021-02-10-07-08-48rxtx.wav 2021-02-10 07:08:48
in_4957237230_2021-02-10-07-08-58rxtx.wav 2021-02-10 07:08:58
in_4957237230_2021-02-10-07-09-06rxtx

In [117]:
complete_files

[]

In [86]:
server_object = stt_server(0)
server_object.source_id = 2
fd_list = server_object.get_fs_files_list()
fd_list

[{'filename': '1612426929.1440376-in.wav', 'rec_date': '2021-02-04 11:22:09'},
 {'filename': '1612426929.1440376-out.wav', 'rec_date': '2021-02-04 11:22:09'},
 {'filename': '1612426931.1440380-in.wav', 'rec_date': '2021-02-04 11:22:11'},
 {'filename': '1612426931.1440380-out.wav', 'rec_date': '2021-02-04 11:22:11'},
 {'filename': '1612426943.1440386-in.wav', 'rec_date': '2021-02-04 11:22:23'},
 {'filename': '1612426943.1440386-out.wav', 'rec_date': '2021-02-04 11:22:23'},
 {'filename': '1612424788.1439425-in.wav', 'rec_date': '2021-02-04 10:46:28'},
 {'filename': '1612424788.1439425-out.wav', 'rec_date': '2021-02-04 10:46:28'},
 {'filename': '1612424799.1439439-in.wav', 'rec_date': '2021-02-04 10:46:39'},
 {'filename': '1612424799.1439439-out.wav', 'rec_date': '2021-02-04 10:46:39'},
 {'filename': '1612424800.1439441-in.wav', 'rec_date': '2021-02-04 10:46:40'},
 {'filename': '1612424800.1439441-out.wav', 'rec_date': '2021-02-04 10:46:40'},
 {'filename': '1612424835.1439453-in.wav', 're

In [103]:
df = pd.DataFrame(fd_list)
df.sort_values(['rec_date', 'filename'], ascending=False, inplace=True)

In [104]:
#df[df.rec_date=='Null']
for f,d in df.values:
    print(f,d)
    break

1612947058.1565426-out.wav Null


In [106]:
filename = '1612946589.1564912-in.wav'
uniqueid = re.findall(r'^\d*.\d*', filename)[0]
uniqueid

'1612946589.1564912'

In [105]:
cursor = server_object.mysql_conn[server_object.source_id].cursor()
query = "select calldate from cdr where uniqueid = '"+uniqueid+"' limit 1;"
cursor.execute(query)
for row in cursor.fetchall():
    #rec_date = str(row[0])
    print(row[0])

In [56]:
def get_source_id(source_name):
    for source in server_object.sources.items():
        if source[0] == source_name:
            return source[1]
    return 0
get_source_id('master')

2

In [74]:
server_object.rec_date = 'Null'
filename = 'in_4957237230_2021-02-10-07-16-03rxtx.wav'
rec_source_date = re.findall(r'\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}', filename)[0]                
if len(rec_source_date):
    server_object.rec_date = rec_source_date[:10] + ' ' + rec_source_date[11:].replace('-', ':')
if len(re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', server_object.rec_date)) == 0:
    print('Unable to extract date from filename', filename)
server_object.rec_date

'2021-02-10 07:16:03'

In [71]:
rec_source_date

'2021-02-10-07-16-03'

In [72]:
server_object.rec_date

'2021-02-10 07:16:03'

In [73]:
re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', server_object.rec_date)

['2021-02-10 07:16:03']

In [57]:
for source_id in server_object.sources:
    print(get_source_id(source_id))

1
2


In [67]:
server_object = stt_server(0)
server_object.source_id = 1

In [68]:
files = server_object.get_fs_files_list()
len(files), files[0]

IndexError: list index out of range

In [66]:
files

[{'filename': '1612426929.1440376-in.wav', 'rec_date': '2021-02-04 11:22:09'}]

In [26]:
complete = server_object.get_sql_complete_files()
len(complete)

0

### queue show columns

In [39]:
cursor = server_object.conn.cursor()
query = "SELECT column_name FROM information_schema.columns WHERE table_name='queue';"
cursor.execute(query)
for row in cursor.fetchall():
        print(row)

('date',)
('cpu_id',)
('filepath',)
('date_y',)
('date_m',)
('date_d',)
('filename',)
('duration',)
('record_date',)
('source_id',)


### get record date from cdr

In [30]:
cursor = server_object.mysql_conn[server_object.source_id].cursor()
query = "SELECT column_name FROM information_schema.columns WHERE table_name='cdr';"
cursor.execute(query)
for row in cursor.fetchall():
        print(row)

('calldate',)
('clid',)
('src',)
('dst',)
('dcontext',)
('channel',)
('dstchannel',)
('lastapp',)
('lastdata',)
('duration',)
('billsec',)
('disposition',)
('amaflags',)
('accountcode',)
('uniqueid',)
('userfield',)
('did',)
('recordingfile',)
('calldate',)
('clid',)
('src',)
('dst',)
('dcontext',)
('channel',)
('dstchannel',)
('lastapp',)
('lastdata',)
('duration',)
('billsec',)
('disposition',)
('amaflags',)
('accountcode',)
('uniqueid',)
('userfield',)
('did',)
('recordingfile',)
('calldate',)
('clid',)
('src',)
('dst',)
('dcontext',)
('channel',)
('dstchannel',)
('lastapp',)
('lastdata',)
('duration',)
('billsec',)
('disposition',)
('amaflags',)
('accountcode',)
('uniqueid',)
('userfield',)
('did',)
('recordingfile',)
('id',)
('calldate',)
('clid',)
('src',)
('dst',)
('dcontext',)
('channel',)
('dstchannel',)
('lastapp',)
('lastdata',)
('duration',)
('billsec',)
('disposition',)
('amaflags',)
('accountcode',)
('uniqueid',)
('peeraccount',)
('linkedid',)
('sequence',)
('userfield',)

In [96]:
server_object.source_id = 2
server_object.original_file_name = files[0] #debug
uniqueid = re.findall(r'^\d*.\d*', server_object.original_file_name)[0]
print('uniqueid', uniqueid)
cursor = server_object.mysql_conn[server_object.source_id].cursor()
query = "select calldate from cdr where uniqueid = '1612424788.1439425' limit 1;"
cursor.execute(query)
for row in cursor.fetchall():
    print(row[0])

IndexError: list index out of range

### delete from queue

In [11]:
cursor = server_object.conn.cursor()
sql_query = "delete from queue;"
cursor.execute(sql_query)
server_object.conn.commit() # autocommit

### select from queue

In [12]:
server_object = stt_server(0)
server_object.source_id = 2

cursor = server_object.conn.cursor()
sql_query = "select count(filename) from queue where source_id = '"+str(server_object.source_id)+"';"
#sql_query = "select filename, record_date, cpu_id, duration, source_id from queue "
#sql_query += "where source_id='" + str(server_object.source_id) + "' "
#sql_query += "where duration=60 "
#sql_query += "order by record_date;"
cursor.execute(sql_query)
for row in cursor.fetchall():
    #print(row[0], row[1], row[2], row[3])
    print(row[0])

0


In [111]:
server_object.get_sql_complete_files()

['1612426929.1440376-in.wav']

# other

In [1]:
import datetime

In [42]:
cur_date = datetime.datetime.now()
DD = datetime.timedelta(days=int(365 / 2))
crop_date = cur_date - DD
cur_date_y = crop_date.strftime("%Y")
cur_date_m = crop_date.strftime("%m")
cur_date_d = crop_date.strftime("%d")
cur_date_y, cur_date_m, cur_date_d

('2020', '07', '13')

In [35]:
cur_date = datetime.datetime.now()
cur_date

datetime.datetime(2021, 1, 11, 14, 26, 53, 499098)

In [40]:
DD = datetime.timedelta(days=int(365/2))
crop_date = cur_date - DD
crop_date

datetime.datetime(2020, 7, 13, 14, 26, 53, 499098)

In [23]:
#datetime.datetime.now().year
datetime.datetime.now().strftime("%Y")

'2021'

In [33]:
#datetime.datetime.now().month
datetime.datetime.now().strftime("%m")

'25'

In [46]:
cur_date = datetime.datetime.strptime("2021-04-12T07:00:00Z","%Y-%m-%dT%H:%M:%SZ")
DD = datetime.timedelta(days=int(365 / 2))
crop_date = cur_date - DD
#datetime.datetime.strptime(str(datetime.datetime.now()),"%m")
crop_date

datetime.datetime(2020, 10, 12, 7, 0)

In [28]:
#datetime.datetime.now().day
datetime.datetime.now().strftime("%d")

'11'

In [126]:
import pandas as pd
from init_server import stt_server
from deeppavlov import build_model, configs
import numpy as np

In [127]:
BATCH_SIZE = 3

In [128]:
server_object = stt_server(0)

In [129]:
query = """
    select top """+str(BATCH_SIZE)+""" 
    id,
    text,
    sentiment
    from transcribations 
    where sentiment is NULL and text!=''
    """

In [130]:
df = pd.read_sql(query, server_object.conn)

model = build_model(configs.classifiers.rusentiment_bert, download=True) #download first time
res = model(df.text)
df['sentiment'] = model(df.text)

In [131]:
print(df)

   id                                               text sentiment
0   2                   единая служба сэр спрятана отары      None
1   3                                     карельская дом      None
2   4  все наши операторы заняты пожалуйста оставайте...      None


In [132]:
df['sentiment'] = ['neutral','negative','skip']

In [133]:
df

Unnamed: 0,id,text,sentiment
0,2,единая служба сэр спрятана отары,neutral
1,3,карельская дом,negative
2,4,все наши операторы заняты пожалуйста оставайте...,skip


In [139]:
for index, row in df.iterrows():
    print(index, row.id,row.sentiment)

0 2 neutral
1 3 negative
2 4 skip


In [134]:
for row in df.values:
    print(row.sentiment)

AttributeError: 'numpy.ndarray' object has no attribute 'sentiment'

In [144]:
for index, row in df.iterrows():
    if row.sentiment == 'negative':
        neg = 1
        pos = 0
    else:
        neg = 0
        pos = 1
    query = "update transcribations set "
    query += "sentiment = '"+row.sentiment+"', "
    query += "sentiment_neg = "+str(neg)+", "
    query += "sentiment_pos = "+str(pos)+" "
    query += "where id = "+str(row.id)
    break

In [145]:
query

"update transcribations set sentiment = 'neutral', sentiment_neg = 0, sentiment_pos = 1 where id = 2"

In [83]:
df['sentiment_neg']=np.zeros(len(df))
df['sentiment_pos']=np.zeros(len(df))

In [84]:
sentiments = pd.DataFrame(columns=['sentiment','sentiment_neg','sentiment_pos'])

In [85]:
sentiments = sentiments.append(pd.DataFrame({
    'sentiment':
        [
            'negative',
            'positive',
            'neutral',
            'speech',
            'skip'
        ],
    'sentiment_neg':
        [
            1,0,0,0,0
        ],
    'sentiment_pos':
        [
            0,1,1,1,1
        ]
}))

In [86]:
sentiments

Unnamed: 0,sentiment,sentiment_neg,sentiment_pos
0,negative,1,0
1,positive,0,1
2,neutral,0,1
3,speech,0,1
4,skip,0,1


In [96]:
df

Unnamed: 0,id,text,sentiment,sentiment_neg,sentiment_pos
0,2,единая служба сэр спрятана отары,neutral,0.0,0.0
1,3,карельская дом,negative,0.0,0.0
2,4,все наши операторы заняты пожалуйста оставайте...,skip,0.0,0.0


In [92]:
df[df.sentiment=='negative'].sentiment_neg=np.ones(len(df[df.sentiment=='negative']))

In [103]:
df[df.sentiment=='negative']

Unnamed: 0,id,text,sentiment,sentiment_neg,sentiment_pos
1,3,карельская дом,negative,0.0,0.0


In [94]:
df[df.sentiment=='negative'].sentiment_neg=[1]

In [93]:
df[df.sentiment=='negative'].sentiment_neg

1    0.0
Name: sentiment_neg, dtype: float64

In [59]:
df.set_value[df.sentiment=='negative']['sentiment_neg']=np.ones(len(df[df.sentiment=='negative']))
#df[df.sentiment!='negative']['sentiment_pos']=1
#df.fillna(0)
#df.set_value('C', 'x', 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [53]:
df[df.sentiment=='negative']['sentiment_neg']

1    0.0
Name: sentiment_neg, dtype: float64