### Contradiction

In [1]:
import json
import os
from typing import List, Union
import torch

from expert.core.contradiction import contradiction_analysis


In [2]:
# Prefer the GPU when CUDA is usable and fall back to the CPU otherwise, so the
# notebook still runs top-to-bottom on machines without a CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
contr_det = contradiction_analysis.ContradictionDetector(transcription_path = '/home/izs/Projects/ml-server/app/libs/contradiction/words.json',
                                                         path_to_video='sdafs',
                                                          save_to='./check',
                                                            lang='en', device=device)

In [7]:
contr_det.get_contradiction('cat')

[{'text': "so for the past twenty years i've been helping malaysians and other southeast asians to speak better english and through training thousands of southeast asians i've discovered a very surprising truth i've discovered that how well somebody communicate in english actually has very little to do with their english level it has a lot to do with their attitude towards english",
  'predict': 2.0,
  'interval': 0},
 {'text': 'there are people out there who have a very very low level of english and they can communicate very very well one of them that i remember was a student of participant of my name face out he was a factory supervisor english level very very low but this guy could just sit and listen to anybody very calmly clue he really and then he could respond absolutely express his thoughts beautifully at a very low level of english so today i want to share with you what is',
  'predict': 2.0,
  'interval': 0},
 {'text': 'so different about people like finds out how do they do 

In [None]:
contr_det = contradiction_analysis.ContradictionDetector(lang='en', device=device)
contr_det.get_contradiction('cat', '/home/izs/Projects/ml-server/app/libs/contradiction/words.json',
'csds',
'./check'
)

[{'text': "so for the past twenty years i've been helping malaysians and other southeast asians to speak better english and through training thousands of southeast asians i've discovered a very surprising truth i've discovered that how well somebody communicate in english actually has very little to do with their english level it has a lot to do with their attitude towards english",
  'predict': 2.0,
  'interval': 0},
 {'text': 'there are people out there who have a very very low level of english and they can communicate very very well one of them that i remember was a student of participant of my name face out he was a factory supervisor english level very very low but this guy could just sit and listen to anybody very calmly clue he really and then he could respond absolutely express his thoughts beautifully at a very low level of english so today i want to share with you what is',
  'predict': 2.0,
  'interval': 0},
 {'text': 'so different about people like finds out how do they do 

In [5]:
contr_det_ru = contradiction_analysis.ContradictionDetector(lang='ru', device='cuda')

In [6]:
speech = """Не нужна фундаментальная подготовка"""

In [7]:
contr_det_ru.get_contradiction(speech, '/home/izs/Projects/expert/expert/core/words.json',
'csds',
'./check_rus'
)

[{'text': 'То, что было в советской системе, конечно надо понимать, это фундаментальная подготовка. Фундаментальность подготовки, она действительно всегда была. Поэтому я здесь считаю, что фундаментальная подготовка, она обязательно должна быть. А добавляться будет конкретными теми профессиональными стандартами, которые создает уже работодатель.',
  'predict': 2.0,
  'interval': 0}]

### Aggression

In [1]:
from expert.core.aggression.audio_aggression import audio_analysis

In [2]:
audio = audio_analysis.AudioAggression(audio='/home/izs/Videos/simplescreenrecorder-2023-02-22_12.08.11.mp4', stamps=[[14, 39]])

In [3]:
r = audio.get_report()

In [4]:
r

([{'time_sec': 14, 'volume': 0.05858, 'dynamic_changes': 0, 'temp': 125.0},
  {'time_sec': 24, 'volume': 0.0564, 'dynamic_changes': 0, 'temp': 133.929},
  {'time_sec': 34, 'volume': 0.05519, 'dynamic_changes': 0, 'temp': 117.188}],
 [{'loud part': 0.33, 'fast_part': 0.33}])

In [None]:
# NOTE(review): this cell has never been executed (execution count is None) and
# it uses `plt`, `sns` and `exp_data`, none of which are imported or defined in
# the visible part of this notebook, so it raises NameError on a fresh kernel.
# It plots a "congruence" column, which does not appear in the audio-aggression
# report shown above. TODO: import the plotting libraries and define `exp_data`,
# or remove the cell.
figure, ax = plt.subplots(figsize=(50, 10))
sns.set_theme(style="whitegrid")
sns.barplot(data=exp_data, x="con_time_sec", y="congruence", color="#FFC043", alpha=0.75)

In [2]:
audio = audio_analysis.AudioAggression(audio='/home/izs/Projects/auto_censor/examples/rus_sample.wav')

In [3]:
audio.get_report()

([{'time_sec': 0, 'volume': 0.02273, 'dynamic_changes': 0, 'temp': 104.167}],
 [{'loud part': 1.0, 'fast_part': 1.0}])

In [4]:
print(audio.average_temp,
audio.average_vol,
audio.loud_part,
audio.fast_part)

104.16666666666667 0.02272891253232956 1.0 1.0


### Text aggression

In [1]:
from __future__ import annotations

import json
import pandas as pd
from typing import List, Tuple, Union
import torch

# Local dependencies:
from expert.core.aggression.text_aggression.text_models_en import Depreciation as DepreciationEn
from expert.core.aggression.text_aggression.text_models_en import Imperative as ImperativeEn
from expert.core.aggression.text_aggression.text_models_en import Toxic as ToxicEn

from expert.core.aggression.text_aggression.text_models_ru import Depreciation as DepreciationRu
from expert.core.aggression.text_aggression.text_models_ru import Imperative as ImperiveRu
from expert.core.aggression.text_aggression.text_models_ru import Toxic as ToxicRu


class TextAggression:
    """Collect text-aggression markers for one or several text fragments.

    Three detectors are applied to every fragment: depreciation, imperative
    mood and toxicity. The English or Russian detector classes are chosen
    according to ``lang``.

    Args:
        fragments: A single fragment or a list of fragments to analyse.
        lang: Language of the fragments, ``'en'`` or ``'ru'``.
        device: Accepted for interface compatibility only. It is currently
            not applied: the ``device`` property always reports the CPU.

    Raises:
        NameError: If ``lang`` is neither ``'en'`` nor ``'ru'``.
        ValueError: If ``fragments`` is neither a ``str`` nor a ``list``.
    """

    def __init__(
        self,
        fragments: Union[str, List[str]],
        lang: str = 'en',
        device: Union[torch.device, None] = None
    ) -> None:
        if lang not in ['en', 'ru']:
            raise NameError(f"Unsupported language {lang!r}; expected 'en' or 'ru'")

        self.lang = lang
        # NOTE: the requested ``device`` is intentionally not used yet; the
        # detectors are created inside the ``is_*`` methods and are never
        # moved to another device.
        self._device = torch.device("cpu")
        # Raw per-detector results of the latest ``get_report`` call:
        # [depreciation, imperative, toxic].
        self.analysis_result = []
        self.fragments = fragments
        # Per-fragment report rows of the latest ``get_report`` call.
        self.div_aud_agg = []
        if not isinstance(self.fragments, (str, list)):
            raise ValueError("Fragments should be str or list")

    def _apply_to_fragments(self, check) -> list:
        """Run ``check`` on every fragment and return one result per fragment.

        A single ``str`` is treated as exactly one fragment.

        Raises:
            ValueError: If ``self.fragments`` is neither ``str`` nor ``list``.
        """
        if isinstance(self.fragments, str):
            return [check(self.fragments)]
        if isinstance(self.fragments, list):
            return [check(fragment) for fragment in self.fragments]
        raise ValueError('Input sents must be str or list')

    def _unsupported_language(self) -> NameError:
        """Build the error raised when ``self.lang`` has no detector."""
        return NameError(f"Unsupported language {self.lang!r}; expected 'en' or 'ru'")

    def is_imperative(self) -> List[bool]:
        """Return the imperative detector's verdict for every fragment."""
        if self.lang == 'en':
            imperative = ImperativeEn()
        elif self.lang == 'ru':
            imperative = ImperiveRu()
        else:
            raise self._unsupported_language()
        return self._apply_to_fragments(imperative.is_imperative)

    def is_toxic(self) -> List[bool]:
        """Return the toxicity detector's verdict for every fragment."""
        if self.lang == 'en':
            toxic = ToxicEn()
        elif self.lang == 'ru':
            toxic = ToxicRu('/expert/expert/core/aggression/text_aggression/aggressive-rus/app/data/rubert-toxic-detection')
        else:
            raise self._unsupported_language()
        return self._apply_to_fragments(toxic.is_toxic)

    def is_depreciation(self) -> list:
        """Return the depreciation score of every fragment.

        For English the detector's return value is used as is; for Russian
        the score is the length of the second element of the detector's
        result.
        """
        if self.lang == 'en':
            depreciation = DepreciationEn()
            return self._apply_to_fragments(depreciation.is_deprication)
        if self.lang == 'ru':
            depreciation = DepreciationRu()
            return self._apply_to_fragments(
                lambda fragment: len(depreciation.is_depreciation(fragment)[1])
            )
        raise self._unsupported_language()

    @property
    def device(self) -> torch.device:
        """Check the device type."""
        return self._device

    def get_report(self):
        """Run all detectors and summarise the results.

        The report is rebuilt from scratch on every call, so calling this
        method repeatedly yields the same result instead of accumulating
        rows from earlier calls.

        Returns:
            A tuple ``(per_fragment, summary)``. ``per_fragment`` holds one
            dict per fragment with the keys ``'num_deprecation'``,
            ``'is_imperative'`` and ``'is_toxic'``. ``summary`` holds the
            share of fragments flagged by each detector under the keys
            ``'toxic_part'``, ``'imperative_part'`` and
            ``'deprecation_part'``; all shares are ``0.0`` when there are
            no fragments.
        """
        depreciation = self.is_depreciation()
        imperative = self.is_imperative()
        toxic = self.is_toxic()

        # Replace (not extend) the stored state so a second call does not mix
        # in results from the first one.
        self.analysis_result = [depreciation, imperative, toxic]
        self.div_aud_agg = [
            {'num_deprecation': dep, 'is_imperative': imp, 'is_toxic': tox}
            for dep, imp, tox in zip(depreciation, imperative, toxic)
        ]

        deprecation_part = sum(1 for row in self.div_aud_agg if row['num_deprecation'] > 0)
        imperative_part = sum(1 for row in self.div_aud_agg if row['is_imperative'])
        toxic_part = sum(1 for row in self.div_aud_agg if row['is_toxic'])

        total = len(self.div_aud_agg)

        def _share(count: int) -> float:
            # Guard against an empty fragment list (would divide by zero).
            return count / total if total else 0.0

        self.full_aud_agg = {'toxic_part': _share(toxic_part),
                             'imperative_part': _share(imperative_part),
                             'deprecation_part': _share(deprecation_part)
                             }

        return (
            self.div_aud_agg,
            self.full_aud_agg
        )

2023-02-22 15:30:52.659189: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-22 15:30:52.659310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/izs/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
import json

# Load the transcript that is passed to `get_phrases` further down.
# The encoding is given explicitly so that non-ASCII (e.g. Cyrillic) words are
# decoded identically on every platform instead of using the locale default.
# NOTE(review): absolute, user-specific path — make it configurable before
# sharing this notebook.
with open('/home/izs/Projects/expert/expert/core/aggression/text_aggression/words.json', encoding='utf-8') as f:
    all_words = json.load(f)

In [5]:
def get_phrases(all_words: list, duration: int = 10) -> list:
    """Split transcribed text into segments of a fixed length.

    Words are taken in order. Each phrase starts with a time budget of
    ``duration``; the first word spends its own length, every following word
    spends the time elapsed since the previous word ended. The phrase is
    closed as soon as the budget is used up, so a phrase may overshoot
    ``duration`` by its last word.

    The input list is only read, never modified.

    Args:
        all_words (List): All stamps with words from the transcribed text.
            Every item must provide the keys ``"word"``, ``"start"`` and
            ``"end"``.
        duration (int, optional): Length of intervals for extracting phrases
            from speech, in the same units as the word stamps. Defaults to 10.

    Returns:
        list: One dict per phrase with ``"time"`` (``[start, end]`` of the
        phrase) and ``"text"`` (its words joined with single spaces).

    Raises:
        AssertionError: If fewer than two words are given.
    """
    assert len(all_words) > 1, "Not enough words in text."

    phrases = []
    total = len(all_words)
    position = 0

    while position < total:
        first = all_words[position]
        position += 1

        text = first["word"]
        end_time = first["end"]
        time_left = duration - (first["end"] - first["start"])

        # Keep adding words while there is budget left. A first word that
        # already exceeds the budget simply forms a phrase on its own.
        while time_left > 0 and position < total:
            word = all_words[position]
            position += 1
            text = text + " " + word["word"]
            time_left -= word["end"] - end_time
            end_time = word["end"]

        # `end_time` always belongs to the last word of *this* phrase, also
        # when the phrase consists of its first word only.
        phrases.append({"time": [first["start"], end_time], "text": text})

    return phrases

In [10]:
ph = get_phrases(all_words)

In [14]:
ph = [ttt['text'] for ttt in ph]

In [19]:
test = TextAggression(fragments=ph, lang='ru', device='cuda')

In [20]:
test.get_report()

([{'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
  {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
  {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False}],
 {'toxic_part': 0.0, 'imperative_part': 0.0, 'deprecation_part': 0.0})

In [1]:
text  = """You know, fuck you, American democracy is under attack because the defeated former President of the United States refuses to accept the results of the 2020 election. He refuses to accept the will of the people. He refuses to accept the fact that he lost. He has abused his power cling trutte and put the loyalty to himself before loyally the Constitution. Open the doors, HAL, please.< And he's made a big lie, an article of faith in the Magier Republican Party, the minority of that party. Write it down, the great irony about the 2020 election is that it's the most attacked election in our history. Stop it, will you. And yet, and yet. There's no election in our history that we can be more certain of its results. Every legal little cutie challenge that could have been brought was brought. Every recount that could have been undertaken was undertaken. Every recount confirmed the results. This institution, this intimidation, this violence against Democrats, Republicans, and nonpartisan officials just doing their jobs are the consequence of lies told for power and profit, lies of conspiracy and malice, lies repeated over and over to generate a cycle of anger, hate, vitriol, and even violence. In this shit moment, we have to confront those lies with the truth. You have to do it. Stop The very future of our nation depends on it. My fellow Americans, we're facing a defining moment, an inflection point, when we must with one overwhelming unified voice speak as a country and say there's no place, no place, for voter intimidation or political violence in America, whether it's directed at Democrats or Republicans. """
sents = text.split('.')

In [2]:
sents

['You know, fuck you, American democracy is under attack because the defeated former President of the United States refuses to accept the results of the 2020 election',
 ' He refuses to accept the will of the people',
 ' He refuses to accept the fact that he lost',
 ' He has abused his power cling trutte and put the loyalty to himself before loyally the Constitution',
 ' Open the doors, HAL, please',
 "< And he's made a big lie, an article of faith in the Magier Republican Party, the minority of that party",
 " Write it down, the great irony about the 2020 election is that it's the most attacked election in our history",
 ' Stop it, will you',
 ' And yet, and yet',
 " There's no election in our history that we can be more certain of its results",
 ' Every legal little cutie challenge that could have been brought was brought',
 ' Every recount that could have been undertaken was undertaken',
 ' Every recount confirmed the results',
 ' This institution, this intimidation, this violence a

In [12]:
from expert.core.aggression.text_aggression.text_analysis import TextAggression

In [4]:
test = TextAggression(fragments=sents[:-1], device='cpu')

In [5]:
a, b = test.get_report()



2023-02-17 17:43:00,990 loading file /home/izs/.flair/models/pos-english-fast/36f7923039eed4c66e4275927daaff6cd275997d61d238355fb1fe0338fe10a1.ff87e5b4e47fdb42a0c00237d9506c671db773e0a7932179ace82e584383a1b8
2023-02-17 17:43:01,146 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD
Token[5]: "cling" → NN (0.9573)


In [6]:
a

[{'num_deprecation': 0, 'is_imperative': False, 'is_toxic': True},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 1, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': True, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'i

In [7]:
b

{'toxic_part': 0.1111111111111111,
 'imperative_part': 0.05555555555555555,
 'deprecation_part': 0.05555555555555555}

In [8]:
text_ru = ' студентов и тмон, которые сделали... Нет, на самом деле это данные соответствующей реальности, это данные соответствующих сайтов, где милые дамы оставляют свои геотеги, квартиры, где они находятся. Так вот, если посмотреть на эти данные, видно, что все скопления желтые пятны, это все скопление вокруг станции метро, то есть совсем не в звероподобный водитель джипа, которому приехать куда угодно, и являются основными пользы всем. А пользы в тлиметро, которые больше, то есть, фактически даже такие загадочные процессы, которые никто не видит, они тоже зависят в городской транспортной сети. Понятно, что где есть потоки людей, там появляются и предложения. Так вот, эта гипотеза сама по себе, она может быть заложена в модели, которые и определяют потоки грязных, криминальных денег в Санкт-Петербурге. Сюда вкладывается все, то есть, от бабушек с семечками у метро, которые просто не торгуют, не лицензировано, и таксистов с поленой водкой багажники, до проститута, и наркотиков и жизнью. Вот здесь вот голубые места, скопления микро... голубые точки, давайте, коллеги, я еще раз перезапущу, голубые облакают скопление микрантов, а точки это посвяну и данные, это звонки зарегистрированные.'
sents_rus = text_ru.split('.')

In [9]:
# Drop the empty strings produced by splitting on '.' in a single pass.
# Slice assignment keeps the same list object, so the update stays in place.
sents_rus[:] = [sentence for sentence in sents_rus if sentence != '']

In [10]:
test = TextAggression(fragments=sents_rus, lang='ru', device='cpu')

In [11]:
test.analysis_result

[]

In [12]:
a, b = test.get_report()

In [13]:
a

[{'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 1, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 1, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 4, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 0, 'is_imperative': False, 'is_toxic': False},
 {'num_deprecation': 2, 'is_imperative': False, 'is_toxic': False}]

In [14]:
b

{'toxic_part': 0.0,
 'imperative_part': 0.0,
 'deprecation_part': 0.4444444444444444}

### STT

In [1]:
from expert.data.annotation import speech_to_text

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package omw-1.4 to /home/ismirnov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
file_path = '/home/ismirnov/workspace/ml-server/app/libs/videos/ru_en_example.mp4'

In [3]:
ttt = speech_to_text.transcribe_video(file_path, lang='ru', model='server')

In [4]:
all_words = speech_to_text.get_all_words(ttt)

In [5]:
all_words[0][-1]

{'text': 'affordable.', 'start': 213.3, 'end': 213.78, 'confidence': 0.992}

In [6]:
all_words

([{'text': 'Рядом', 'start': 0.2, 'end': 0.5, 'confidence': 0.708},
  {'text': 'со', 'start': 0.5, 'end': 0.6, 'confidence': 0.987},
  {'text': 'мной', 'start': 0.6, 'end': 0.82, 'confidence': 0.989},
  {'text': 'находится', 'start': 0.82, 'end': 1.38, 'confidence': 0.975},
  {'text': 'менеджер', 'start': 1.38, 'end': 1.98, 'confidence': 0.996},
  {'text': 'этого', 'start': 1.98, 'end': 2.2, 'confidence': 0.826},
  {'text': 'салона', 'start': 2.2, 'end': 2.62, 'confidence': 0.967},
  {'text': 'магазина', 'start': 2.62, 'end': 3.14, 'confidence': 0.825},
  {'text': 'Зои', 'start': 3.14, 'end': 3.48, 'confidence': 0.514},
  {'text': 'Виксельштейн,', 'start': 3.48, 'end': 4.0, 'confidence': 0.86},
  {'text': 'которая', 'start': 4.02, 'end': 4.28, 'confidence': 0.917},
  {'text': 'любезно', 'start': 4.28, 'end': 4.7, 'confidence': 0.934},
  {'text': 'согласилась', 'start': 4.7, 'end': 5.16, 'confidence': 0.995},
  {'text': 'ответить', 'start': 5.16, 'end': 5.64, 'confidence': 0.993},
  {'t

In [7]:
speech_to_text.between_timestamps(all_words[0], 10, 40)

'именно Simon Cheng? В мире моды сегодня насчитывают огромное количество дизайнеров. Почему предпочитаете работать с этим человеком? Перех он били колебный человек. Его дизайн могут носить любого возраста, любого шейп, сайс, хайт и он timeless. Мне вовешить, я бы мне ввешить, что я помню первый сют, когда мне было 18 лет, мы же обинчивы сют. У меня еще не есть, просто сфант, remember.'

In [58]:
from typing import List, Dict

def between_timestamps(all_words: List, start: float, end: float) -> str:
    """Return the text spoken between two timestamps.

    The selection starts at the last word whose ``'start'`` is not later than
    ``start`` (or at the first word if there is none) and stops before the
    first word whose ``'end'`` is not earlier than ``end``. If no word ends at
    or after ``end``, the selection runs to the last word.

    Args:
        all_words: Word stamps; every item must provide ``'text'``,
            ``'start'`` and ``'end'``. The lookup assumes the items are
            ordered by time.
        start: Beginning of the requested window.
        end: End of the requested window.

    Returns:
        The selected words joined with single spaces, or an empty string if
        there are no words or the window begins after the last word ended.
    """
    from bisect import bisect_left, bisect_right

    if not all_words:
        return ''

    starts = [elem['start'] for elem in all_words]
    ends = [elem['end'] for elem in all_words]

    # The window lies entirely after the transcript: nothing was spoken in it.
    if start >= ends[-1]:
        return ''

    # Last word starting at or before `start`; clamp to the first word when
    # the window opens before the transcript does.
    start_idx = max(bisect_right(starts, start) - 1, 0)
    # First word ending at or after `end` (exclusive bound); equals
    # len(all_words) when the window extends past the last word.
    end_idx = bisect_left(ends, end)

    return ' '.join(elem['text'] for elem in all_words[start_idx:end_idx])

In [57]:
all_words[0][47], all_words[0][61]

({'text': 'Его', 'start': 19.98, 'end': 20.16, 'confidence': 0.828},
 {'text': 'вовешить,', 'start': 29.8, 'end': 30.46, 'confidence': 0.193})

In [56]:
between_timestamps(all_words[0], 20, 30)

47 61


'Его дизайн могут носить любого возраста, любого шейп, сайс, хайт и он timeless. Мне'