In [None]:
!pip install wikipedia
!pip install transformers
!pip install keybert

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

train = pd.read_csv("/content/drive/MyDrive/datasets/AIIJC/2-NLP/train_data.csv")[:50000]
test = pd.read_csv("/content/drive/MyDrive/datasets/AIIJC/2-NLP/test.csv")
sample = pd.read_csv("/content/drive/MyDrive/datasets/AIIJC/2-NLP/sample_submission.csv")

In [4]:
from transformers import pipeline
from keybert import KeyBERT
import wikipedia as wiki
from tqdm.notebook import tqdm
import warnings
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer


class QuestionAnswering():
    def __init__(self, train_data, transformer_name='bert-large-uncased-whole-word-masking-finetuned-squad', reserve_transformer='deepset/bert-large-uncased-whole-word-masking-squad2', min_kw_len=3, lang='en', ignore_warnings=True):
        if ignore_warnings: warnings.simplefilter("ignore")
        self.transformer = pipeline(task='question-answering', model=transformer_name, tokenizer=transformer_name)
        self.reserve_transformer = pipeline(task='question-answering', model=reserve_transformer, tokenizer=transformer_name)
        self.kw_model = KeyBERT()
        self.train = train_data
        self.min_kw_len = min_kw_len
        self.vectorizer = TfidfVectorizer()
        self.vectorizer.fit(self.train['question'].tolist())
        self.corpus_v = self.vectorizer.transform(self.train['question'].tolist()).toarray()
        wiki.set_lang(lang)

    
    def __repr__(self):
        return "Question Answering class"


    def find_key_word(self, sentence, ngrams=(1,3)):
        return self.kw_model.extract_keywords(sentence, keyphrase_ngram_range=ngrams, stop_words=None)[0][0]


    def search_on_wiki(self, topic):
        page = wiki.page(''.join(topic.split()))
        return page.content

    
    def cosine_answer(self, question):
        text_v = self.vectorizer.transform([question]).toarray()
        max = {'value' : 0, 'id': 0}
        for i in range(len(self.corpus_v)):
            cos = 1 - cosine(text_v[0], self.corpus_v[i])
            if cos > max['value']:
                max['value'] = cos
                max['id'] = i

        return self.train['answer'][max['id']]


    def find_wiki_data(self, question):
        all_data = []
        keyw = []
        for i in range(self.min_kw_len, len(question.split())):
            try:
                keywords = self.find_key_word(question, (len(question.split()) - i-1, len(question.split()) - i))
                keyw.append(keywords)
                data = wiki.summary(keywords)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        for keywords in keyw:
            try:
                kw = wiki.search(keywords, results=1)
                data = wiki.summary(kw)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        for keywords in keyw:
            try:
                data = self.search_on_wiki(keywords)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        if len(all_data) > 0:
            return ' '.join(all_data)

        for keywords in keyw:
            try:
                kw = wiki.search(keywords, results=1)
                data = self.search_on_wiki(kw)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        keyw = []
        for i in list(range(0, self.min_kw_len))[::-1]:
            try:
                keywords = self.find_key_word(question, (len(question.split()) - i-1, len(question.split()) - i))
                keyw.append(keywords)
                data = wiki.summary(keywords)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        for keywords in keyw:
            try:
                data = self.search_on_wiki(keywords)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        for keywords in keyw:
            try:
                kw = wiki.search(keywords, results=1)
                data = wiki.summary(kw)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue

        for keywords in keyw:
            try:
                kw = wiki.search(keywords, results=1)
                data = self.search_on_wiki(kw)
                if len(data) > 0:
                    return data
                else:
                    continue
            except:
                continue
        
        for keywords in keyw:
            try:
                kws = wiki.search(keywords, results=20)
                for kw in kws:
                    data = self.search_on_wiki(kw)
                    if len(data) > 0:
                        return data
                    else:
                        continue
            except:
                continue

        return 0


    def give_answer(self, question):
        context = self.find_wiki_data(question)
        if context == 0: return self.cosine_answer(question)
        answer = self.transformer({
            'context' : context,
            'question' : question})['answer']
        if answer != '.': return answer
        else:
            answer = self.reserve_transformer({
                'context' : context,
                'question' : question})['answer']
            if answer != '.': return answer
            else:
                return self.cosine_answer(question)


    def predict(self, corpus):
        answers = []
        for question in tqdm(corpus):
            ans = self.give_answer(question)
            answers.append(ans)
        return answers

In [5]:
corpus = test['question'].tolist()

QA = QuestionAnswering(train)
# answers = QA.predict(corpus)

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/540 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import requests

requests.get("https://api.telegram.org/bot2057911206:AAGQfUTD03Vd0N6dhhJcgec1F-gMPmX0wqo/sendMessage?chat_id=799213094&text=Prediction is ready!")

<Response [200]>

In [None]:
submission = sample.copy()
submission['answer'] = answers
submission.to_csv("submission_13.csv")

In [None]:
submission = sample.copy()
submission['answer'] = answers
submission.to_csv("/content/drive/MyDrive/datasets/AIIJC/submission_13.csv")

In [6]:
!pip install speechrecognition 
!pip install pydub

Collecting speechrecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 57 kB/s 
[?25hInstalling collected packages: speechrecognition
Successfully installed speechrecognition-3.8.1
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [10]:
!pip install pytelegrambotapi

Collecting pytelegrambotapi
  Downloading pyTelegramBotAPI-4.1.1.tar.gz (102 kB)
[?25l[K     |███▏                            | 10 kB 41.4 MB/s eta 0:00:01[K     |██████▍                         | 20 kB 9.2 MB/s eta 0:00:01[K     |█████████▋                      | 30 kB 8.1 MB/s eta 0:00:01[K     |████████████▊                   | 40 kB 7.8 MB/s eta 0:00:01[K     |████████████████                | 51 kB 4.1 MB/s eta 0:00:01[K     |███████████████████▏            | 61 kB 4.4 MB/s eta 0:00:01[K     |██████████████████████▎         | 71 kB 4.6 MB/s eta 0:00:01[K     |█████████████████████████▌      | 81 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████▊   | 92 kB 5.3 MB/s eta 0:00:01[K     |███████████████████████████████▉| 102 kB 5.8 MB/s eta 0:00:01[K     |████████████████████████████████| 102 kB 5.8 MB/s 
Building wheels for collected packages: pytelegrambotapi
  Building wheel for pytelegrambotapi (setup.py) ... [?25l[?25hdone
  Created wheel for pyt

In [None]:
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence
import telebot
from time import time
import os

%cd content
%rmdir voice_msgs
%mkdir voice_msgs


def get_large_audio_transcription(path):
    r = sr.Recognizer()
    sound = AudioSegment.from_ogg(path)  
    chunks = split_on_silence(sound,
        min_silence_len = 500,
        silence_thresh = sound.dBFS-14,
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    for i, audio_chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                return 0
            else:
                text = f"{text.capitalize()}. "
                whole_text += text
    return whole_text

bot = telebot.TeleBot("2048868243:AAFnM9HA_kRW8_NYMjelhKbkQECsxP9Drpk")

@bot.message_handler(commands=['start'])
def start_handler(message):
    bot.send_message(message.chat.id, "Hi, it's QA bot. You can ask your questions :)")

@bot.message_handler(content_types=['text'])
def text_handler(message):
    answer = QA.give_answer(message.text)
    bot.send_message(message.chat.id, answer)

@bot.message_handler(content_types=['voice'])
def voice_processing(message):
    file_info = bot.get_file(message.voice.file_id)
    downloaded_file = bot.download_file(file_info.file_path)
    with open(f'voice_msgs/{message.chat.id}_{int(time())}.ogg', 'wb') as new_file:
        new_file.write(downloaded_file)
    name = f'voice_msgs/{message.chat.id}_{int(time())}.ogg'
    transcription = get_large_audio_transcription(name)
    if transcription == 0:
        bot.send_message(message.chat.id, 'Could\'t recognize your voice')
    else:
        transcription = transcription[:-2]+'?'
        bot.send_message(message.chat.id, f'Your voice was recognized as: {transcription}')
        answer = QA.give_answer(transcription)
        bot.send_message(message.chat.id, answer)

print('Bot started!')
bot.polling()

[Errno 2] No such file or directory: 'content'
/content
rmdir: failed to remove 'voice_msgs': Directory not empty
mkdir: cannot create directory ‘voice_msgs’: File exists
Bot started!
