In [1]:
from google.colab import drive
drive.mount ('/content/drive')

Mounted at /content/drive


In [2]:
%cd '/content/drive/MyDrive/chatbot skripsi'

/content/drive/MyDrive/chatbot skripsi


In [3]:
import nltk
import numpy as np
import string
import warnings
warnings.filterwarnings("ignore")

In [4]:
f = open('content.txt','r',errors = 'ignore', encoding = 'utf-8')
paragraph = f.read()

In [5]:
greetings = ['Hey', 'Hello', 'Hi', 'It’s great to see you', 'Nice to see you', 'Good to see you']
bye = ['Bye', 'Bye-Bye', 'Goodbye', 'Have a good day','Stop']
thank_you = ['Thanks', 'Thank you', 'Thanks a bunch', 'Thanks a lot.', 'Thank you very much', 'Thanks so much', 'Thank you so much']
thank_response = ['You\'re welcome.', 'No problem.', 'No worries.', 'y pleasure.', 'It was the least I could do.', 'Glad to help.']

In [6]:
nltk.download('punkt')   # for first-time use only
nltk.download('wordnet')    # for first-time use only

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
sent_tokens = nltk.sent_tokenize(paragraph)
word_tokens = nltk.word_tokenize(paragraph)

In [8]:
sent_tokens[:1]

['Jadwal operasional atau jam buka Dalmy Gym saat ini adalah setiap hari mulai dari jam 15.00 hingga 22.00.']

In [9]:
word_tokens[:7]

['Jadwal', 'operasional', 'atau', 'jam', 'buka', 'Dalmy', 'Gym']

In [10]:
import re

def categorize_question(question):
    # R1: Jadwal operasional
    if re.search(r"(jadwal|jadwal operasional|operasional|buka|jam buka)", question):
        return "jadwal_operasional"

    # R2: Biaya member
    if re.search(r"(biaya|member|biaya member|harga|harga khusus|daftar)", question):
        return "biaya_member"

    # R3: Peraturan gym
    if re.search(r"(peraturan|peraturan gym|untuk peraturan gym|aturan)", question):
        return "peraturan_gym"

    # R4: Panduan latihan
    if re.search(r"(panduan|latihan|panduan latihan|umum|panduan latihan secara umum|pemula)", question):
        return "panduan_latihan"

    # R5: Daftar makanan bodybuilder
    if re.search(r"(makanan|daftar makanan|bodybuilder|daftar makanan untuk bodybuilder)", question):
        return "daftar_makanan_bodybuilder"

    # Default: Pertanyaan tidak terklasifikasi
    return "tidak_terklasifikasi"

def answer_question(question):
    category = categorize_question(question)

    with open("content.txt", "r") as f:
        content = f.read()

    if category == "jadwal_operasional":
        return content.split("\n\n")[0]
    elif category == "biaya_member":
        return content.split("\n\n")[1]
    elif category == "peraturan_gym":
        return content.split("\n\n")[2]
    elif category == "panduan_latihan":
        return content.split("\n\n")[3]
    elif category == "daftar_makanan_bodybuilder":
        return content.split("\n\n")[4]
    else:
        return "Maaf, saya tidak mengerti pertanyaan Anda."


In [11]:
# Lemmitization

lemmer = nltk.stem.WordNetLemmatizer()

In [12]:
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]    # iterate through every token and lemmatize it

In [13]:
# string.punctuation has all the punctuations
# ord(punct) convert punctuation to its ASCII value
# dict contains {ASCII: None} for punctuation mark

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

# remove_punct_dict

In [14]:
# This will return the word to LemTokens after Word tokenize, lowering its case and removing punctuation mark
# translate will find punctuation mark in remove_punct_dict and if found replace it with None

def Normalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer   # For Tfid Vectorizer
from sklearn.metrics.pairwise import cosine_similarity   # For cosine similarity

In [16]:
def response(user_response):
    robo_response = ''

    sent_tokens.append(user_response)   # Appending the Question user ask to sent_tokens to find the Tf-Idf and cosine_similarity between User query and the content.
    TfidfVec = TfidfVectorizer(tokenizer = Normalize, stop_words='english')    #tokenizer ask about Pre-processing parameter and it will consume the Normalize() function and it will also remove StopWords
    tfidf = TfidfVec.fit_transform(sent_tokens)

    vals = cosine_similarity(tfidf[-1], tfidf)    # It will do cosine_similarity between last vectors and all the vectors because last vector contain the User query
    idx = vals.argsort()[0][-2]     # argsort() will sort the tf_idf in ascending order. [-2] means second last index i.e. index of second highest value after sorting the cosine_similarity. Index of last element is not taken as query is added at end and it will have the cosine_similarity with itself.

    flat = vals.flatten()   # [[0,...,0.89,1]] -> [0,...,0.89,1] this will make a single list of vals which had list inside a list.
    flat.sort()
    req_tfidf = flat[-2]  # this contains tfid value of second highest cosine similarity

    if(req_tfidf == 0):    # 0 means there is no similarity between the question and answer
        robo_response = robo_response + "Saya minta maaf! saya tidak mengerti maksud anda, bisa diketik ulang?"
        return robo_response

    else:
        robo_response = robo_response + sent_tokens[idx]    # return the sentences at index -2 as answer
        return robo_response

In [17]:
import random

def bot_initialize(user_msg):
    flag=True
    while(flag==True):
        user_response = user_msg
        if(user_response not in bye):
            if(user_response == '/start'):
                bot_resp = """Halo! untuk memudahkan cara berkomunikasi dengan bot ini kamu bisa tanyakan tentang :
                \n - Jadwal Operasional,
                \n - Biaya Member,
                \n - Peraturan Gym,
                \n - Panduan Latihan Secara Umum,
                \n - dan Daftar Makanan untuk Bodybuilder."""
                return bot_resp
            elif(user_response in thank_you):
                bot_resp = random.choice(thank_response)
                return bot_resp
            elif(user_response in greetings):
                bot_resp = random.choice(greetings) + ", Apa informasi yang ingin kamu ketahui tentang Dalmy Gym"
                return bot_resp
            else:
                user_response = user_response.lower()
                category = categorize_question(user_response)
                if category != "tidak_terklasifikasi":
                    bot_resp = answer_question(user_response)
                else:
                    bot_resp = response(user_response)
                if user_response in sent_tokens:
                    idx = sent_tokens.index(user_response)  # find the index of user_response
                    sent_tokens.pop(idx)  # remove user_response from the list
                return bot_resp
        else:
            flag = False
            bot_resp = random.choice(bye)
            return bot_resp

In [18]:
import requests
import json

class telegram_bot():
    def __init__(self):
        self.token = "6717781776:AAHLBLT7ExCgooIm1Eo-2iiedr1PTf_UfaE"    #write your token here!
        self.url = f"https://api.telegram.org/bot{self.token}"

    def get_updates(self,offset=None):
        url = self.url+"/getUpdates?timeout=100"   # In 100 seconds if user input query then process that, use it as the read timeout from the server
        if offset:
            url = url+f"&offset={offset+1}"
        url_info = requests.get(url)
        return json.loads(url_info.content)

    def send_message(self,msg,chat_id):
        url = self.url + f"/sendMessage?chat_id={chat_id}&text={msg}"
        if msg is not None:
            requests.get(url)

    def grab_token(self):
        return tokens

In [None]:
tbot = telegram_bot()

update_id = None

def make_reply(msg):     # user input will go here

    if msg is not None:
        reply = bot_initialize(msg)     # user input will start processing from bot_initialize function
    return reply

while True:
    print("...")
    updates = tbot.get_updates(offset=update_id)
    updates = updates['result']
    print(updates)
    if updates:
        for item in updates:
            update_id = item["update_id"]
            print(update_id)
            try:
                if 'message' in item:
                    message = item["message"]["text"]
                    from_ = item["message"]["from"]["id"]
                elif 'edited_message' in item:
                    message = item["edited_message"]["text"]
                    from_ = item["edited_message"]["from"]["id"]
                print(message)
            except:
                message = None
            reply = make_reply(message)
            tbot.send_message(reply,from_)

...
[]
...
[]
...
[{'update_id': 8270069, 'message': {'message_id': 850, 'from': {'id': 628764258, 'is_bot': False, 'first_name': 'hwhw', 'username': 'dwirestuan', 'language_code': 'en'}, 'chat': {'id': 628764258, 'first_name': 'hwhw', 'username': 'dwirestuan', 'type': 'private'}, 'date': 1718273149, 'text': '/start', 'entities': [{'offset': 0, 'length': 6, 'type': 'bot_command'}]}}]
8270069
/start
...
[{'update_id': 8270070, 'message': {'message_id': 852, 'from': {'id': 628764258, 'is_bot': False, 'first_name': 'hwhw', 'username': 'dwirestuan', 'language_code': 'en'}, 'chat': {'id': 628764258, 'first_name': 'hwhw', 'username': 'dwirestuan', 'type': 'private'}, 'date': 1718273156, 'text': 'jadwal'}}]
8270070
jadwal
...
[{'update_id': 8270071, 'edited_message': {'message_id': 852, 'from': {'id': 628764258, 'is_bot': False, 'first_name': 'hwhw', 'username': 'dwirestuan', 'language_code': 'en'}, 'chat': {'id': 628764258, 'first_name': 'hwhw', 'username': 'dwirestuan', 'type': 'private'}, 