In [21]:
import pandas as pd

df = pd.read_csv("arxiv_ml.csv")

print(df.head())

      id                                              title  \
0  58805  Advances in Asynchronous Parallel and Distribu...   
1  58806  Turbocharging Treewidth-Bounded Bayesian Netwo...   
2  58807  Crop Yield Prediction Integrating Genotype and...   
3  58808  Time Series Analysis and Forecasting of COVID-...   
4  58809  Movement Tracking by Optical Flow Assisted Ine...   

                                            abstract  
0    Motivated by large-scale optimization proble...  
1    We present a new approach for learning the s...  
2    Accurate prediction of crop yield supported ...  
3    Coronavirus disease 2019 (COVID-19) is a glo...  
4    Robust and accurate six degree-of-freedom tr...  


In [27]:
#Data Processing (Menghilangkan karakter, dsb)
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data, including 'punkt_tab'
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    #Untuk Penghilangan Lowercasing
    text = text.lower()

    #Untuk Karakter Spesial
    text = re.sub(r'\W', ' ', text)

    #Untuk Melakukan Tokenisasi
    tokens = word_tokenize (text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]  #Stopword removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  #Lemmatization
    return " ".join(tokens)

#Terapkan preprocessing ke dataset
df["processed_abstract"] = df["abstract"].apply(preprocess_text)

print(df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


      id                                              title  \
0  58805  Advances in Asynchronous Parallel and Distribu...   
1  58806  Turbocharging Treewidth-Bounded Bayesian Netwo...   
2  58807  Crop Yield Prediction Integrating Genotype and...   
3  58808  Time Series Analysis and Forecasting of COVID-...   
4  58809  Movement Tracking by Optical Flow Assisted Ine...   

                                            abstract  \
0    Motivated by large-scale optimization proble...   
1    We present a new approach for learning the s...   
2    Accurate prediction of crop yield supported ...   
3    Coronavirus disease 2019 (COVID-19) is a glo...   
4    Robust and accurate six degree-of-freedom tr...   

                                  processed_abstract  
0  motivated large scale optimization problem ari...  
1  present new approach learning structure treewi...  
2  accurate prediction crop yield supported scien...  
3  coronavirus disease 2019 covid 19 global publi...  
4  robust

In [28]:
#Membangun Sistem Retrieval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Inisialisasi model TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["processed_abstract"])

def retrieve_documents(query, top_k=5):
    query_vec = vectorizer.transform([preprocess_text(query)])  # Proses pertanyaan Alice
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()  # Hitung kemiripan
    top_indices = similarities.argsort()[-top_k:][::-1]  # Ambil top-k artikel teratas
    return df.iloc[top_indices][["title", "abstract"]]

# Contoh pertanyaan
query = "What is the latest development in deep learning?"
retrieved_docs = retrieve_documents(query)
print(retrieved_docs)


                                                   title  \
15586  Applications of Federated Learning in Smart Ci...   
23461  There is no data like more data -- current sta...   
14006  Label Augmentation via Time-based Knowledge Di...   
23562  A Comprehensive Survey on Community Detection ...   
18417        Model Complexity of Deep Learning: A Survey   

                                                abstract  
15586    Federated learning plays an important role i...  
23461    Annotated datasets have become one of the mo...  
14006    Detecting anomalies has become increasingly ...  
23562    A community reveals the features and connect...  
18417    Model complexity is a fundamental problem in...  


In [29]:
#Membangun Model Generasi Jawaban
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load model dan tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_answer(question, context):
    input_text = f"question: {question}  context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output_ids = model.generate(input_ids)
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

# Contoh penggunaan
question = "What is deep learning?"
context = retrieved_docs.iloc[0]["abstract"]  # Ambil abstrak artikel pertama
answer = generate_answer(question, context)
print(answer)

Internet of Things, transportation, communications, finance, medical and other fields


In [26]:
#Evaluasi Model (Response Relevancy)
from sklearn.metrics import recall_score

# Simulasi y_true (ground truth) dan y_pred (hasil sistem)
y_true = [1, 1, 0, 1, 0]  # Artikel relevan atau tidak (1=relevan, 0=tidak)
y_pred = [1, 1, 0, 0, 1]

recall_at_k = recall_score(y_true, y_pred, average='macro')
print(f'Recall@K: {recall_at_k}')


Recall@K: 0.5833333333333333


In [32]:
#Evaluasi Model (Context Precision)
from nltk.translate.bleu_score import sentence_bleu
!pip install rouge-score
from rouge_score import rouge_scorer #Import rouge_scorer

#Membangun Model Generasi Jawaban # Copy from ipython-input-13-9d21328a15ea and paste here
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd #Import pandas for reading the data
from sklearn.feature_extraction.text import TfidfVectorizer #Import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity #Import cosine_similarity
# Assuming 'preprocess_text' function is defined in a previous cell or imported
#from your_preprocessing_module import preprocess_text

# Instead of trying to import 'preprocess_text',
# simply copy and paste the function definition here
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data, including 'punkt_tab'
# (This might be redundant if already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    #Untuk Penghilangan Lowercasing
    text = text.lower()

    #Untuk Karakter Spesial
    text = re.sub(r'\W', ' ', text)

    #Untuk Melakukan Tokenisasi
    tokens = word_tokenize (text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]  #Stopword removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  #Lemmatization
    return " ".join(tokens)

# Define retrieve_documents function
def retrieve_documents(query, top_k=5):
    query_vec = vectorizer.transform([preprocess_text(query)])  # Proses pertanyaan Alice
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()  # Hitung kemiripan
    top_indices = similarities.argsort()[-top_k:][::-1]  # Ambil top-k artikel teratas
    return df.iloc[top_indices][["title", "abstract"]]

# Assuming 'df' and 'preprocess_text' are defined in previous cells
# Load the dataset
df = pd.read_csv("arxiv_ml.csv")

# Apply preprocessing to create the 'processed_abstract' column
df["processed_abstract"] = df["abstract"].apply(preprocess_text)

# Initialize TF-IDF model
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["processed_abstract"])

# Load model dan tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_answer(question, context):
    input_text = f"question: {question}  context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output_ids = model.generate(input_ids)
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer

# Contoh penggunaan
query = "What is the latest development in deep learning?" #Define query before using it
retrieved_docs = retrieve_documents(query) #Call retrieve_documents to define retrieved_docs
print(retrieved_docs)

question = "What is deep learning?"
context = retrieved_docs.iloc[0]["abstract"]  # Ambil abstrak artikel pertama
answer = generate_answer(question, context) #This will define answer variable in current scope
print(answer)
#End of copy

reference = ["Deep learning is a subset of machine learning."]
candidate = [answer]  # Jawaban dari model

# Hitung BLEU
bleu_score = sentence_bleu(reference, candidate)
print(f'BLEU Score: {bleu_score}')

# Hitung ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference[0], candidate[0])
print(f'ROUGE Scores: {scores}')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                   title  \
15586  Applications of Federated Learning in Smart Ci...   
23461  There is no data like more data -- current sta...   
14006  Label Augmentation via Time-based Knowledge Di...   
23562  A Comprehensive Survey on Community Detection ...   
18417        Model Complexity of Deep Learning: A Survey   

                                                abstract  
15586    Federated learning plays an important role i...  
23461    Annotated datasets have become one of the mo...  
14006    Detecting anomalies has become increasingly ...  
23562    A community reveals the features and connect...  
18417    Model complexity is a fundamental problem in...  
Internet of Things, transportation, communications, finance, medical and other fields
BLEU Score: 0
ROUGE Scores: {'rouge1': Score(precision=0.1, recall=0.125, fmeasure=0.11111111111111112), 'rougeL': Score(precision=0.1, recall=0.125, fmeasure=0.11111111111111112)}
