### Import Libraries

In [1]:
# torch and KeyBERT to extract keywords
# torch must be imported before keybert: importing keybert first has been
# observed to cause kernel issues.
import torch
from keybert import KeyBERT

# standard library
import datetime
import string

# data handling
import numpy as np
import pandas as pd

# nltk for NLP preprocessing
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# date stamp (day_month_year) used in the name of the exported file
now = datetime.datetime.today().strftime('%d_%m_%Y')

# English stop-word list from NLTK
# NOTE(review): this presumably needs the NLTK 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) - confirm on a fresh env.
stopwords = nltk.corpus.stopwords.words('english')

# initialize KeyBERT model (default embedding model)
kw_model = KeyBERT()

### Read Data

In [2]:
# Load the article export, discard fully duplicated rows (first occurrence is
# kept, original index labels are preserved), then keep only the author name,
# SINTA id and article title.
df = (
    pd.read_csv("article_09_08_2022.csv", encoding='utf-8')
    .drop_duplicates()
    .loc[:, ['name', 'sinta_id', 'title']]
)
df.head()

Unnamed: 0,name,sinta_id,title
0,ENDI SUHENDI,5976806,Model of a tunneling current in a p-n junction...
1,ENDI SUHENDI,5976806,Improving Students' Science Process Skills thr...
2,ENDI SUHENDI,5976806,Alleviating students’ misconceptions about new...
3,ENDI SUHENDI,5976806,Preface: MSCEIS 2016
4,ENDI SUHENDI,5976806,The transformation of two-tier test into four-...


In [3]:
# Summarise the frame: index range, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10122 entries, 0 to 11230
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      10122 non-null  object
 1   sinta_id  10122 non-null  int64 
 2   title     10122 non-null  object
dtypes: int64(1), object(2)
memory usage: 316.3+ KB


### NLP Preprocessing

In [4]:
# Stop words used by `preprocessing`: NLTK's English list plus two extra
# tokens. Built once as a frozenset so the corpus is not re-read, and
# not scanned as a list, for every single title.
_EXTRA_STOP_WORDS = ('database', '20112020')
_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english')).union(_EXTRA_STOP_WORDS)

# Translation table that deletes every character in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# define preprocessing function
def preprocessing(text):
    """Normalise one article title for keyword extraction.

    Steps, in order:
      1. delete every character found in ``string.punctuation``;
      2. lower-case the text;
      3. rewrite "alqur’an" / "qur’an" (typographic apostrophe, which is not
         part of ``string.punctuation``) to "alquran" / "quran";
      4. tokenize with ``word_tokenize``;
      5. drop tokens that are stop words;
      6. re-join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw title. Any non-string value (e.g. NaN from a missing CSV cell)
        is treated as an empty title.

    Returns
    -------
    str
        The cleaned title; "" when nothing remains or the input is not a string.
    """
    # Missing titles arrive from pandas as float NaN; iterating over them
    # would raise TypeError, so treat them as empty instead.
    if not isinstance(text, str):
        return ""

    title = text.translate(_PUNCTUATION_TABLE).lower()
    title = title.replace("alqur’an", "alquran").replace("qur’an", "quran")

    # NOTE(review): punctuation is removed before stop-word filtering, so
    # stop words that contain an apostrophe presumably never match here.
    kept = [
        token
        for token in word_tokenize(title)
        # second condition mirrors the original final filter, which dropped
        # any token that is a substring of string.punctuation
        if token not in _STOP_WORDS and token not in string.punctuation
    ]
    return " ".join(kept)

In [5]:
# Overwrite the 'title' column with its cleaned version, one title at a time
df['title'] = df['title'].apply(preprocessing)

In [6]:
# Collapse the frame to one row per ('name', 'sinta_id') pair: the titles of
# each pair are concatenated with '. ' and the group keys become columns again.
df = (
    df.groupby(['name', 'sinta_id'])['title']
    .apply('. '.join)
    .reset_index()
)
df.head()

Unnamed: 0,name,sinta_id,title
0,LINDA SETIAWATI,5995101,development computer teaching materials educat...
1,A SOBANDI,5994552,knowledge management process knowledge sharing...
2,A. BUDHI SALIRA,6771601,readiness online learning covid19 pandemic voc...
3,AAH AHMAD SYAHID,5993425,alsumud wa altakayyuf wa altathaquf tabir huwi...
4,AAM ABDUSSALAM,5993560,decade value education model bibliometric stud...


In [7]:
# Show the full joined title string stored under index label 0
df.loc[0, 'title']

'development computer teaching materials education management applications elearning module. hard skills versus soft skills affect different job types japanese language graduates. development automatic system icmls 20 improving educational technology competences industrial revolution 40. developing integrated management information system research study institute research community services universitas pendidikan indonesia'

In [8]:
# Preview the first rows of df
df.head()

Unnamed: 0,name,sinta_id,title
0,LINDA SETIAWATI,5995101,development computer teaching materials educat...
1,A SOBANDI,5994552,knowledge management process knowledge sharing...
2,A. BUDHI SALIRA,6771601,readiness online learning covid19 pandemic voc...
3,AAH AHMAD SYAHID,5993425,alsumud wa altakayyuf wa altathaquf tabir huwi...
4,AAM ABDUSSALAM,5993560,decade value education model bibliometric stud...


### Extract Keywords

In [9]:
# define the keyword extraction helper, then apply it to every row
def extract_kw(x):
    """Return the key phrases of one document as a comma-separated string.

    The document is passed to ``kw_model.extract_keywords`` with:

    keyphrase_ngram_range=(3, 3)
        minimum and maximum number of words per candidate phrase; (3, 3)
        means every extracted phrase consists of exactly three words.
    stop_words='english'
        language of the stop words removed before candidates are built.
    use_mmr=True, diversity=0.5
        select phrases with Maximal Marginal Relevance; ``diversity``
        controls how dissimilar the selected phrases should be.
    top_n=3
        number of key phrases returned for the document.

    Parameters
    ----------
    x : str
        Text to extract key phrases from.

    Returns
    -------
    str
        The extracted phrases joined with ', ' (scores are discarded).
    """
    keywords = kw_model.extract_keywords(
        x,
        keyphrase_ngram_range=(3, 3),
        stop_words='english',
        use_mmr=True,
        top_n=3,
        diversity=0.5,
    )
    # each entry is indexable; element 0 is the phrase itself
    return ', '.join(entry[0] for entry in keywords)


# apply the function to the 'title' column and store the result in 'topics'
df['topics'] = df['title'].apply(extract_kw)
df.head()

Unnamed: 0,name,sinta_id,title,topics
0,LINDA SETIAWATI,5995101,development computer teaching materials educat...,"educational technology competences, management..."
1,A SOBANDI,5994552,knowledge management process knowledge sharing...,"literacy skills vocational, knowledge manageme..."
2,A. BUDHI SALIRA,6771601,readiness online learning covid19 pandemic voc...,"pandemic vocational education, readiness onlin..."
3,AAH AHMAD SYAHID,5993425,alsumud wa altakayyuf wa altathaquf tabir huwi...,"school teachers ict, neural network extreme, h..."
4,AAM ABDUSSALAM,5993560,decade value education model bibliometric stud...,"alquran literacy strategy, decade value educat..."


In [10]:
# Keep only the SINTA id, the cleaned titles and the extracted topics
df = df[['sinta_id', 'title', 'topics']]

In [12]:
# Preview the first rows of df
df.head()

Unnamed: 0,sinta_id,title,topics
0,5995101,development computer teaching materials educat...,"educational technology competences, management..."
1,5994552,knowledge management process knowledge sharing...,"literacy skills vocational, knowledge manageme..."
2,6771601,readiness online learning covid19 pandemic voc...,"pandemic vocational education, readiness onlin..."
3,5993425,alsumud wa altakayyuf wa altathaquf tabir huwi...,"school teachers ict, neural network extreme, h..."
4,5993560,decade value education model bibliometric stud...,"alquran literacy strategy, decade value educat..."


In [11]:
# export data to csv
# The file name embeds `now`, the run date formatted as day_month_year;
# index=False keeps the DataFrame index out of the file.
df.to_csv(f'1topic_result_{now}.csv', index=False)