In [1]:
import warnings
import itertools
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re
import string
import matplotlib.pyplot as plt
import os

from prefixspan import PrefixSpan


warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None
nltk.download('stopwords')
nltk.download('punkt')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
listStopword =  set(stopwords.words('indonesian'))  

[nltk_data] Downloading package stopwords to punkt...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
rmlist = ['bengkulu', 'and', 'of', 'on','in','based','the','to','indonesia','thailand','from', 'for','berbasis']

In [14]:
def read_data(x):
    XL = pd.ExcelFile(os.path.join("data","data_mod.xlsx"))
    sheet = XL.sheet_names[x]
    data = pd.read_excel(XL, sheet)
    if sheet != "LPPM":
        data = data[~data['Topik'].isnull()]
    else:
        data = data[~data['Judul'].isnull()]
    data['Tahun'] = data['Tahun'].astype(int)
    data = data.reset_index(drop=True)
    data = data[data.columns.tolist()[:-3]].join(data[data.columns.tolist()[-2:]]).join(data[data.columns.tolist()[-3]])
    return data, sheet

def preprocessing(data, listtoberemoved):
    # df = pd.DataFrame(data['Judul'] + " " + data['Abstark'] + " " + data['Keyword'],columns=['Judul']).copy()
    # df = pd.DataFrame(data['Judul'] + " " + data['Keyword'],columns=['Judul']).copy()
    # df = pd.DataFrame(data['Judul'] + " " + data['Abstark'],columns=['Judul']).copy()
    df = pd.DataFrame(data['Judul'],columns=['Judul']).copy()
    cleaned = []
    for n in df['Judul'].values:
        n = n.lower()
        n = re.sub(r':', '', n)
        n = re.sub(r'‚Ä¶', '', n)
        n = re.sub(r'[^\x00-\x7F]+',' ', n)
        n = re.sub('[^a-zA-Z]', ' ', n)
        n = re.sub("&lt;/?.*?&gt;","&lt;&gt;",n)
        n = re.sub("(\\d|\\W)+"," ",n)
        n = re.sub(r'â', '', n)
        n = re.sub(r'€', '', n)
        n = re.sub(r'¦', '', n)
        cleaned.append(n)
    df['cleaned'] = cleaned

    tokenized = []
    for n in cleaned:
        n = word_tokenize(n)
        tokenized.append(n)
    df['tokenized'] = [', '.join(n) for n in tokenized]

    removed = []
    for ts in tokenized:
        n = []
        for t in ts:
            if t not in listtoberemoved and t not in listStopword and t not in string.punctuation:
                n.append(t)
        removed.append(n)
    df['removed'] = [', '.join(n) for n in removed]

    stemmed = []
    for n in removed:
        n = ' '.join(n)
        n = stemmer.stem(n)
        n = n.split(' ')
        stemmed.append(n)
    df['stemmed'] = [' '.join(n) for n in stemmed]
    return df, stemmed

def mining(data,stemmed, ms=3, mp=10, mnp=2):
    dx = [n for n in [a + b + c for a,b,c in zip(stemmed,data['Keyword'].str.split(",").values.tolist(),data['Topik'].str.split(",").values.tolist())]]
    print(1)
    ps = PrefixSpan(dx)
    print(2)
    pf_results = pd.DataFrame(ps.frequent(ms), columns=['freq','sequence'])
    print(3)
    pf_results['sequence'] = [', '.join(n) for n in pf_results['sequence'].values.tolist()]
    pf_results = pf_results[[len(n)<=mp for n in pf_results['sequence'].str.split(",").values.tolist()]]
    pf_results = pf_results[[len(n)>=mnp for n in pf_results['sequence'].str.split(",").values.tolist()]].sort_values(by='freq',ascending=False).reset_index(drop=True)

    return pf_results

def run(x,listtoberemoved, ms=3, mp=10, mnp=2):
    data, sheet= read_data(x)
    df, stemmed = preprocessing(data, listtoberemoved)
    pf= mining(data, stemmed, ms=ms, mp=mp, mnp=mnp)
    return sheet, pf,data, df


In [15]:

sheet, results, data, df = run(6,rmlist)


print(sheet)
results

1
2
3
Sistem Informasi


Unnamed: 0,freq,sequence
0,4,"learning, E-Learning"
1,3,"scientific, E-Learning"
2,3,"literacy, E-Learning"
3,3,"scientific, literacy, E-Learning"
4,3,"scientific, literacy"
...,...,...
172,2,"mobile, learning, application, school"
173,2,"mobile, learning, application, school, elemen..."
174,2,"mobile, learning, application, elementary school"
175,2,"mobile, learning, elementary"


In [None]:
topik = list(set(itertools.chain.from_iterable(data['Topik'].str.split(', ').values.tolist())))
topik

['Sistem Pendukung Keputusan',
 'Machine Learning',
 'Augmented Relaity',
 'Mobile Development',
 'Image Processing',
 'Cybersecurity',
 'Animation',
 'Recommendation System',
 'Data Mining',
 'Sistem Informasi',
 'GIS',
 'E-Learning',
 'E-Commerce',
 'Pendidikan',
 'Technopreneur']

In [None]:
from collections import Counter

rekomendasi = dict(Counter([n for n in ', '.join(results['sequence'].values.tolist()).split(', ') if n in topik])).keys()
', '.join(rekomendasi)

'E-Learning, Animation, Pendidikan, Sistem Pendukung Keputusan, Data Mining, GIS, Recommendation System, Image Processing'

In [None]:
results[[list(rekomendasi)[0] in n for n in results['sequence']]]

Unnamed: 0,freq,sequence
0,4,"learning, E-Learning"
1,3,"scientific, E-Learning"
2,3,"literacy, E-Learning"
3,3,"scientific, literacy, E-Learning"
6,3,"ict, E-Learning"
7,2,"scientific, literacy, students, E-Learning"
9,2,"scientific, literacy, scientific literacy, E-..."
12,2,"scientific, elementary, school, E-Learning"
14,2,"scientific, elementary, E-Learning"
16,2,"scientific, school, E-Learning"


In [None]:
results.to_csv("prefix_span.csv", index=False)

In [None]:
df

Unnamed: 0,Judul,cleaned,tokenized,removed,stemmed
0,pengenalan steganografi untuk pengamanan infor...,pengenalan steganografi untuk pengamanan infor...,"pengenalan, steganografi, untuk, pengamanan, i...","pengenalan, steganografi, pengamanan, informas...",kenal steganografi aman informasi siswa siswi ...
1,the analysis of rank fusion techniques to impr...,the analysis of rank fusion techniques to impr...,"the, analysis, of, rank, fusion, techniques, t...","analysis, rank, fusion, techniques, improve, q...",analysis rank fusion techniques improve query ...
2,improving performance of relation extraction a...,improving performance of relation extraction a...,"improving, performance, of, relation, extracti...","improving, performance, relation, extraction, ...",improving performance relation extraction algo...
3,pemanfaatan citra penginderaan jauh dan sistem...,pemanfaatan citra penginderaan jauh dan sistem...,"pemanfaatan, citra, penginderaan, jauh, dan, s...","pemanfaatan, citra, penginderaan, sistem, info...",manfaat citra penginderaan sistem informasi ge...
4,social influences in recommendation systems,social influences in recommendation systems,"social, influences, in, recommendation, systems","social, influences, recommendation, systems",social influences recommendation systems
5,cselene: privacy preserving query retrieval sy...,cselene privacy preserving query retrieval sys...,"cselene, privacy, preserving, query, retrieval...","cselene, privacy, preserving, query, retrieval...",cselene privacy preserving query retrieval sys...
6,highly relevant routing recommendation systems...,highly relevant routing recommendation systems...,"highly, relevant, routing, recommendation, sys...","highly, relevant, routing, recommendation, sys...",highly relevant routing recommendation systems...
7,prototipe interoperabilitas learning technolog...,prototipe interoperabilitas learning technolog...,"prototipe, interoperabilitas, learning, techno...","prototipe, interoperabilitas, learning, techno...",prototipe interoperabilitas learning technolog...
8,kompresi citra batik besurek menggunakan discr...,kompresi citra batik besurek menggunakan discr...,"kompresi, citra, batik, besurek, menggunakan, ...","kompresi, citra, batik, besurek, discrete, wav...",kompresi citra batik besurek discrete wavelet ...
9,disaster risk reduction for earthquake using m...,disaster risk reduction for earthquake using m...,"disaster, risk, reduction, for, earthquake, us...","disaster, risk, reduction, earthquake, using, ...",disaster risk reduction earthquake using mobil...


In [None]:
ms = 3
mp = 10
mnp = 2

data_ms, sheet_ms = read_data(3)
df_ms, stemmed_ms = preprocessing(data, rmlist)
mining(data_ms, stemmed_ms, ms=ms, mp=mp, mnp=mnp)

1
2


KeyboardInterrupt: 

In [None]:
dx_ms = [n for n in [a + b + c for a,b,c in zip(stemmed_ms,data_ms['Keyword'].str.split(",").values.tolist(),data_ms['Topik'].str.split(",").values.tolist())]]

In [None]:
ps_ms = PrefixSpan(dx_ms)

In [None]:
ps_ms.frequent(2)

[(4, ['area']),
 (2, ['area', 'earthquake']),
 (4, [' shrink fitting']),
 (4, [' shrink fitting', ' finite element method']),
 (4, [' shrink fitting', ' finite element method', 'Material Durability']),
 (4, [' shrink fitting', 'Material Durability']),
 (4, [' finite element method']),
 (4, [' finite element method', 'Material Durability']),
 (16, ['Material Durability']),
 (2, ['daerah']),
 (2, ['ceramic roller']),
 (2, ['ceramic roller', ' shrink fitting']),
 (2, ['ceramic roller', ' shrink fitting', ' finite element method']),
 (2,
  ['ceramic roller',
   ' shrink fitting',
   ' finite element method',
   'Material Durability']),
 (2, ['ceramic roller', ' shrink fitting', 'Material Durability']),
 (2, ['ceramic roller', ' finite element method']),
 (2, ['ceramic roller', ' finite element method', 'Material Durability']),
 (2, ['ceramic roller', 'Material Durability']),
 (7, ['pengaruh']),
 (4, ['pengaruh', 'beton']),
 (2, ['pengaruh', 'beton', 'kuat']),
 (2, ['pengaruh', 'beton', 'ku

In [None]:
ms = 2
mp = 10
mnp = 2

pf_results_ms = pd.DataFrame(ps_ms.frequent(ms), columns=['freq','sequence'])
print("first")
pf_results_ms['sequence'] = [', '.join(n) for n in pf_results_ms['sequence'].values.tolist()]
print("second")
pf_results_ms = pf_results_ms[[len(n)<=mp for n in pf_results_ms['sequence'].str.split(",").values.tolist()]]
print("third")
pf_results_ms = pf_results_ms[[len(n)>=mnp for n in pf_results_ms['sequence'].str.split(",").values.tolist()]].sort_values(by='freq',ascending=False).reset_index(drop=True)

first
second
third


In [None]:
pf_results_ms

Unnamed: 0,freq,sequence
0,7,"liquefaction, potential"
1,5,"pengaruh, kuat"
2,5,"ground, response"
3,4,"shrink fitting, Material Durability"
4,4,"kuat, tekan"
...,...,...
2450,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."
2451,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."
2452,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."
2453,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."


In [None]:
pf= mining(data_ms, stemmed_ms, ms=ms, mp=mp, mnp=mnp)

In [None]:
pf

Unnamed: 0,freq,sequence
0,7,"liquefaction, potential"
1,5,"pengaruh, kuat"
2,5,"ground, response"
3,4,"shrink fitting, Material Durability"
4,4,"kuat, tekan"
...,...,...
2450,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."
2451,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."
2452,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."
2453,2,"pengaruh, elemen, struktur, bangungan, pasar, ..."


In [None]:
run(3,rmlist)

1
2


KeyboardInterrupt: 