In [1]:
import warnings
import itertools
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import re
import string
import matplotlib.pyplot as plt

from prefixspan import PrefixSpan


warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None
nltk.download('stopwords')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
listStopword =  set(stopwords.words('indonesian'))  

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jpawitro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
22/01/16 16:04:57 WARN Utils: Your hostname, NBJP resolves to a loopback address: 127.0.1.1; using 192.168.1.6 instead (on interface wlp2s0)
22/01/16 16:04:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/16 16:05:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/16 16:05:04 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
rmlist = ['bengkulu', 'and', 'of', 'on','in','based','the','to','indonesia','thailand','from', 'for','berbasis']

In [10]:
def read_data(x):
    XL = pd.ExcelFile("data/data_mod.xlsx")
    sheet = XL.sheet_names[x]
    data = pd.read_excel(XL, sheet)
    if sheet != "LPPM":
        data = data[~data['Topik'].isnull()]
    else:
        data = data[~data['Judul'].isnull()]
    data['Tahun'] = data['Tahun'].astype(int)
    data = data.reset_index(drop=True)
    data = data[data.columns.tolist()[:-3]].join(data[data.columns.tolist()[-2:]]).join(data[data.columns.tolist()[-3]])
    return data, sheet

def preprocessing(data, listtoberemoved):
    # df = pd.DataFrame(data['Judul'] + " " + data['Abstark'] + " " + data['Keyword'],columns=['Judul']).copy()
    # df = pd.DataFrame(data['Judul'] + " " + data['Keyword'],columns=['Judul']).copy()
    # df = pd.DataFrame(data['Judul'] + " " + data['Abstark'],columns=['Judul']).copy()
    df = pd.DataFrame(data['Judul'],columns=['Judul']).copy()
    cleaned = []
    for n in df['Judul'].values:
        n = n.lower()
        n = re.sub(r':', '', n)
        n = re.sub(r'‚Ä¶', '', n)
        n = re.sub(r'[^\x00-\x7F]+',' ', n)
        n = re.sub('[^a-zA-Z]', ' ', n)
        n = re.sub("&lt;/?.*?&gt;","&lt;&gt;",n)
        n = re.sub("(\\d|\\W)+"," ",n)
        n = re.sub(r'â', '', n)
        n = re.sub(r'€', '', n)
        n = re.sub(r'¦', '', n)
        cleaned.append(n)
    df['cleaned'] = cleaned

    tokenized = []
    for n in cleaned:
        n = word_tokenize(n)
        tokenized.append(n)
    df['tokenized'] = [', '.join(n) for n in tokenized]

    removed = []
    for ts in tokenized:
        n = []
        for t in ts:
            if t not in listtoberemoved and t not in listStopword and t not in string.punctuation:
                n.append(t)
        removed.append(n)
    df['removed'] = [', '.join(n) for n in removed]

    stemmed = []
    for n in removed:
        n = ' '.join(n)
        n = stemmer.stem(n)
        n = n.split(' ')
        stemmed.append(n)
    df['stemmed'] = [' '.join(n) for n in stemmed]
    return df, stemmed

def mining(data,stemmed, ms=2, mp=10, mnp=2):
    dx = [n for n in [a + b + c for a,b,c in zip(stemmed,data['Keyword'].str.split(",").values.tolist(),data['Topik'].str.split(",").values.tolist())]]
    ps = PrefixSpan(dx)
    pf_results = pd.DataFrame(ps.frequent(ms), columns=['freq','sequence'])
    
    pf_results['sequence'] = [', '.join(n) for n in pf_results['sequence'].values.tolist()]
    pf_results = pf_results[[len(n)<=mp for n in pf_results['sequence'].str.split(",").values.tolist()]]
    pf_results = pf_results[[len(n)>=mnp for n in pf_results['sequence'].str.split(",").values.tolist()]].sort_values(by='freq',ascending=False).reset_index(drop=True)

    return pf_results

def run(x,listtoberemoved, ms=None, mp=None, mnp=2):
    data, sheet= read_data(x)
    df, stemmed = preprocessing(data, listtoberemoved)
    pf= mining(data, stemmed, ms=ms, mp=mp, mnp=mnp)
    return sheet, pf,data, df


In [21]:

sheet, results, data, df = run(6,rmlist)


print(sheet)
results

0.05 10 3


22/01/16 16:07:59 WARN PrefixSpan: Input data is not cached.


Sistem Informasi


Unnamed: 0,sequence,freq
0,"E-Learning, learning, literacy",3
1,"E-Learning, scientific, literacy",3
2,"E-Learning, learning, scientific",3
3,"E-Learning, learning, scientific, literacy",3
4,"learning, scientific, literacy",3
...,...,...
130,"E-Learning, school, scientific, literacy",2
131,"E-Learning, school, scientific",2
132,"E-Learning, school, literacy",2
133,"E-Learning, sains, literasi",2


In [22]:
topik = list(set(itertools.chain.from_iterable(data['Topik'].str.split(', ').values.tolist())))
topik

['Sistem Informasi',
 'Image Processing',
 'Mobile Development',
 'Cybersecurity',
 'Pendidikan',
 'Sistem Pendukung Keputusan',
 'Animation',
 'E-Commerce',
 'Technopreneur',
 'GIS',
 'Machine Learning',
 'E-Learning',
 'Recommendation System',
 'Augmented Relaity',
 'Data Mining']

In [23]:
from collections import Counter

rekomendasi = dict(Counter([n for n in ', '.join(results['sequence'].values.tolist()).split(', ') if n in topik])).keys()
', '.join(rekomendasi)

'E-Learning, Animation, Recommendation System, Data Mining, Pendidikan'

In [25]:
results[[list(rekomendasi)[0] in n for n in results['sequence']]]

Unnamed: 0,sequence,freq
0,"E-Learning, learning, literacy",3
1,"E-Learning, scientific, literacy",3
2,"E-Learning, learning, scientific",3
3,"E-Learning, learning, scientific, literacy",3
73,"E-Learning, learning, students, scientific, s...",2
74,"E-Learning, learning, students, scientific",2
75,"E-Learning, learning, students, literacy, sci...",2
76,"E-Learning, learning, students, literacy",2
77,"E-Learning, learning, students, scientific li...",2
78,"E-Learning, learning, students",2


In [None]:
results.to_csv("prefix_span.csv", index=False)