In [10]:
import numpy as np
import pandas as pd
import modSpellChecker as sc
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import gensim
from gensim.models import Word2Vec
from gensim import models
from pattern.en import tag
from nltk.corpus import wordnet as wn
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from gensim import corpora, models
from normalization import normalize_corpus
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

In [11]:
dataset = pd.read_csv('dataartikel.csv')
dataset

Unnamed: 0,Judul,Artikel,Jenis
0,Rupiah masih rawan bergejolak,Nilai tukar rupiah terhadap dolar Amerika Seri...,ekonomi
1,SRC sasar ritel tradisional jadi pilar ekonomi...,Perkembangan teknologi telah menyebabkan perub...,ekonomi
2,Kiat memaksimalkan Facebook untuk bisnis,Media sosial kini bukan hanya menjadi alat unt...,ekonomi
3,Kiat menambah relasi bisnis,"Dalam menjalankan bisnis, menjalin relasi puny...",ekonomi
4,Prinsip dasar merintis bisnis,Merintis usaha memang bukan pekerjaan mudah. A...,ekonomi
5,10 Sektor seret pelemahan IHSG,Perdagangan di Bursa Efek Indonesia (BEI) ditu...,ekonomi
6,8 Sektor perkasa lambungkan IHSG,Indeks Harga Saham Gabungan (IHSG) terkoreksi ...,ekonomi
7,Untung rugi ratifikasi tujuh perjanjian dagang...,Proses ratifikasi tujuh perjanjian perdagangan...,ekonomi
8,Uang muka KPR nol persen untuk PNS pada tahun ...,Pemerintah mematangkan skema pembiayaan fasilt...,ekonomi
9,Cari untung lewat obligasi negara atau deposit...,Anak muda zaman sekarang boleh jadi tidak asin...,ekonomi


In [24]:
label = dataset.iloc[:,2]
feature = dataset.iloc[:,0]

In [25]:
#fungsi2 untuk normalisasi dataset
character = ['z','y','x','w','v','u','t','s','r','q','p','o','n','m','l','k','j','i','h','g','f','e','d','c','b','a',
             ',','.',';',':','-','...','?','!','(',')','[',']','{','}','<','>','"','/','\'','#','-','@']
def repeatcharNormalize(text):
    for i in range(len(character)):
        charac_long = 5
        while charac_long>=2:
            char=character[i]*charac_long
            text=text.replace(char,character[i])
            charac_long-=1
        return text

def spellNormalize(text):
    spellCheck = []
    for i in text:
        if i not in character:
            j=sc.correction(i)
            spellCheck.append(j)
        else:
            spellCheck.append(i)
        return spellCheck  
    
def tokenize_text(text):
    tokens=nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    return tokens

def expand_contractions(text, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                    flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                            if contraction_mapping.get(match)\
                            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'","", expanded_text)
    return expanded_text

def stemmer_text(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text

# def lemmatize_text(text):
#     pos_tagged_text = pos_tag_text(text)
#     lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
#                        else word for word,
#                        pos_tag in pos_tagged_text]
#     lemmatize_text=' '.join(lemmatized_tokens)
#     return lemmatize_text



In [26]:
def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('',token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

factory = StopWordRemoverFactory()
stopword_list = factory.get_stop_words()

def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = stemmer_text(text)
        text = remove_special_characters(text)
        text = repeatcharNormalize(text)
        text = remove_stopwords(text)
        
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            text = spellNormalize(text)
            normalized_corpus.append(text)
    return normalized_corpus

In [27]:
#Fungsi untuk mengekstraksi feature menggunakan TF-IDF Model

def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                  smooth_idf=True,
                                  use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

def tfidf_extractor(corpus, ngram_range=(1,1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                norm='l2',
                                smooth_idf=True,
                                use_idf=True,
                                ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [28]:
#training dataset
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                       test_size=0.33, random_state=42)
    return train_X, test_X, train_Y, test_Y

def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)
    return filtered_corpus, filtered_labels

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(feature,
                                                                       label,
                                                                       test_data_proportion=0.3)
#memanggil fungsi normalisasi dataset yang telah ditraining
norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)
''.strip()

''

In [29]:
norm_train_corpus

['pssi tahan luis milla',
 'lidi knkt kabar baru insiden lion air jt610',
 'fakta putar hiu raksasa megalodon',
 'sanksi berat persib sop tampak wujud',
 'tiga nyanyi bintang film baru ernest prakasa',
 'ac mil butuh waktu',
 'beda dana desa lurah',
 'kotak hitam temu satu korban makam',
 '10 sektor seret lemah ihsg',
 'mahal hasil imbang mil laju 7 tim mapan',
 '9 vendor ponsel bakal unjuk gigi ifa 2018',
 'hoaks tengah bencana polisi tangkap orang',
 'mk tolak gugat ambang batas caprescawapres',
 'tarif listrik tak naik subsidi makin tinggi',
 'ungkap kalimat paling seksi pasang',
 'alas apple buang 3d touch iphone datang',
 'orang rupawan kurang untung cinta',
 'keju tua temu makam mesir',
 'riset ungkap cara lebih sadar diri',
 '47 ilmuwan diaspora ikut simposium cendekia kelas dunia 2018',
 'parma 0 2 lazio liga serie a 2018 2019',
 'asal warna berlian biru hasil pecah',
 'kunto aji sorot isu sehat mental lewat album mantra mantra',
 'usaha masalah banyak alam',
 'ketua golkar jab

In [30]:
norm_test_corpus

['kini ai baca pribadi orang lalu gera mata',
 'dokumentasi karier coldplay a head full of dreams',
 'serah kaya sarita abdul mukti sumber kaya faisal harris sebut usaha kaya raya',
 'minimal risiko timbul libur asuransi jalan',
 'rock and roll dampak negatif ekosistem',
 'real madrid 1 2 levante la liga 2018 2019',
 'evakuasi harimau sumatera kian langka populasi',
 'isu senjata kimia arab saudi balik mati khashoggi',
 'kunci sukses tenang',
 'model bisnis 7eleven perlu jadi ajar',
 'orang indonesia butuh edukasi soal sampah plastik',
 'mahasiswa unair cipta alat pantau gagal jantung',
 'citra kirana coba genre horor lewat asih',
 'sulit kerja status lulus smk',
 'blackpink tampil singkat sentul',
 'joker baru versi joaquin phoenix ungkap',
 'internet ancam naik muka laut',
 'the negotiation jalan runding a lot',
 'komodo phinisi hingga masyarakat difabel upacara buka',
 'usaha ritel sayang kurang koordinasi perintah',
 'empoli 2 1 udinese liga serie a 2018 2019',
 'pulih ekonomi lomb

In [31]:
#mengunakan model TF-IDF untuk mengekstraksi feature
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

tokenized_train = [nltk.word_tokenize(text)
                  for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                 for text in norm_test_corpus]

model = gensim.models.Word2Vec(tokenized_train,
                              size=500,
                              window=100,
                              min_count=30,
                              sample=1e-3)

In [32]:
#kami mengevaluasi model menggunakan fungsi train_predict_evaluate_model
#kami mengukur nilai accuracy,precission, recall, dan F1 score apakan nilainya sudah sesuai atau tidak. Jika nilainya mendekati 1 maka proses klasifikasi berhasil


from sklearn.linear_model import SGDClassifier

train_features=tfidf_train_features
train_labels=train_labels
test_features=tfidf_test_features
test_labels=test_labels

clf = SGDClassifier(loss='hinge', n_iter=100)
clf.fit(train_features, train_labels)
predictions = clf.predict(test_features)
    
print("Label test: " +format(test_labels[:10]))
print("Prediction test: "+ format(predictions[:10]))
print("Accuration: "+format(clf.score(test_features,test_labels)))




Label test: 1148      sains-tekno
1049    entertainment
982     entertainment
808        gaya hidup
1195      sains-tekno
240             sport
1118      sains-tekno
596              news
924        gaya hidup
65            ekonomi
Name: Jenis, dtype: object
Prediction test: ['gaya hidup' 'gaya hidup' 'news' 'gaya hidup' 'news' 'sport' 'sains-tekno'
 'news' 'gaya hidup' 'gaya hidup']
Accuration: 0.7754629629629629


In [33]:
from sklearn.pipeline import make_pipeline

pipe= make_pipeline(tfidf_vectorizer,clf)
pipe.fit(train_corpus,train_labels)

print(pipe.score(test_corpus,test_labels))

tempFeature=normalize_corpus(feature)



0.777777777778


In [34]:
tempData=pd.DataFrame(np.column_stack([feature,label]),columns=('Feature','Label'))
tempData
tempRest = [pipe,tempData]
line = np.array(['dampak perceraian pada psikologis anak'])
pipe.predict(line)
joblibFile = "SGDClassifier-new3.pkl"
from sklearn.externals import joblib
joblib.dump(tempRest,joblibFile)

['SGDClassifier-new3.pkl']

In [35]:
def get_metrics(true_labels, predicted_labels):
    print('Accuracy: ', np.round(metrics.accuracy_score(true_labels,
                                                     predicted_labels),2))
    print('Precision: ', np.round(metrics.precision_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    print('Recall: ', np.round(metrics.recall_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    print('F1 Score: ', np.round(metrics.f1_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))

# mengevaluasi performa prediksi menggunakan model  
get_metrics(true_labels=test_labels,predicted_labels=predictions)

# def train_predict_evaluate_model(classifier,
#                             train_features, train_labels,
#                             test_features, test_labels):
#     #membuat model
#     classifier.fit(train_features, train_labels)

#     #melakukan prediksi menggunakan model
#     predictions = classifier.predict(test_features)
    
#     #mengevaluasi performa prediksi menggunakan model  
#     get_metrics(true_labels=test_labels,
#                predicted_labels=predictions)
#     return predictions

# from sklearn.linear_model import SGDClassifier
# from sklearn.pipeline import make_pipeline

# mnb = MultinomialNB()
# svm = SGDClassifier(loss='hinge', n_iter=100)
# pipeline = make_pipeline(tfidf_vectorizer,MultinomialNB())

# print("-------------------------------------------------------------")
# mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
#                                                   train_features=tfidf_train_features,
#                                                   train_labels=train_labels,
#                                                   test_features=tfidf_test_features,
#                                                   test_labels=test_labels)
# print("-------------------------------------------------------------")
# svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
#                                                   train_features=tfidf_train_features,
#                                                   train_labels=train_labels,
#                                                   test_features=tfidf_test_features,
#                                                   test_labels=test_labels)


Accuracy:  0.78
Precision:  0.79
Recall:  0.78
F1 Score:  0.78


In [None]:
tes = dataset.iloc[174,:]
tes