### Preprocessing

In [None]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l[K     |█▋                              | 10 kB 5.0 MB/s eta 0:00:01[K     |███▏                            | 20 kB 8.6 MB/s eta 0:00:01[K     |████▊                           | 30 kB 11.7 MB/s eta 0:00:01[K     |██████▎                         | 40 kB 14.5 MB/s eta 0:00:01[K     |███████▉                        | 51 kB 16.8 MB/s eta 0:00:01[K     |█████████▍                      | 61 kB 19.2 MB/s eta 0:00:01[K     |███████████                     | 71 kB 19.8 MB/s eta 0:00:01[K     |████████████▌                   | 81 kB 21.5 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 22.9 MB/s eta 0:00:01[K     |███████████████▋                | 102 kB 24.2 MB/s eta 0:00:01[K     |█████████████████▏              | 112 kB 24.2 MB/s eta 0:00:01[K     |██████████████████▊             | 122 kB 24.2 MB/s eta 0:00:01[K     |████████████████████▎           | 133 kB 24.2 MB/s e

In [None]:
import pandas as pd
import nltk
import re
import string

from string import punctuation
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm, naive_bayes
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score, multilabel_confusion_matrix, classification_report

In [None]:
# Stemmer instance that preprocessing() calls for every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# The stop-word corpus must be downloaded before the Indonesian list can be read.
nltk.download('stopwords')
stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Business articles are class 0.
bisnis_path = '/content/drive/MyDrive/DF NLP/bisnis.csv'
bisnis_df = pd.read_csv(bisnis_path)
bisnis_df['Label'] = 0
bisnis_df.shape

(40, 6)

In [None]:
# Lifestyle articles are class 1.
lifestyle_path = '/content/drive/MyDrive/DF NLP/lifestyle.csv'
lifestyle_df = pd.read_csv(lifestyle_path)
lifestyle_df['Label'] = 1
lifestyle_df.shape

(40, 6)

In [None]:
# Sport articles are class 2.
sport_path = '/content/drive/MyDrive/DF NLP/sport.csv'
sport_df = pd.read_csv(sport_path)
sport_df['Label'] = 2
sport_df.shape

(40, 6)

In [None]:
# Stack the three labelled category frames into one corpus with a fresh 0..n-1 index.
category_frames = [bisnis_df, lifestyle_df, sport_df]
df = pd.concat(category_frames, ignore_index=True)

In [None]:
# Drop the metadata columns that are not used for modelling. Rebinding (instead of
# inplace=True) together with errors='ignore' lets this cell be re-run without a
# KeyError once the columns are already gone.
df = df.drop(columns=['id', 'url', 'Unnamed: 0', 'title'], errors='ignore')

In [None]:
# Translation table that deletes every ASCII punctuation character; built once, not per call.
_PUNCT_TABLE = str.maketrans("", "", punctuation)


# preprocessing function
def preprocessing(sentence):
    """Clean one raw document and return it as a space-joined string of stemmed tokens.

    Steps, in order: lowercase; remove tokens starting with b-quote, URLs,
    literal backslash-x hex escapes and @mentions; split on whitespace;
    delete punctuation; keep alphanumeric tokens longer than 3 characters;
    stem each token; drop stop words.

    Relies on the notebook-level ``stemmer`` and ``stop_words`` objects.

    Args:
        sentence: Raw text. Anything that is not a ``str`` (for example a NaN
            coming from a missing CSV cell) is treated as an empty document.

    Returns:
        The cleaned text, or ``""`` when nothing survives the filters.
    """
    # Missing values would otherwise crash on .lower().
    if not isinstance(sentence, str):
        return ""

    # lowercasing
    sentence = sentence.lower()
    # NOTE(review): this removes the whole token that starts with b' (presumably
    # residue of a bytes repr) - confirm that dropping the attached word is intended.
    sentence = re.sub(r"b'\S+", "", sentence)
    # Links, both http:// and https://.
    sentence = re.sub(r"https?\S+", "", sentence)
    # Only literal \xNN escapes; a bare x\S+ would also cut ordinary words at
    # their first "x" (taxi -> ta, maximal -> ma).
    sentence = re.sub(r"\\x[0-9a-f]{2}", "", sentence)
    # @mentions
    sentence = re.sub(r"@\S+", "", sentence)

    # tokenization (split() without arguments already ignores surrounding whitespace)
    words = sentence.split()

    # remove punctuation / special characters
    words = [w.translate(_PUNCT_TABLE) for w in words]

    # keep only alphanumeric tokens longer than 3 characters
    words = [w for w in words if w.isalnum() and len(w) > 3]

    # stemming
    words = [stemmer.stem(w) for w in words]

    # remove stop words; a set makes each lookup O(1) instead of scanning the list per token
    stop_set = set(stop_words)
    words = [w for w in words if w not in stop_set]

    # rejoin the surviving tokens
    return " ".join(words)

In [None]:
# Replace every raw article with its cleaned, stemmed version.
df['content'] = list(map(preprocessing, df['content']))

In [None]:
# A fixed seed makes the split, and therefore the reported metrics, reproducible
# across kernel restarts. Stratifying keeps every label represented in the small
# test split, which the one-vs-rest ROC AUC computed later needs.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['Label'],
    test_size=0.2,
    random_state=SEED,
    stratify=df['Label'],
)

## Count vectorizer (bag-of-words features)

In [None]:
# Bag-of-words counts: the vocabulary is learned from the training split only
# and then reused unchanged to encode the test split.
cv = CountVectorizer(
    lowercase=True,
    stop_words=stop_words,
    token_pattern="[A-Za-z]+",
)
X_train_vect = cv.fit_transform(X_train)
X_test_vect = cv.transform(X_test)

  'stop_words.' % sorted(inconsistent))


### Naive Bayes classifier (NBC)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained;
# the bare name on the last line keeps the estimator repr as the cell output.
clf = MultinomialNB().fit(X_train_vect, y_train)
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Predicted class label for each row of the vectorized test split.
predicted2 = clf.predict(X_test_vect)

In [None]:
# Per-class precision / recall / F1 of the predictions against the test labels.
print(classification_report(y_test, predicted2))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89         9
           1       0.88      0.88      0.88         8
           2       1.00      1.00      1.00         7

    accuracy                           0.92        24
   macro avg       0.92      0.92      0.92        24
weighted avg       0.92      0.92      0.92        24



In [None]:
from sklearn.metrics import roc_auc_score

# Multi-class ROC AUC needs per-class probabilities rather than hard labels;
# 'ovr' scores each class against the rest.
prob = clf.predict_proba(X_test_vect)
score = roc_auc_score(y_true=y_test, y_score=prob, multi_class='ovr')
print(score)

0.9799768518518519


In [None]:
# Run the same cleaning/stemming that was applied to the training text before
# vectorizing: the vocabulary was built from preprocessed text, so raw inflected
# words may not match any learned feature.
d = ['Aku bermain bola basket']
tes = cv.transform([preprocessing(text) for text in d])
clf.predict(tes)

array([2])