In [7]:
import random
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import csv
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score
from joblib import dump, load
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#Import Classifier Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

class Preprocessor():
    """Clean one piece of text before analysis.

    Constructing an instance runs the whole pipeline in order: case folding,
    filtering (URLs, hashtags, mentions, non-letters, extra whitespace),
    tokenizing, replacing words through ``standard_word.csv``, and stemming.
    ``self.text`` is a ``str`` up to and including filtering and a list of
    tokens from tokenizing onwards.
    """

    # Patterns whose matches are deleted, applied in exactly this order.
    _REMOVE_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r"https\S+",      # URL
        r"http\S+",       # URL
        r"\S+\.com\S+",   # URL
        r"\S+\.com",      # URL
        r"#\S+",          # hashtag
        r"@\S+",          # mention
    ))
    _NON_LETTER = re.compile(r"[^A-Za-z\s]")
    _WHITESPACE = re.compile(r"\s+")

    # Shared across instances so the CSV is read and the stemmer is built
    # only once per process instead of once per processed text.
    _standard_words = None
    _stemmer = None

    def __init__(self,text):
        """Run the full preprocessing pipeline on ``text``.

        Args:
            text: the raw string to clean; it must support ``.lower()``.
        """
        self.text=text
        self._casefolding()
        self._filtering()
        self._tokenize()
        self._standarize()
        self._stemming()

    def get_text(self):
        """Return the processed tokens joined by single spaces."""
        return " ".join(self.text)

    def _casefolding(self):
        """Convert the text to lower case."""
        self.text=self.text.lower()

    def _filtering(self):
        """Strip URLs, hashtags, mentions, symbols, digits and extra spaces."""
        cleaned=self.text
        for pattern in self._REMOVE_PATTERNS:
            cleaned=pattern.sub("",cleaned)

        # Symbols and digits become spaces so neighbouring words stay apart.
        cleaned=self._NON_LETTER.sub(" ",cleaned)

        # Collapse whitespace runs and trim both ends.
        self.text=self._WHITESPACE.sub(" ",cleaned).strip()

    def _tokenize(self):
        """Split the text into a list of word tokens."""
        self.text=word_tokenize(self.text)

    @classmethod
    def _load_standard_words(cls):
        """Return the word mapping read from ``standard_word.csv``.

        The first row is skipped as a header; the first column of each
        remaining row is the key and the second column is its replacement.
        Rows with fewer than two columns (e.g. blank lines) are ignored.
        The file is read on first use and the result is cached on the class.
        """
        if cls._standard_words is None:
            mapping={}
            with open("standard_word.csv","r",newline="") as file:
                rows=csv.reader(file,delimiter=",")
                next(rows,None)  # header row
                for row in rows:
                    if len(row)<2:
                        continue
                    mapping[row[0]]=row[1]
            cls._standard_words=mapping
        return cls._standard_words

    def _standarize(self):
        """Replace every token that has an entry in the standard-word map."""
        mapping=self._load_standard_words()
        self.text=[mapping.get(token,token) for token in self.text]

    @classmethod
    def _get_stemmer(cls):
        """Return the shared stemmer, creating it on first use."""
        if cls._stemmer is None:
            cls._stemmer=StemmerFactory().create_stemmer()
        return cls._stemmer

    def _stemming(self):
        """Reduce every token to its base form with the stemmer."""
        stemmer=self._get_stemmer()
        self.text=[stemmer.stem(token) for token in self.text]
    
class Analyzer():
    """Build, evaluate, save and apply classifier models.

    Every method takes a DataFrame whose LAST column is treated as the target
    and whose remaining columns are treated as the features.
    """

    def __init__(self):
        """Create an analyzer; it holds no state."""
        pass

    @staticmethod
    def _split_features_target(data):
        """Return ``(features, target)``: all columns but the last, and the last."""
        # Positional slicing keeps working when column names are duplicated.
        return data.iloc[:,:-1],data.iloc[:,-1]

    @staticmethod
    def _candidate_models():
        """Return fresh, unfitted instances of every classifier to compare."""
        return [
            KNeighborsClassifier(),
            SVC(),
            GaussianNB(),
            MultinomialNB(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            GradientBoostingClassifier(),
        ]

    @staticmethod
    def _save_model(model):
        """Dump ``model`` under ``models/`` and return the path that was written.

        The file name is the model's class name plus the current timestamp
        with the colons removed.
        """
        import os  # local import: only needed to create the output directory

        os.makedirs("models",exist_ok=True)
        path="models/"+model.__class__.__name__+" "+str(datetime.now()).replace(":","")+".joblib"
        dump(model,path)
        return path

    def predict(self,model,data:pd.DataFrame):
        """Return ``model.predict`` on every column of ``data`` except the last.

        Note that the last column is always dropped, so ``data`` must carry a
        trailing target (or placeholder) column.
        """
        features,_=self._split_features_target(data)
        return model.predict(features)

    def train_model(self,model,data:pd.DataFrame,min_accuracy:float=0.9,test_size:float=0.2):
        """Fit ``model`` on a random split of ``data`` and save it if good enough.

        Args:
            model: estimator exposing ``fit`` and ``predict``.
            data: features followed by the target in the last column.
            min_accuracy: the model is saved only when the held-out accuracy
                is strictly greater than this value.
            test_size: fraction of ``data`` held out for evaluation.
        """
        X,y=self._split_features_target(data)
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_size)

        model.fit(X_train,y_train)
        accuracy=accuracy_score(y_test,model.predict(X_test))
        if accuracy>min_accuracy:
            # Save first so the message is only printed once the file exists.
            self._save_model(model)
            print("Model has beed trained with new data and saved using "+model.__class__.__name__+" model. Accuracy Score: ",accuracy)
        else:
            print("Data is not good enough. Model is not saved. Accuracy score: ",accuracy)

    def create_model(self,data:pd.DataFrame,test_size:float=0.2):
        """Fit every candidate classifier and save the most accurate one.

        All candidates are trained and scored on the same random split. Ties
        are resolved in favour of the earlier candidate.

        Args:
            data: features followed by the target in the last column.
            test_size: fraction of ``data`` held out for evaluation.
        """
        X,y=self._split_features_target(data)
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_size)

        best_model=None
        max_accuracy=0
        for candidate in self._candidate_models():
            candidate.fit(X_train,y_train)
            prediction=candidate.predict(X_test)
            accuracy=accuracy_score(y_test,prediction)

            # The first candidate is always taken so that a best model exists
            # even when every candidate scores 0.
            if best_model is None or accuracy>max_accuracy:
                max_accuracy=accuracy
                best_model=candidate
            print(candidate,prediction)
            print("Accuracy Score:",accuracy,"\n")

        print("Model saved using "+best_model.__class__.__name__+" model. Accuracy: "+str(max_accuracy))
        self._save_model(best_model)

def check_tweet(order):
    """Return a one-line report with the preprocessed tweet stored at ``order``.

    The tweet is read from the "Tweet" column of the module-level ``data``.
    """
    cleaned=Preprocessor(data["Tweet"][order]).get_text()
    return f"Tweet order: {order!s} Tweet: {cleaned}"

def check_random_tweet():
    """Return a one-line report with a randomly chosen preprocessed tweet.

    The tweet is drawn from the "Tweet" column of the module-level ``data``
    (assumed to be a pandas DataFrame — confirm against the cell that
    defines it). The row is picked by position, so this also works when the
    index is not 0..n-1, e.g. after filtering rows. The reported "order" is
    that row's index label.
    """
    tweets=data["Tweet"]
    position=random.randint(0,len(tweets)-1)
    order=tweets.index[position]
    cleaned=Preprocessor(tweets.iloc[position]).get_text()
    return "Tweet order: "+str(order)+" Tweet: "+cleaned

# --- Load a saved model (disabled) ---
# model=load('models/DecisionTreeClassifier 2023-02-28 005926.237284.joblib')

# --- Load the data sets ---
full = pd.read_csv("data/clean.csv")  # full data
labeled = pd.read_csv("data/labeled.csv")  # labeled data

# --- Preprocessing (disabled) ---
# NOTE(review): `data` is not defined anywhere in the visible code.
# data["Clean"]=[Preprocessor(i).get_text() for i in data["Tweet"]]

# --- TF-IDF calculation (disabled) ---
# vect=TfidfVectorizer()
# X=vect.fit_transform(data["Clean"])
# df=pd.DataFrame(X.toarray())

# --- Predict (disabled) ---
# ana=Analyzer()
# ana.predict(model,df)
# # data["Clean"]

# check_random_tweet()

# Keep only the rows that have a label, then display them as the cell output.
has_label = labeled["Label"].notna()
labeled = labeled.loc[has_label]
labeled

Unnamed: 0,Id,Search Keyword,URL,Datetime,Tweet,Username,View Count,Reply Count,Retweet Count,Like Count,...,Mentioned Users Count,Mentioned Users,User Verified,User Followers Count,User Statuses Count,Year,Month,Day,Kota,Label
567,1553396995456368643,PPN Naik,https://twitter.com/AkuTerpesona8/status/15533...,2022-07-30 15:07:59,@Viko_0000000 @abu_waras PPN naik cuma 1% ribu...,AkuTerpesona8,0,1,0,1,...,2,"['Viko_0000000', 'abu_waras']",False,62,7695,2022,7,30,,2.0
823,1537221617780424704,PPN Naik,https://twitter.com/txtqiqi/status/15372216177...,2022-06-15 23:52:48,"Halga bahan pokok selba mahal, BBM naik, PPN n...",txtqiqi,0,2,7,11,...,0,,False,2990,2057,2022,6,15,,1.0
834,1536734928460623872,PPN Naik,https://twitter.com/erna_st/status/15367349284...,2022-06-14 15:38:52,Belanja Online Bakal Kena Bea Meterai Rp10 Rib...,erna_st,0,0,7,8,...,0,,False,12912,139015,2022,6,14,,1.0
996,1527425821346091009,PPN Naik,https://twitter.com/jumialfonso/status/1527425...,2022-05-19 23:07:48,@CNNIndonesia Saya yg kerja sbgai karyawan apa...,jumialfonso,0,0,0,5,...,1,['CNNIndonesia'],False,9,1295,2022,5,19,,1.0
1041,1526180328640983040,PPN Naik,https://twitter.com/mtriwinar/status/152618032...,2022-05-16 12:38:40,"@BossTemlen Mantap, apa kabar pajak, ppn, naik...",mtriwinar,0,0,0,0,...,1,['BossTemlen'],False,307,11112,2022,5,16,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28463,1391241523522592775,Kenaikan PPN,https://twitter.com/LokadataID/status/13912415...,2021-05-09 04:00:01,Menteri Keuangan Sri Mulyani Indrawati sebelum...,LokadataID,0,0,1,1,...,0,,False,501082,549614,2021,5,9,,0.0
28467,1390958339077537794,Kenaikan PPN,https://twitter.com/edy_siregar_77/status/1390...,2021-05-08 09:14:45,Menteri Koordinator Bidang Perekonomian Airlan...,edy_siregar_77,0,0,0,1,...,0,,False,4089,73350,2021,5,8,,0.0
28498,1390554953144430592,Kenaikan PPN,https://twitter.com/LaNyallaAcademy/status/139...,2021-05-07 06:31:50,"Kami memahami jika peningkatan tarif PPN, dari...",LaNyallaAcademy,0,1,1,1,...,0,,False,10682,171810,2021,5,7,,2.0
28622,1272900694157275138,Kenaikan PPN,https://twitter.com/IndiHome/status/1272900694...,2020-06-16 14:35:49,"@fitra_dely Benar Kak Fitra, nantinya tagihan ...",IndiHome,0,0,0,0,...,1,['fitra_dely'],True,256673,931660,2020,6,16,,0.0
