In [97]:
import random
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import csv
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score
from joblib import dump, load
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#Import Classifier Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

class Preprocessor():
    """Clean a raw text (e.g. a tweet) before it is analysed.

    Constructing an instance runs the whole pipeline, in this order:
    case folding, filtering, tokenizing, standardizing and stemming.
    Afterwards ``self.text`` is a list of tokens; use ``get_text()`` to
    obtain them as one space-separated string.

    The standard-word table and the stemmer are loaded once and shared by
    all instances, because building them for every text is expensive and
    they do not depend on the text being processed.
    """

    # (pattern, replacement) pairs applied in this exact order by
    # ``_filtering``. Raw strings keep the backslashes literal without
    # relying on Python tolerating unknown escape sequences.
    _FILTERS = (
        # URLs
        (re.compile(r"https\S+"), ""),
        (re.compile(r"http\S+"), ""),
        (re.compile(r"\S+\.com\S+"), ""),
        (re.compile(r"\S+\.com"), ""),
        # Hashtags
        (re.compile(r"#\S+"), ""),
        # Mentions
        (re.compile(r"@\S+"), ""),
        # Symbols and digits become a space so neighbouring words stay apart
        (re.compile(r"[^A-Za-z\s]"), " "),
        # Collapse whitespace runs, then drop the single leading space left
        (re.compile(r"\s+"), " "),
        (re.compile(r"^\s"), ""),
    )

    # Lazily filled, class-wide caches (see ``_load_standard_words`` and
    # ``_get_stemmer``).
    _standard_words = None
    _stemmer = None

    def __init__(self, text):
        """Run the full preprocessing pipeline on ``text``.

        Args:
            text: The raw string to clean.
        """
        self.text = text
        self._casefolding()
        self._filtering()
        self._tokenize()
        self._standarize()
        self._stemming()

    def get_text(self):
        """Return the processed tokens joined by single spaces."""
        return " ".join(self.text)

    def _casefolding(self):
        """Convert the text to lower case."""
        self.text = self.text.lower()

    def _filtering(self):
        """Remove URLs, hashtags, mentions, symbols, digits and extra spaces."""
        for pattern, replacement in self._FILTERS:
            self.text = pattern.sub(replacement, self.text)

    def _tokenize(self):
        """Split the text into a list of word tokens with NLTK."""
        self.text = word_tokenize(self.text)

    @classmethod
    def _load_standard_words(cls):
        """Return the non-standard -> standard word mapping, loading it once.

        The mapping is read from ``standard_word.csv`` in the current
        working directory: the first row is skipped as a header, column 0
        is the word to replace and column 1 its replacement. Rows with
        fewer than two columns (for example blank lines) are ignored.
        """
        if cls._standard_words is None:
            mapping = {}
            # newline="" is what the csv module expects from a file object.
            with open("standard_word.csv", "r", newline="") as file:
                reader = csv.reader(file, delimiter=",")
                next(reader, None)  # skip the header row
                for row in reader:
                    if len(row) >= 2:
                        mapping[row[0]] = row[1]
            cls._standard_words = mapping
        return cls._standard_words

    def _standarize(self):
        """Replace every token found in the standard-word table."""
        mapping = self._load_standard_words()
        self.text = [mapping.get(token, token) for token in self.text]

    @classmethod
    def _get_stemmer(cls):
        """Return the shared Sastrawi stemmer, creating it on first use."""
        if cls._stemmer is None:
            cls._stemmer = StemmerFactory().create_stemmer()
        return cls._stemmer

    def _stemming(self):
        """Reduce every token to its base form with the Sastrawi stemmer."""
        stemmer = self._get_stemmer()
        self.text = [stemmer.stem(token) for token in self.text]
    
class Analyzer():
    """Train, select, persist and apply classifiers.

    Every method receives a DataFrame whose LAST column is treated as the
    target; all preceding columns are the features. Saved models are
    written with joblib to
    ``models/<ClassName> <timestamp without colons>.joblib``.
    """

    def __init__(self):
        """Create an analyzer; it keeps no state."""
        pass

    @staticmethod
    def _split_features_target(data):
        """Return ``(X, y)``: all columns but the last, and the last column.

        The split is positional, so it stays correct even when another
        column shares the last column's name.
        """
        return data.iloc[:, :-1], data.iloc[:, -1]

    @staticmethod
    def _candidate_models():
        """Return fresh, default-configured classifiers to compare."""
        return [
            KNeighborsClassifier(),
            SVC(),
            GaussianNB(),
            MultinomialNB(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            GradientBoostingClassifier(),
        ]

    @staticmethod
    def _save_model(model):
        """Dump ``model`` into the ``models`` directory, creating it if needed."""
        import os

        # Without this a missing directory would discard a finished training.
        os.makedirs("models", exist_ok=True)
        timestamp = str(datetime.now()).replace(":", "")
        dump(model, "models/" + model.__class__.__name__ + " " + timestamp + ".joblib")

    def predict(self, model, data: pd.DataFrame):
        """Return ``model.predict`` on every column of ``data`` but the last.

        Args:
            model: A fitted estimator exposing ``predict``.
            data: Frame laid out like the training data; its last column is
                always dropped before predicting.
        """
        X, _ = self._split_features_target(data)
        return model.predict(X)

    def train_model(self, model, data: pd.DataFrame, min_accuracy: float = 0.9):
        """Fit ``model`` on an 80/20 split and save it if it is accurate enough.

        Args:
            model: Estimator exposing ``fit`` and ``predict``; fitted in place.
            data: Training frame (features followed by the target column).
            min_accuracy: The model is saved only when its accuracy on the
                held-out 20% is strictly greater than this value.

        Returns:
            The accuracy measured on the held-out split.
        """
        X, y = self._split_features_target(data)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))

        if accuracy > min_accuracy:
            print("Model has been trained with new data and saved using "+model.__class__.__name__+" model. Accuracy Score: ", accuracy)
            self._save_model(model)
        else:
            print("Data is not good enough. Model is not saved. Accuracy score: ", accuracy)
        return accuracy

    def create_model(self, data: pd.DataFrame):
        """Fit every candidate classifier and save the most accurate one.

        All candidates are trained and scored on the same random 80/20
        split. On a tie the earlier candidate wins.

        Args:
            data: Training frame (features followed by the target column).

        Returns:
            The fitted classifier that was saved.
        """
        X, y = self._split_features_target(data)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        best_model = None
        # Start below any possible score so a model is always selected,
        # even when every candidate scores 0.0.
        best_accuracy = -1.0
        for candidate in self._candidate_models():
            candidate.fit(X_train, y_train)
            prediction = candidate.predict(X_test)
            accuracy = accuracy_score(y_test, prediction)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = candidate
            print(candidate, prediction)
            print("Accuracy Score:", accuracy, "\n")

        print("Model saved using "+best_model.__class__.__name__+" model. Accuracy: "+str(best_accuracy))
        self._save_model(best_model)
        return best_model

def check_tweet(order):
    """Return a one-line report of the preprocessed tweet at ``order``.

    The tweet is looked up as ``data["Tweet"][order]``; ``data`` is a
    module-level name that must be defined before this function is called.
    """
    cleaned = Preprocessor(data["Tweet"][order]).get_text()
    return f"Tweet order: {order!s} Tweet: {cleaned}"

def check_random_tweet():
    """Return the ``check_tweet`` report for one randomly chosen tweet.

    The position is drawn uniformly from ``0`` to ``len(data["Tweet"]) - 1``
    inclusive; ``data`` is a module-level name that must already exist.
    """
    last_position = len(data["Tweet"]) - 1
    return check_tweet(random.randint(0, last_position))

# The commented-out lines in this cell are kept as a usage example of the
# full flow: load a saved model, preprocess, vectorize with TF-IDF, predict.

# #Load Model
# model=load('models/DecisionTreeClassifier 2023-02-28 005926.237284.joblib')

# #Load Data
# Both CSV paths are relative to the current working directory.
full=pd.read_csv('data/clean.csv') #Full Data
labeled=pd.read_csv('data/labeled.csv') #Labeled Data

# NOTE(review): the example below, check_tweet() and check_random_tweet() all
# read a global named `data`, but this cell only assigns `full` and `labeled`.
# Presumably `data` is assigned elsewhere in the notebook - confirm before use.
# #Preprocessing
# data["Clean"]=[Preprocessor(i).get_text() for i in data["Tweet"]]

# #TFIDF Calculation
# vect=TfidfVectorizer()
# X=vect.fit_transform(data["Clean"])
# df=pd.DataFrame(X.toarray())

# NOTE(review): Analyzer.predict drops the last column of the frame it is
# given, while `df` above holds only TF-IDF features - verify that is intended.
# #Predict
# ana=Analyzer()
# ana.predict(model,df)
# # data["Clean"]

# check_random_tweet()
# Bare expression: as the last statement of a notebook cell it displays the frame.
labeled




Unnamed: 0,Id,Search Keyword,URL,Datetime,Tweet,Username,View Count,Reply Count,Retweet Count,Like Count,...,Mentioned Users Count,Mentioned Users,User Verified,User Followers Count,User Statuses Count,Year,Month,Day,Kota,Label
0,1608941325113774080,PPN Naik,https://twitter.com/andi_hpattera/status/16089...,2022-12-30 21:41:19,"Kslo mau naik byk lg naikkan lg PPN jadi 12,5%...",andi_hpattera,8,0,0,0,...,0,,False,2206,107203,2022,12,30,,0
1,1608824705276063746,PPN Naik,https://twitter.com/stephanusn/status/16088247...,2022-12-30 13:57:54,"@prastow Bayar pajak, lapor sendiri\nValidasi,...",stephanusn,391,0,0,4,...,1,['prastow'],False,269,51150,2022,12,30,,0
2,1608682909761949696,PPN Naik,https://twitter.com/andi_hpattera/status/16086...,2022-12-30 04:34:28,Setelah thn ini berhasil tercapai 110% yg lbh ...,andi_hpattera,12,0,0,0,...,0,,False,2206,107203,2022,12,30,,0
3,1608666864628158466,PPN Naik,https://twitter.com/HestiBambang/status/160866...,2022-12-30 03:30:42,Jika Pemerintah tetapkan UMN setara KHL/PTKP R...,HestiBambang,26,0,0,0,...,0,,False,4,6884,2022,12,30,,0
4,1608487033860915203,PPN Naik,https://twitter.com/marizalass/status/16084870...,2022-12-29 15:36:07,@IndiHomeJBN +ppn 11% = 327.450 ya bang? itu b...,marizalass,32,0,0,0,...,1,['IndiHomeJBN'],False,6972,162949,2022,12,29,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28791,1289402360406532098,Tarif baru PPN,https://twitter.com/VIVAcoid/status/1289402360...,2020-08-01 03:27:33,"Kena PPN, Ini Tarif Baru Langganan Netflix htt...",VIVAcoid,0,0,0,0,...,0,,True,4670160,1658577,2020,8,1,,0
28792,1288775814851383296,Tarif baru PPN,https://twitter.com/RadarKorupsi/status/128877...,2020-07-30 09:57:53,terlalu dibesar-2kan. lagi pula BB nya 61 jt. ...,RadarKorupsi,0,0,1,1,...,0,,False,22660,138542,2020,7,30,,0
28793,1280992817007915008,Tarif baru PPN,https://twitter.com/hmzailanispog/status/12809...,2020-07-08 22:31:01,@Dennysiregar7 @PBIDI SE terbit hari ini. Test...,hmzailanispog,0,0,0,0,...,2,"['Dennysiregar7', 'PBIDI']",False,775,5214,2020,7,8,,0
28794,1279055985802276866,Tarif baru PPN,https://twitter.com/dewantara_adhi/status/1279...,2020-07-03 14:14:45,"@pln_123 Enak mah kalau yg 450, bisa gratis. k...",dewantara_adhi,0,0,0,0,...,1,['pln_123'],False,87,3205,2020,7,3,,0
