In [81]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import csv
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score
from joblib import dump, load
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#Import Classifier Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

class Preprocessor():
    def __init__(self,text):
        """
        Preprocessing Text: Digunakan untuk membersihkan teks sebelum dilakukan analisis.
        mencakup proses casefolding, filtering
        """
        self.text=text
        self._casefolding()
        self._filtering()
        self._tokenize()
        self._standarize()
#         self._stemming()

    def get_text(self):
        return " ".join(self.text)
    
    def _casefolding(self):
        #Mengubah menjadi huruf kecil        
        self.text=self.text.lower()
    
    def _filtering(self):        
        #Url
        self.text=re.sub("https\S+","",self.text)
        self.text=re.sub("http\S+","",self.text)
        self.text=re.sub("\S+\.com\S+","",self.text)
        self.text=re.sub("\S+\.com","",self.text)
        
        #Hashtag
        self.text=re.sub("#\S+","",self.text)
        
        #Mention
        self.text=re.sub("@\S+","",self.text)
        
        #Symbol and Number
        self.text=re.sub("[^A-Za-z\s]","",self.text)
        
        #Spacing
        self.text=re.sub("\s+"," ",self.text)
        self.text=re.sub("^\s","",self.text)
        self.text=self.text
    
    def _tokenize(self):
        #Membagi kata
        self.text=word_tokenize(self.text)

    def _standarize(self):        
        #Mengubah menjadi kata baku
        j={}
        with open("standard_word.csv","r") as file:
            data=csv.reader(file,delimiter=",")
            for k,i in enumerate(data):
                if k==0: continue
                j[i[0]]=i[1]
                
        for k,t in enumerate(self.text):
            if t in j:
                self.text[k]=j[t]
    
    def _stemming(self):        
        factory=StemmerFactory()
        stemmer=factory.create_stemmer()
        
        for k,i in enumerate(self.text):
            self.text[k]=stemmer.stem(i)
    
class Analyzer():  
    def __init__(self):
        pass
    
    def predict(self,model,data:pd.DataFrame):
        target_column:int=len(data.columns)-1
        X=data.iloc[:,data.columns!=data.columns[target_column]]
        y=data[data.columns[target_column]]
        return model.predict(X)
    
    def train_model(self,model,data:pd.DataFrame):
        target_column:int=len(data.columns)-1
        X=data.iloc[:,data.columns!=data.columns[target_column]]
        y=data[data.columns[target_column]]
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
        
        models_used=[
            KNeighborsClassifier(),
            SVC(),
            GaussianNB(),
            MultinomialNB(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            GradientBoostingClassifier(),
        ]
        model.fit(X_train,y_train)
        prediction=model.predict(X_test)
        if accuracy_score(prediction,y_test)>0.9:
            print("Model has beed trained with new data and saved using "+model.__class__.__name__+" model. Accuracy Score: ",accuracy_score(prediction,y_test))
            dump(model,"models/"+model.__class__.__name__+" "+str(datetime.now()).replace(":","")+".joblib")
        else:
            print("Data is not good enough. Model is not saved. Accuracy score: ",accuracy_score(prediction,y_test))
        
    def create_model(self,data:pd.DataFrame):
        target_column:int=len(data.columns)-1
        X=data.iloc[:,data.columns!=data.columns[target_column]]
        y=data[data.columns[target_column]]
        X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
        
        models_used=[
            KNeighborsClassifier(),
            SVC(),
            GaussianNB(),
            MultinomialNB(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            GradientBoostingClassifier(),
        ]
        
        max_accuracy=0
        for i in models_used:
            i.fit(X_train,y_train)
            prediction=i.predict(X_test)
            accuracy=accuracy_score(prediction,y_test)
            
            if accuracy>max_accuracy:
                max_accuracy=accuracy
                model=i
            print(i,prediction)
            print("Accuracy Score:",accuracy,"\n")
        
        print("Model saved using "+model.__class__.__name__+" model. Accuracy: "+str(max_accuracy))
        dump(model,"models/"+model.__class__.__name__+" "+str(datetime.now()).replace(":","")+".joblib")

#Load Model
model=load('models/DecisionTreeClassifier 2023-02-28 005926.237284.joblib')

#Load Data
data=pd.read_csv('data/2023-02-27 08.24.25.743067 Keywords pajak.csv')

#Preprocessing
data["Clean"]=[Preprocessor(i).get_text() for i in data["Tweet"]]

#TFIDF Calculation
vect=TfidfVectorizer()
X=vect.fit_transform(data["Clean"])
df=pd.DataFrame(X.toarray())

#Predict
ana=Analyzer()
ana.predict(model,df)
# data["Clean"]

ValueError: X has 4555 features, but DecisionTreeClassifier is expecting 1071 features as input.