In [59]:
## import random
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import csv
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score
from joblib import dump, load
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#Import Classifier Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from IPython.display import clear_output

class Preprocessor():
    """Clean a raw text (e.g. a tweet) and turn it into normalised tokens.

    Constructing an instance immediately runs case folding, filtering,
    tokenising and slang standardisation on ``text``. Afterwards
    ``self.text`` holds a list of tokens and :meth:`get_text` returns them
    joined by single spaces. Stemming is available through
    :meth:`_stemming` but is not part of the default pipeline.
    """

    # Substitutions applied in order by _filtering(): (compiled pattern,
    # replacement). Raw strings keep the regex escapes (\S, \s, \.) from
    # being treated as (invalid) Python string escapes.
    _FILTERS = (
        # URLs
        (re.compile(r"https\S+"), ""),
        (re.compile(r"http\S+"), ""),
        (re.compile(r"\S+\.com\S+"), ""),
        (re.compile(r"\S+\.com"), ""),
        # Hashtags
        (re.compile(r"#\S+"), ""),
        # Mentions
        (re.compile(r"@\S+"), ""),
        # Symbols and digits (anything that is not an ASCII letter or space)
        (re.compile(r"[^A-Za-z\s]"), " "),
        # Collapse runs of whitespace, then drop one leading whitespace char
        (re.compile(r"\s+"), " "),
        (re.compile(r"^\s"), ""),
    )

    # Cache of the non-standard -> standard word table. It is filled on first
    # use so the CSV is parsed once instead of once per instance.
    _standard_words = None

    def __init__(self, text, order=""):
        """Run the preprocessing pipeline on ``text``.

        Args:
            text: Raw string to clean.
            order: Unused; kept so existing callers that pass it still work.
        """
        self.text = text
        self._casefolding()
        self._filtering()
        self._tokenize()
        self._standarize()
        # Stemming is intentionally not part of the default pipeline; call
        # _stemming() explicitly when root words are needed.

    def get_text(self):
        """Return the processed tokens joined by single spaces."""
        return " ".join(self.text)

    def _casefolding(self):
        """Lower-case the text."""
        self.text = self.text.lower()

    def _filtering(self):
        """Strip URLs, hashtags, mentions, symbols/digits and extra spaces."""
        text = self.text
        for pattern, replacement in self._FILTERS:
            text = pattern.sub(replacement, text)
        self.text = text

    def _tokenize(self):
        """Split the text into a list of word tokens."""
        self.text = word_tokenize(self.text)

    @classmethod
    def _load_standard_words(cls):
        """Return the word-replacement table from ``standard_word.csv``.

        The first row is treated as a header and skipped. For every other
        row, column 0 is the word to replace and column 1 its replacement.
        Rows with fewer than two columns (e.g. blank lines) are ignored
        instead of raising ``IndexError``. The table is cached on the class,
        so later edits to the file are not picked up within the same process.
        """
        if cls._standard_words is None:
            mapping = {}
            # newline="" is what the csv module documents for reader input.
            with open("standard_word.csv", "r", newline="") as file:
                rows = csv.reader(file, delimiter=",")
                next(rows, None)  # skip the header row
                for row in rows:
                    if len(row) < 2:
                        continue
                    mapping[row[0]] = row[1]
            cls._standard_words = mapping
        return cls._standard_words

    def _standarize(self):
        """Replace every token that has an entry in the standard-word table."""
        mapping = self._load_standard_words()
        self.text = [mapping.get(token, token) for token in self.text]

    def _stemming(self):
        """Reduce every token to its root word with the Sastrawi stemmer."""
        stemmer = StemmerFactory().create_stemmer()
        self.text = [stemmer.stem(token) for token in self.text]
    
class Analyzer():
    """Build a classifier from labelled tabular data and use it to predict.

    Every DataFrame handled here is expected to carry the target in its LAST
    column; all preceding columns are used as features.
    """

    def __init__(self):
        """No state is kept; every method works on its arguments only."""
        pass

    def predict(self,training_data:pd.DataFrame,data_to_predict:pd.DataFrame):
        """Train on ``training_data`` and predict ``data_to_predict`` directly.

        Args:
            training_data: Features plus the target in the last column.
            data_to_predict: Same column layout; the content of its last
                column is ignored.

        Returns:
            Whatever the selected model's ``predict`` returns.
        """
        model = self.create_model(training_data)
        return self.predict_by_model(model, data_to_predict)

    def predict_by_model(self,model,data:pd.DataFrame):
        """Predict with an already fitted ``model``.

        Args:
            model: Object exposing ``predict(X)``.
            data: Frame whose last column is the (ignored) target column.

        Returns:
            The result of ``model.predict`` on every column but the last.
        """
        # Positional slicing keeps working even if a feature column happens
        # to share its name with the target column.
        features = data.iloc[:, :-1]
        return model.predict(features)

    def create_model(self,data:pd.DataFrame,is_save:bool=False):
        """Pick the best of several classifiers and return it fitted.

        Each candidate is scored on 10 random 50/50 train/test splits; the
        one with the highest average accuracy wins. The winner is then
        refitted on the complete data set so the returned model has seen
        every labelled row rather than only the last random half.

        Args:
            data: Features plus the target in the last column.
            is_save: When true, also dump the model to
                ``models/<ClassName> <timestamp>.joblib``.

        Returns:
            The fitted winning estimator.
        """
        X = data.iloc[:, :-1]
        y = data.iloc[:, -1]

        candidates = [
            KNeighborsClassifier(),
            SVC(),
            GaussianNB(),
            MultinomialNB(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            GradientBoostingClassifier(),
        ]

        # Start below any reachable accuracy so a winner is always selected,
        # even when every candidate scores exactly 0.
        best_model = None
        max_accuracy = -1.0
        for candidate in candidates:
            accuracies = []
            for _ in range(10):
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
                candidate.fit(X_train, y_train)
                prediction = candidate.predict(X_test)
                score = accuracy_score(y_test, prediction)
                accuracies.append(score)
                print("Testing.. Accuracy Score: ", score)

            accuracy = np.average(accuracies)
            if accuracy > max_accuracy:
                max_accuracy = accuracy
                best_model = candidate
            # Shows the predictions of the last split only.
            print(candidate, prediction)
            print("Average Accuracy Score:", accuracy, "\n")

        # The reported accuracy stays the cross-split average measured above.
        best_model.fit(X, y)

        model_name = best_model.__class__.__name__
        print("Training Data Size: ", data.shape[0])
        print("Model used: " + model_name + " model. Accuracy: " + str(max_accuracy))
        if is_save:
            import os

            # dump() does not create missing directories.
            os.makedirs("models", exist_ok=True)
            dump(best_model, "models/" + model_name + " " + str(datetime.now()).replace(":", "") + ".joblib")
        return best_model

def check_tweet(order):
    """Return a one-line summary of the preprocessed tweet at ``order``.

    The tweet is read from the module-level ``data["Tweet"]`` and cleaned
    with ``Preprocessor`` before being formatted.
    """
    cleaned = Preprocessor(data["Tweet"][order]).get_text()
    return f"Tweet order: {order} Tweet: {cleaned}"

def check_random_tweet():
    """Return the ``check_tweet`` summary for a randomly chosen tweet.

    The index is drawn uniformly from ``0 .. len(data["Tweet"]) - 1`` using
    the module-level ``data`` frame.
    """
    # Imported here because the file-level `import random` is commented out,
    # which made this function fail with NameError.
    import random

    order = random.randint(0, len(data["Tweet"]) - 1)
    return check_tweet(order)

# Load data: `data` is the frame that finally receives the predicted labels
# and is written to disk; `labeled` supplies the tweets and the known labels.
data=pd.read_csv('data/clean.csv')
labeled=pd.read_csv('data/labeled.csv') # labelled data

# Build TF-IDF features (at most 500 terms) from the tweet text.
vect=TfidfVectorizer(max_features=500)
V=vect.fit_transform(labeled["Tweet"])
features=vect.get_feature_names_out()
tfidf_data=pd.DataFrame(V.toarray(),columns=features)

# Append the TF-IDF columns, then move "Label" to the very end because
# Analyzer treats the last column as the target.
labeled=labeled.join(tfidf_data)
labeled.insert(len(labeled.columns)-1, "Label", labeled.pop("Label"))

# Select features and target. Training rows are those that have a label.
# Columns are picked by position: 6..9 plus everything from 27 onwards.
# NOTE(review): presumably 6..9 are the count columns and 27 is the first
# TF-IDF column - confirm against the layout of labeled.csv.
train_data=labeled[labeled["Label"].notnull()].copy()
train_data=train_data.iloc[:,6:10].join(train_data.iloc[:,27:])

# Every row of `labeled` (including already labelled ones) is predicted;
# its "Label" is blanked and the same column selection is applied.
data_to_predict=labeled.copy()
data_to_predict["Label"]=None
data_to_predict=data_to_predict.iloc[:,6:10].join(data_to_predict.iloc[:,27:])

# Train on the labelled rows and predict all rows in one go.
prediction=Analyzer().predict(train_data,data_to_predict)
predicted_data=data_to_predict.copy()
predicted_data["Label"]=prediction
# NOTE(review): this assignment needs clean.csv to have exactly as many rows
# as labeled.csv, in the same order - confirm.
data["Label"]=prediction

# Save data
data.to_csv('Predicted.csv',index=False)
print("Data Saved to Predicted.csv")

# Show table (rendered by the notebook as the cell's result)
predicted_data



Testing.. Accuracy Score:  0.7459016393442623
Testing.. Accuracy Score:  0.7131147540983607
Testing.. Accuracy Score:  0.7377049180327869
Testing.. Accuracy Score:  0.6311475409836066
Testing.. Accuracy Score:  0.6885245901639344
Testing.. Accuracy Score:  0.6639344262295082
Testing.. Accuracy Score:  0.7950819672131147
Testing.. Accuracy Score:  0.680327868852459
Testing.. Accuracy Score:  0.7459016393442623
Testing.. Accuracy Score:  0.7868852459016393
KNeighborsClassifier() [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1.
 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 1.]
Average Accuracy Score: 0.7188524590163936 

Testing.. Accuracy Score:  0.6967213114754098
Testing.. Accuracy Score:  0.7213114754098361
Testing.

Unnamed: 0,View Count,Reply Count,Retweet Count,Like Count,00,000,10,11,12,2022,...,termasuk,tidak,tks,udah,untuk,uu,ya,yang,yg,Label
0,8,0,0,0,0.0,0.000000,0.000000,0.000000,0.515543,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,1.0
1,391,0,0,4,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.379019,0.0
2,12,0,0,0,0.0,0.000000,0.413804,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.370251,1.0
3,26,0,0,0,0.0,0.000000,0.000000,0.080713,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.166737,0.0
4,32,0,0,0,0.0,0.000000,0.000000,0.179465,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.366576,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28791,0,0,0,0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
28792,0,0,1,1,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.248596,0.000000,0.0
28793,0,0,0,0,0.0,0.533903,0.000000,0.000000,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.389892,0.000000,0.0
28794,0,0,0,0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.260903,0.0,0.0,0.0,0.0,0.0,0.210079,0.000000,0.424927,0.0
