In [18]:
#Import Custom Library
from Idlysis import Analyzer,Preprocessor
import time
from datetime import datetime
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


def clean_data(source_path='data/2023-02-28 19.36.57.730834 PPN Naik.csv',
               output_path='data/clean.csv'):
    """Clean the raw tweet export and write the result to ``output_path``.

    Steps, in order: drop rows whose ``Tweet`` text is duplicated, coerce column
    dtypes, derive the ``Year``/``Month``/``Day`` and ``Kota`` columns, keep only
    tweets dated strictly between 2020-05-01 and 2022-12-31, drop the
    ``Sentiment Label`` column, keep only rows whose ``Language`` is ``"in"``,
    and save the table as CSV without the index.

    Args:
        source_path: CSV file to read. Defaults to the raw export used by this
            notebook.
        output_path: CSV file to write. Defaults to ``data/clean.csv``.

    Returns:
        None. The cleaned table is only written to disk.

    Raises:
        KeyError: raised by pandas when one of the expected columns is absent.
    """
    #Data Cleaner
    data = pd.read_csv(source_path)
    data = data.drop_duplicates(subset=["Tweet"])

    #Set Data Type
    data = data.astype({
        'Id': 'int64',
        'View Count': 'int32',
        'Reply Count': 'int32',
        'Retweet Count': 'int32',
        'Like Count': 'int32',
        'Quote Count': 'int32',
        'Mentioned Users Count': 'int32',
        'User Verified': 'bool',
        'User Followers Count': 'int32',
        'User Statuses Count': 'int32',
    })
    # astype() rejects the unit-less 'datetime64' dtype on pandas 2.x, so the
    # timestamps are parsed explicitly. utc=True followed by tz_localize(None)
    # gives naive timestamps whether or not the raw strings carry a UTC offset,
    # which keeps the comparison with the naive cutoff dates below valid.
    data["Datetime"] = pd.to_datetime(data["Datetime"], utc=True).dt.tz_localize(None)

    #Separate Date
    data["Year"] = data["Datetime"].dt.year
    data["Month"] = data["Datetime"].dt.month
    data["Day"] = data["Datetime"].dt.day

    #Add New Column : Kota (the text before the first comma of "Place")
    data["Kota"] = [str(place).split(",")[0] for place in data["Place"]]

    #Date Cutoff (both bounds are exclusive)
    lower_bound = datetime(2020, 5, 1)
    upper_bound = datetime(2022, 12, 31)
    data = data[(data["Datetime"] < upper_bound) & (data["Datetime"] > lower_bound)]

    #Remove Sentiment Label Column
    data = data.drop(columns=["Sentiment Label"])

    #Indonesian Only
    data = data[data["Language"] == "in"]

    #Save Data
    data.to_csv(output_path, index=False)

def check_tweet(order):
    """Return a one-line summary of the preprocessed tweet stored under ``order``.

    The tweet is looked up in the module-level ``data`` frame by using ``order``
    as an index label on the ``Tweet`` column, and its text is passed through
    ``Preprocessor`` before being appended to the summary.
    """
    cleaned_text=Preprocessor(data["Tweet"][order]).get_text()
    prefix="Tweet order: "+str(order)
    return prefix+" Tweet: "+cleaned_text

def check_random_tweet():
    """Return the summary line for one randomly chosen tweet from ``data``.

    Reads the module-level ``data`` frame, draws an integer between 0 and
    ``len(data["Tweet"]) - 1`` inclusive, looks the tweet up by that value as an
    index label, and returns a ``"Tweet order: ... Tweet: ..."`` string built
    from the preprocessed text.

    Raises:
        ValueError: if ``data`` holds no tweets.
    """
    # Imported here because the notebook's import cell never brings in `random`;
    # without it the call failed with NameError.
    import random

    tweets=data["Tweet"]
    if len(tweets)==0:
        raise ValueError("data contains no tweets to sample from")
    order=random.randint(0,len(tweets)-1)
    # NOTE(review): the lookup is label-based, so this assumes `data` carries
    # the default 0..n-1 index (as it does right after pd.read_csv) - confirm.
    pre=Preprocessor(tweets[order])
    return "Tweet order: "+str(order)+" Tweet: "+pre.get_text()

def sentimen_kenaikan_ppn():
    """Predict a sentiment label for the cleaned tweets and save the result.

    Reads ``data/clean.csv`` and ``data/labeled.csv``, builds features from the
    labeled file (preprocessed text, punctuation counts and TF-IDF weights),
    passes the rows that already have a ``Label`` to ``Analyzer().predict`` as
    training data together with the full feature table to predict on, stores
    the returned predictions in the ``Label`` column of the cleaned data and
    writes that table to ``data/predicted.csv``.

    Returns:
        The DataFrame read from ``data/clean.csv`` with the predicted ``Label``
        column added.
    """
    #Load Data
    data=pd.read_csv('data/clean.csv')
    labeled=pd.read_csv('data/labeled.csv') #Labeled Data

    # Creating/Edit Feature to Predict
    labeled["Clean"]=[Preprocessor(i).get_text() for i in labeled["Tweet"]]
    labeled["Exclamation Mark"]=[i.count("!") for i in labeled["Tweet"]]
    labeled["Question Mark"]=[i.count("?") for i in labeled["Tweet"]]
    vect=TfidfVectorizer(max_features=1000,ngram_range=(1,3))
    V=vect.fit_transform(labeled["Clean"])
    features=vect.get_feature_names_out()
    tfidf_data=pd.DataFrame(V.toarray(),columns=features)
    labeled=labeled.join(tfidf_data)
    # The insert position is evaluated before pop() removes the column, so it
    # equals the post-pop column count and "Label" ends up as the last column.
    labeled.insert(len(labeled.columns)-1, "Label", labeled.pop("Label"))

    # Select Feature and Target to Predict
    # NOTE(review): the positional slices 7:10 and 28: are hard-coded against
    # the column order of labeled.csv, which is not visible here - confirm that
    # they pick the intended feature columns and the trailing "Label".
    train_data=labeled[labeled["Label"].notnull()].copy()
    train_data=train_data.iloc[:,7:10].join(train_data.iloc[:,28:])

    data_to_predict=labeled.copy()
    data_to_predict["Label"]=None
    data_to_predict=data_to_predict.iloc[:,7:10].join(data_to_predict.iloc[:,28:])

    #Direct Data Predict and Training Labled Data
    prediction=Analyzer().predict(train_data,data_to_predict)
    # NOTE(review): assumes labeled.csv holds the same rows, in the same order,
    # as clean.csv so the predictions line up with `data` - confirm.
    data["Label"]=prediction

    #Save Data
    data.to_csv('data/predicted.csv',index=False)
    # Printed before returning; the original placed this after `return`, where
    # it could never run.
    print("Data Saved to predicted.csv")
    return data

def sample_dataset():
    """Run the prediction pipeline on the Twitter US Airline Sentiment sample.

    Reads ``dataset sample/Twitter US Airline Sentiment.csv``, encodes
    ``airline_sentiment`` as 0 (neutral), 1 (negative) or 2 (positive), builds
    TF-IDF features from the preprocessed ``text`` column, calls
    ``Analyzer().predict`` with the encoded table as training data and a copy
    whose target is blanked as the data to predict, and writes the feature
    table plus the predictions to ``data/predicted.csv``.

    Returns:
        DataFrame with the TF-IDF feature columns, the encoded
        ``airline_sentiment`` column and a ``predicted_sentiment`` column.

    Raises:
        ValueError: if ``airline_sentiment`` contains a value other than
            ``"neutral"``, ``"negative"`` or ``"positive"``.
    """
    data=pd.read_csv("dataset sample/Twitter US Airline Sentiment.csv")

    # Encode the three sentiment classes as integers.
    sentiment_codes={"neutral":0,"negative":1,"positive":2}
    sentiment=data["airline_sentiment"].map(sentiment_codes)
    if sentiment.isnull().any():
        # The original loop silently skipped such rows and then failed on the
        # length mismatch; fail with a message that names the cause instead.
        raise ValueError("airline_sentiment contains values other than neutral/negative/positive")
    data["airline_sentiment"]=sentiment.astype("int64")

    # `Preprocessor` is imported directly; the module name `Idlysis` itself is
    # never bound in this notebook, so `Idlysis.Preprocessor` raised NameError.
    data["clean"]=[Preprocessor(i).get_text() for i in data["text"]]

    vect=TfidfVectorizer(max_features=500,ngram_range=(1,3))
    V=vect.fit_transform(data["clean"])
    features=vect.get_feature_names_out()
    tfidf_data=pd.DataFrame(V.toarray(),columns=features)

    # Keep only the TF-IDF features and the target, selected by name rather
    # than by the positional indices the original relied on.
    data=tfidf_data.join(data[["airline_sentiment"]])

    train_data=data.copy()
    data_to_predict=data.copy()
    data_to_predict["airline_sentiment"]=None
    prediction=Analyzer().predict(train_data,data_to_predict)
    # NOTE(review): assumes predict() returns one label per row of
    # data_to_predict, as the sibling sentimen_kenaikan_ppn() relies on - confirm.
    data["predicted_sentiment"]=prediction

    #Save Data
    # NOTE(review): this is the same file sentimen_kenaikan_ppn() writes, so
    # running both functions overwrites one result with the other.
    data.to_csv('data/predicted.csv',index=False)
    # Printed before returning; the original placed this after `return`, where
    # it could never run.
    print("Data Saved to predicted.csv")
    return data



In [15]:
# Cleaning Data: rebuild data/clean.csv from the raw export, then reload it.
startTime=time.time()
clean_data()
# Bound to the module-level name `data`, which check_tweet() and
# check_random_tweet() read.
data=pd.read_csv("data/clean.csv")
# The elapsed time covers both the cleaning run and the reload above.
print("Excecution time: ",time.time()-startTime)
# Bare expression on the last line so the notebook renders the frame.
data

Excecution time:  2.463632345199585


Unnamed: 0,Id,Search Keyword,URL,Datetime,Tweet,Username,View Count,Reply Count,Retweet Count,Like Count,...,Reply to User,Mentioned Users Count,Mentioned Users,User Verified,User Followers Count,User Statuses Count,Year,Month,Day,Kota
0,1608941325113774080,PPN Naik,https://twitter.com/andi_hpattera/status/16089...,2022-12-30 21:41:19,"Kslo mau naik byk lg naikkan lg PPN jadi 12,5%...",andi_hpattera,8,0,0,0,...,,0,,False,2206,107203,2022,12,30,
1,1608824705276063746,PPN Naik,https://twitter.com/stephanusn/status/16088247...,2022-12-30 13:57:54,"@prastow Bayar pajak, lapor sendiri\nValidasi,...",stephanusn,391,0,0,4,...,prastow,1,['prastow'],False,269,51150,2022,12,30,
2,1608682909761949696,PPN Naik,https://twitter.com/andi_hpattera/status/16086...,2022-12-30 04:34:28,Setelah thn ini berhasil tercapai 110% yg lbh ...,andi_hpattera,12,0,0,0,...,,0,,False,2206,107203,2022,12,30,
3,1608666864628158466,PPN Naik,https://twitter.com/HestiBambang/status/160866...,2022-12-30 03:30:42,Jika Pemerintah tetapkan UMN setara KHL/PTKP R...,HestiBambang,26,0,0,0,...,,0,,False,4,6884,2022,12,30,
4,1608487033860915203,PPN Naik,https://twitter.com/marizalass/status/16084870...,2022-12-29 15:36:07,@IndiHomeJBN +ppn 11% = 327.450 ya bang? itu b...,marizalass,32,0,0,0,...,IndiHomeJBN,1,['IndiHomeJBN'],False,6972,162949,2022,12,29,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27309,1289402360406532098,Tarif baru PPN,https://twitter.com/VIVAcoid/status/1289402360...,2020-08-01 03:27:33,"Kena PPN, Ini Tarif Baru Langganan Netflix htt...",VIVAcoid,0,0,0,0,...,,0,,True,4670160,1658577,2020,8,1,
27310,1288775814851383296,Tarif baru PPN,https://twitter.com/RadarKorupsi/status/128877...,2020-07-30 09:57:53,terlalu dibesar-2kan. lagi pula BB nya 61 jt. ...,RadarKorupsi,0,0,1,1,...,,0,,False,22660,138542,2020,7,30,
27311,1280992817007915008,Tarif baru PPN,https://twitter.com/hmzailanispog/status/12809...,2020-07-08 22:31:01,@Dennysiregar7 @PBIDI SE terbit hari ini. Test...,hmzailanispog,0,0,0,0,...,Dennysiregar7,2,"['Dennysiregar7', 'PBIDI']",False,775,5214,2020,7,8,
27312,1279055985802276866,Tarif baru PPN,https://twitter.com/dewantara_adhi/status/1279...,2020-07-03 14:14:45,"@pln_123 Enak mah kalau yg 450, bisa gratis. k...",dewantara_adhi,0,0,0,0,...,pln_123,1,['pln_123'],False,87,3205,2020,7,3,


In [19]:
# Predict Data: run the labelling pipeline; it also writes data/predicted.csv.
startTime=time.time()
# NOTE(review): the "Testing.. Accuracy Score" lines in this cell's output are
# not printed by any code in this notebook - presumably they come from
# Analyzer.predict; confirm.
predicted_data=sentimen_kenaikan_ppn()
print("Excecution time: ",time.time()-startTime)
# Bare expression on the last line so the notebook renders the frame.
predicted_data


Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.14285714285714285
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Score:  0.5714285714285714
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.42857142857142855
KNeighborsClassifier() [1. 1. 1. 1. 1. 1. 1.]
Average Accuracy Score: 0.37142857142857133 

Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.42857142857142855
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Score:  0.2857142857142857
Testing.. Accuracy Scor

Unnamed: 0,Id,Search Keyword,URL,Datetime,Tweet,Username,View Count,Reply Count,Retweet Count,Like Count,...,Mentioned Users Count,Mentioned Users,User Verified,User Followers Count,User Statuses Count,Year,Month,Day,Kota,Label
0,1608941325113774080,PPN Naik,https://twitter.com/andi_hpattera/status/16089...,2022-12-30 21:41:19,"Kslo mau naik byk lg naikkan lg PPN jadi 12,5%...",andi_hpattera,8,0,0,0,...,0,,False,2206,107203,2022,12,30,,0.0
1,1608824705276063746,PPN Naik,https://twitter.com/stephanusn/status/16088247...,2022-12-30 13:57:54,"@prastow Bayar pajak, lapor sendiri\nValidasi,...",stephanusn,391,0,0,4,...,1,['prastow'],False,269,51150,2022,12,30,,1.0
2,1608682909761949696,PPN Naik,https://twitter.com/andi_hpattera/status/16086...,2022-12-30 04:34:28,Setelah thn ini berhasil tercapai 110% yg lbh ...,andi_hpattera,12,0,0,0,...,0,,False,2206,107203,2022,12,30,,0.0
3,1608666864628158466,PPN Naik,https://twitter.com/HestiBambang/status/160866...,2022-12-30 03:30:42,Jika Pemerintah tetapkan UMN setara KHL/PTKP R...,HestiBambang,26,0,0,0,...,0,,False,4,6884,2022,12,30,,0.0
4,1608487033860915203,PPN Naik,https://twitter.com/marizalass/status/16084870...,2022-12-29 15:36:07,@IndiHomeJBN +ppn 11% = 327.450 ya bang? itu b...,marizalass,32,0,0,0,...,1,['IndiHomeJBN'],False,6972,162949,2022,12,29,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27309,1289402360406532098,Tarif baru PPN,https://twitter.com/VIVAcoid/status/1289402360...,2020-08-01 03:27:33,"Kena PPN, Ini Tarif Baru Langganan Netflix htt...",VIVAcoid,0,0,0,0,...,0,,True,4670160,1658577,2020,8,1,,1.0
27310,1288775814851383296,Tarif baru PPN,https://twitter.com/RadarKorupsi/status/128877...,2020-07-30 09:57:53,terlalu dibesar-2kan. lagi pula BB nya 61 jt. ...,RadarKorupsi,0,0,1,1,...,0,,False,22660,138542,2020,7,30,,0.0
27311,1280992817007915008,Tarif baru PPN,https://twitter.com/hmzailanispog/status/12809...,2020-07-08 22:31:01,@Dennysiregar7 @PBIDI SE terbit hari ini. Test...,hmzailanispog,0,0,0,0,...,2,"['Dennysiregar7', 'PBIDI']",False,775,5214,2020,7,8,,1.0
27312,1279055985802276866,Tarif baru PPN,https://twitter.com/dewantara_adhi/status/1279...,2020-07-03 14:14:45,"@pln_123 Enak mah kalau yg 450, bisa gratis. k...",dewantara_adhi,0,0,0,0,...,1,['pln_123'],False,87,3205,2020,7,3,,1.0
