# TUGAS AKHIR

## Data Selection

In [None]:
import twint
import nest_asyncio
import pandas as pd

In [None]:
# Patch asyncio so twint's event loop can run inside the notebook's own loop.
nest_asyncio.apply()

# Search configuration: tweets matching the query "perubahan rute krl"
# (KRL route change) posted inside the Since/Until date window.
c = twint.Config()
c.Search = "perubahan rute krl"
c.Since = '2022-05-26'
c.Until= '2023-02-28'
# NOTE(review): twint's CSV options appear to be `Store_csv` together with
# `Output`; confirm that `Output_csv` has any effect at all.
c.Output_csv = True
# Keep the results in twint's pandas storage so they can be read back below.
c.Pandas = True

# Run the search, then fetch the collected tweets from twint's pandas storage.
twint.run.Search(c)
Tweets_df = twint.storage.panda.Tweets_df

In [None]:
df = twint.storage.panda.Tweets_df
df

In [None]:
df.info()

In [None]:
df = df[["username", "name", "date", "tweet"]]
df

In [None]:
df.loc[df['username'] == 'CommuterLine']

In [None]:
data= df.drop(df.loc[df['username'] == 'CommuterLine'].index)

In [None]:
data

In [None]:
data.to_csv('data.csv', index = False)

In [None]:
data["tweet"] = data["tweet"].astype(str)

In [None]:
print(data["tweet"])

## Preprocessing

In [None]:
import nltk
import re
import string
from nltk.probability import FreqDist

### 1. Cleansing

In [None]:
# Remove numbers.
_DIGIT_RUN = re.compile(r"\d+")


def remove_number(text):
    """Return *text* with every run of digits replaced by a single space."""
    return _DIGIT_RUN.sub(" ", text)

data['tweet'] = data['tweet'].apply(remove_number)

In [None]:
# Remove punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return *text* with every ``string.punctuation`` character deleted."""
    return text.translate(_PUNCTUATION_TABLE)

data['tweet'] = data['tweet'].apply(remove_punctuation)

In [None]:
_NON_ASCII_RUN = re.compile(r"[^\x00-\x7F]+")
_MENTION_HASHTAG_OR_URL = re.compile(r"([@#]\w+)|(\w+://\S+)")


def remove_links(text):
    """Strip non-ASCII characters, mentions, hashtags and URLs from *text*.

    Returns the cleaned text with runs of whitespace collapsed to a single
    space and no leading or trailing whitespace.

    NOTE(review): in this notebook the function is applied after
    ``remove_number`` and ``remove_punctuation``, which have already removed
    '@', '#', ':' and '/'. Applying it first would let the mention, hashtag
    and URL patterns actually match.
    """
    # Replace non-ASCII runs (emoji, CJK, ...) with a space. Encoding with
    # errors='replace' would leave a literal '?' in the text instead.
    text = _NON_ASCII_RUN.sub(" ", text)
    # Remove @mentions and #hashtags (handles may contain '_') and full URLs.
    text = _MENTION_HASHTAG_OR_URL.sub(" ", text)
    # Remove bare URL schemes that had nothing after them.
    text = text.replace("http://", " ").replace("https://", " ")
    # Normalise whitespace last so no earlier step can leave stray spaces.
    return " ".join(text.split())
                
data['tweet'] = data['tweet'].apply(remove_links)

In [None]:
data

In [None]:
print (data["tweet"])

### 2. Case Folding

In [None]:
data['tweet'] = data['tweet'].str.lower()

In [None]:
data 

In [None]:
print (data["tweet"])

### 3. Tokenizing

In [None]:
from nltk.tokenize import word_tokenize

def word_tokenize_wrapper(text):
    """Tokenize *text* with NLTK's ``word_tokenize`` and return the result."""
    return word_tokenize(text)

data['tweet_tokenize'] = data['tweet'].apply(word_tokenize_wrapper)

In [None]:
def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from *text* (called below on each token list)."""
    return FreqDist(text)

# One frequency distribution per tweet, computed from its token list.
Ulasan_fqsist = data['tweet_tokenize'].apply(freqDist_wrapper)

# Show the token counts of the first rows, most frequent token first.
print('Frequency Tokens : \n') 
print(Ulasan_fqsist.head().apply(lambda x : x.most_common()))

In [None]:
data

In [None]:
print(data["tweet_tokenize"])

### 4. Normalization

In [None]:
# Normalisation dictionary: tokens found in the first column are replaced by
# the value in the second column of the same row.
normalizad_word = pd.read_excel('kamus_coba.xlsx')

normalizad_word_dict = {}

# Select both columns by position with .iloc; integer keys on a label-indexed
# row (row[0], row[1]) are deprecated in pandas 2.x. setdefault keeps the
# first mapping when a term appears more than once.
for raw_term, standard_term in zip(normalizad_word.iloc[:, 0],
                                   normalizad_word.iloc[:, 1]):
    normalizad_word_dict.setdefault(raw_term, standard_term)


def normalized_term(document):
    """Return the tokens of *document* with dictionary terms replaced.

    Tokens that are not keys of ``normalizad_word_dict`` are kept unchanged.
    """
    return [normalizad_word_dict.get(term, term) for term in document]


data['tweet_normalized'] = data['tweet_tokenize'].apply(normalized_term)

In [None]:
data

In [None]:
print(data['tweet_normalized'])

### 5. Stopword Removal

In [None]:
from nltk.corpus import stopwords

list_stopwords = stopwords.words('indonesian')
# Testing membership against a list scans it for every token; a frozenset
# built once makes each lookup constant-time.
_stopword_set = frozenset(list_stopwords)


# Remove stopwords from a list of tokens.
def stopwords_removal(words):
    """Return the tokens of *words* that are not Indonesian stopwords."""
    return [word for word in words if word not in _stopword_set]


data['tweet_stop_removed'] = data['tweet_normalized'].apply(stopwords_removal)

In [None]:
data

In [None]:
print(data['tweet_stop_removed'])

### 6. Stemming

In [None]:
pip install Sastrawi

In [None]:
pip install swifter

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter


# Build the Sastrawi stemmer once; it is reused for every term.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemmed_wrapper(term):
    """Return the Sastrawi stem of *term*."""
    return stemmer.stem(term)


# Collect every distinct term in first-seen order so that each one is
# stemmed exactly once, however often it occurs in the corpus.
term_dict = dict.fromkeys(
    term for document in data['tweet_stop_removed'] for term in document
)

# Fill in the stem of each term and echo the mapping as it is built.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])


def get_stemmed_term(document):
    """Map every term of *document* to its cached stem from ``term_dict``."""
    return list(map(term_dict.__getitem__, document))


# Apply the cached stems to every tweet.
data['tweet_Stemmed'] = data['tweet_stop_removed'].swifter.apply(get_stemmed_term)

In [None]:
data

In [None]:
print(data['tweet_Stemmed'])

In [None]:
data['tweet_Stemmed']

In [None]:
data["tweet_Stemmed"] = data["tweet_Stemmed"].astype(str)

In [None]:
data.to_csv("data_hasil_preprocessing.csv", index = False)

In [None]:
import pandas as pd
import numpy as np

data=pd.read_csv("data_hasil_preprocessing.csv")

## Transformation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the stemmed tweets; the result is a sparse document-term matrix.
vectorizer = TfidfVectorizer()
tfidf_tweet_stemed = vectorizer.fit_transform(data['tweet_Stemmed'])
# The method is get_feature_names_out(); the previous spelling raised
# AttributeError.
features_names = vectorizer.get_feature_names_out()
# Dense copy, used only to inspect the weights as a table.
dense = tfidf_tweet_stemed.todense()
denselist = dense.tolist()
# Use `denselist`; the previous `denselst` was an undefined name.
df = pd.DataFrame(denselist, columns=features_names)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the TF-IDF features, using the 'label' column as target.
# NOTE(review): `data` was reloaded from data_hasil_preprocessing.csv, which
# the code above writes without a 'label' column. Presumably the labels are
# added to that file by hand before this cell runs; confirm.
# NOTE(review): resampling happens before the train/test splits below, so
# synthetic samples can end up in the test sets. Consider resampling only
# the training portion of each split.
smote = SMOTE(random_state=1, k_neighbors=5)
X_smote,Y_smote = smote.fit_resample(tfidf_tweet_stemed, data['label'])

## Text Mining & Evaluation

In [None]:
from sklearn import svm

# One SVC per kernel; the same three estimator objects are refit for each
# train/test split in the cells below.
SVM1 = svm.SVC(kernel='rbf')    
SVM2 = svm.SVC(kernel='linear') 
SVM3 = svm.SVC(kernel='poly')

In [None]:
# train_test_split lives in sklearn.model_selection; importing it from the
# top-level sklearn package raises ImportError.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 90/10 split of the resampled data. The keyword is `random_state`; the
# previous `random state` was a syntax error.
x_train1, x_test1, Y_train1, Y_test1 = train_test_split(
    X_smote, Y_smote, test_size=0.1, random_state=42
)

SVM1.fit(x_train1, Y_train1) # RBF
SVM2.fit(x_train1, Y_train1) # Linear
SVM3.fit(x_train1, Y_train1) # Polynomial

y_rbf = SVM1.predict(x_test1)
y_linear = SVM2.predict(x_test1)
y_polynomial = SVM3.predict(x_test1)

# Accuracy per kernel, in the order RBF, linear, polynomial.
print(accuracy_score(Y_test1,y_rbf))
print(accuracy_score(Y_test1,y_linear))
print(accuracy_score(Y_test1,y_polynomial))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions in the order RBF, linear, polynomial.
predictions_split_1 = (y_rbf, y_linear, y_polynomial)

# Confusion matrices first, for all three kernels.
for y_pred in predictions_split_1:
    print(confusion_matrix(Y_test1, y_pred))

# Then the per-class reports, in the same kernel order.
for y_pred in predictions_split_1:
    print(classification_report(Y_test1, y_pred))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 80/20 split of the resampled data.
x_train2, x_test2, Y_train2, Y_test2 = train_test_split(
    X_smote, Y_smote, test_size=0.2, random_state=42
)

# Refit the RBF, linear and polynomial models on this split.
for model in (SVM1, SVM2, SVM3):
    model.fit(x_train2, Y_train2)

y_rbf = SVM1.predict(x_test2)
y_linear = SVM2.predict(x_test2)
y_polynomial = SVM3.predict(x_test2)

# Accuracy per kernel, in the order RBF, linear, polynomial.
for y_pred in (y_rbf, y_linear, y_polynomial):
    print(accuracy_score(Y_test2, y_pred))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions in the order RBF, linear, polynomial.
predictions_split_2 = (y_rbf, y_linear, y_polynomial)

# Confusion matrices first, for all three kernels.
for y_pred in predictions_split_2:
    print(confusion_matrix(Y_test2, y_pred))

# Then the per-class reports, in the same kernel order.
for y_pred in predictions_split_2:
    print(classification_report(Y_test2, y_pred))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 70/30 split of the resampled data.
x_train3, x_test3, Y_train3, Y_test3 = train_test_split(
    X_smote, Y_smote, test_size=0.3, random_state=42
)

# Refit the RBF, linear and polynomial models on this split.
for model in (SVM1, SVM2, SVM3):
    model.fit(x_train3, Y_train3)

y_rbf = SVM1.predict(x_test3)
y_linear = SVM2.predict(x_test3)
y_polynomial = SVM3.predict(x_test3)

# Accuracy per kernel, in the order RBF, linear, polynomial.
for y_pred in (y_rbf, y_linear, y_polynomial):
    print(accuracy_score(Y_test3, y_pred))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions in the order RBF, linear, polynomial.
predictions_split_3 = (y_rbf, y_linear, y_polynomial)

# Confusion matrices first, for all three kernels.
for y_pred in predictions_split_3:
    print(confusion_matrix(Y_test3, y_pred))

# Then the per-class reports, in the same kernel order.
for y_pred in predictions_split_3:
    print(classification_report(Y_test3, y_pred))

### Wordcloud

In [None]:
import ast


def _to_token_list(value):
    """Return the list of tokens stored in one ``tweet_Stemmed`` cell.

    The column was cast to ``str`` and round-tripped through CSV, so each
    cell holds the text form of a list (e.g. "['krl', 'rute']"). Iterating
    that string directly would yield single characters, not tokens.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        # Missing value (e.g. NaN): treat it as a tweet with no tokens.
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not a list literal: fall back to whitespace-separated tokens.
        return value.split()
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return value.split()


# Space-joined stemmed tokens of each tweet, used for the word clouds.
data["tweet_Clean"] = [
    ' '.join(map(str, _to_token_list(cell))) for cell in data['tweet_Stemmed']
]
data.head()

In [None]:
tweet = ' '.join(str(v) for v in data['tweet_Clean'])

In [None]:
tokenize_tweet = word_tokenize(tweet)

In [None]:
tokenize_tweet

In [None]:
fqdist = FreqDist(tokenize_tweet)

In [None]:
fqdist

In [None]:
fqdist.most_common(15)

In [None]:
# Split the tweets into one frame per sentiment label.
label_column = data['label']
positif = data[label_column == 'Positif']
negatif = data[label_column == 'Negatif']
netral = data[label_column == 'Netral']

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Topic words that occur in nearly every tweet; excluded from the cloud.
stop = {'krl', 'rute', 'ubah', 'manggarai'}
# Join tweets with a space. An empty separator glues the last word of one
# tweet to the first word of the next and creates words that do not exist.
positive_text = ' '.join(positif["tweet_Clean"])
wc = WordCloud(width=800, height=800,
               background_color='white',
               min_font_size=10, stopwords=stop).generate_from_text(positive_text)

plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()
#wc.to_file('wordcloudpositif.png')

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Topic words that occur in nearly every tweet; excluded from the cloud.
stop = {'krl', 'rute', 'ubah', 'manggarai'}
# Join tweets with a space. An empty separator glues the last word of one
# tweet to the first word of the next and creates words that do not exist.
negative_text = ' '.join(negatif["tweet_Clean"])
wc = WordCloud(width=800, height=800,
               background_color='white',
               min_font_size=10, stopwords=stop).generate_from_text(negative_text)

plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()
#wc.to_file('wordcloudnegatif.png')