<a href="https://colab.research.google.com/github/ftrnailahhh/Penambangan-Data-Teks/blob/main/19_097_PDT_UAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

#for preprocessing
import regex
import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

#for vektorisasi + split data 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#for classififcation + accuracy
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Load Data**

In [2]:
data = pd.read_excel("IMDB Dataset.xlsx")
data.head()

Unnamed: 0,review,sentiment
0,"Prussic gas, a murderer donning a red clansman...",positive
1,I was very lucky to see this film as part of t...,positive
2,This movie is really bad. Most of it looks lik...,negative
3,I think this is one hell of a movie..............,positive
4,"I saw this movie in the theater, and was thoro...",positive


# **Preprocessing**

In [3]:
data['sentiment'].value_counts()

positive    501
negative    497
Name: sentiment, dtype: int64

In [4]:
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
data.shape

(998, 2)

In [6]:
def cleaning_text(a):
    # Remove Symbol/Characters
    a = regex.sub(r"<[^<]+?>", "", a)
    a = regex.sub(r"[^a-zA-Z0-9\s]", "", a) 
    
    # Change to Lower Alphabet
    a = a.lower()
    
    return a

data["review"] = data["review"].apply(cleaning_text)
data.head()

Unnamed: 0,review,sentiment
0,prussic gas a murderer donning a red clansman ...,positive
1,i was very lucky to see this film as part of t...,positive
2,this movie is really bad most of it looks like...,negative
3,i think this is one hell of a moviewe can see ...,positive
4,i saw this movie in the theater and was thorou...,positive


In [7]:
#stopword in english
stopword_list = stopwords.words('english')
print(stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# Tokenization and Remove StopWords
def stopwords_removal(a):
    # Memisah kata per kata + menghapus spasi string
    tokens = [token.strip() for token in word_tokenize(a)] 
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_a = ' '.join(filtered_tokens)

    return filtered_a

data['review'] = data['review'].apply(stopwords_removal)
data.head()

Unnamed: 0,review,sentiment
0,prussic gas murderer donning red clansman suit...,positive
1,lucky see film part melbourne international fi...,positive
2,movie really bad looks like filmed either park...,negative
3,think one hell moviewe see steven fighting aro...,positive
4,saw movie theater thoroughly impressed claire ...,positive


In [9]:
# mengambil kata dasar
def stemming(a):
    ps = PorterStemmer()
    a= ' '.join([ps.stem(word) for word in a.split()])
    return a

data['review'] = data['review'].apply(stemming)
data.head()

Unnamed: 0,review,sentiment
0,prussic ga murder don red clansman suit hood w...,positive
1,lucki see film part melbourn intern film festi...,positive
2,movi realli bad look like film either park bas...,negative
3,think one hell moview see steven fight around ...,positive
4,saw movi theater thoroughli impress clair dane...,positive


In [10]:
X = data['review']
y = data['sentiment']


# **Vektorisasi**

In [11]:
vect = CountVectorizer()

X = vect.fit_transform(X).astype(np.int8)
print('Vocabulary: ')
print(vect.vocabulary_)
print(X)



Vocabulary: 
{'prussic': 11005, 'ga': 5541, 'murder': 9312, 'don': 4067, 'red': 11341, 'clansman': 2710, 'suit': 13512, 'hood': 6640, 'wield': 15373, 'white': 15339, 'whip': 15330, 'colleg': 2869, 'school': 12097, 'girl': 5733, 'hand': 6176, 'paid': 10141, 'convict': 3107, 'enlist': 4558, 'mysteri': 9352, 'mastermind': 8605, 'keep': 7601, 'face': 4871, 'hidden': 6468, 'within': 15451, 'offic': 9796, 'contain': 3071, 'aquarium': 960, 'turtl': 14447, 'fish': 5167, 'inspector': 7115, 'scotland': 12129, 'yard': 15624, 'higginsijoachim': 6474, 'fuchsbergerand': 5482, 'superior': 13552, 'sir': 12646, 'johnsiegfri': 7465, 'schrenbergcertainli': 12101, 'full': 5489, 'case': 2350, 'seem': 12223, 'center': 2441, 'around': 1022, 'student': 13406, 'ann': 834, 'portlanduschi': 10735, 'gla': 5751, 'turn': 14441, '21': 167, 'inherit': 7067, 'great': 5962, 'deal': 3569, 'wealth': 15201, 'target': 13747, 'share': 12419, 'room': 11794, 'reason': 11298, 'remain': 11450, 'sy': 13655, 'finest': 5135, 'must

# **Split Data** 80%Training 20%Testing

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

In [13]:
# Splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)
def classifier_testing(classification, X_train, X_test, y_train, y_test):
    classification.fit(X_train, y_train)

    y_pred = classification.predict(X_test)

    # Accuracy Score
    classification_accuracy_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score:\n", classification_accuracy_score, "\n")

    # Classification
    class_result= classification_report(y_test, y_pred)
    print("Classification Report:\n", class_result, "\n")

    # Confusion Matrix
    conf_mtx = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_mtx, "\n")

In [14]:
# Initializing NaiveBayes-MultinomialNB Classifier
MNB = MultinomialNB()

classifier_testing(MNB, X_train, X_test, y_train, y_test)

Accuracy Score:
 0.775 

Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.79      0.78       100
    positive       0.78      0.76      0.77       100

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.77       200
weighted avg       0.78      0.78      0.77       200
 

Confusion Matrix:
 [[79 21]
 [24 76]] 

