In [None]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

# text processing
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK resources requested by this notebook: the stop-word lists,
# WordNet, and the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object
# to a plain set of English stop words; later cells use the set for membership tests.
stopwords = set(stopwords.words('english'))



# sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# utils
import os
from tqdm import tqdm
tqdm.pandas()
from collections import Counter

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# # import file csv
# from google.colab import files
# df = files.upload()

In [None]:
import pandas as pd
# Read the data from the CSV file.
# NOTE(review): the displayed header is 'text;;;;' and the first row ends in ';;;;',
# which suggests the file carries stray trailing semicolons — confirm the delimiter.
df = pd.read_csv('/content/ctweet_dataset.csv', delimiter='\t')

# Show the first five rows of the DataFrame
df.head()

Unnamed: 0,text;;;;
0,"Grap family keep safe,;;;;"
1,Key insights effect COVID - consumer behavior:...
2,Chemists selling Masks Sanitizers damn high pr...
3,Oil Prices Jump Over % After Top Producers Agr...
4,Worldwide fuel consumption roughly % COVID- pa...


In [None]:
# The raw header is 'text;;;;'; give the column its plain name 'text'.
df = df.rename(columns={'text;;;;': 'text'})
df.head()

Unnamed: 0,text
0,"Grap family keep safe,;;;;"
1,Key insights effect COVID - consumer behavior:...
2,Chemists selling Masks Sanitizers damn high pr...
3,Oil Prices Jump Over % After Top Producers Agr...
4,Worldwide fuel consumption roughly % COVID- pa...


# Text Cleaning

1. Removing links
2. Removing punctuation
3. Removing HTML tags
4. Removing numbers
5. Collapsing consecutive whitespace
6. Removing non-ASCII characters
7. Removing emoji
8. Collapsing repeated characters (runs of three or more become two)
9. Lowercasing

In [None]:
def rm_link(text):
    """Return ``text`` with URLs removed.

    A URL is anything starting with ``http://`` / ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def rm_punct2(text):
    """Return ``text`` with each ASCII punctuation character replaced by one space."""
    punct_pattern = re.compile(
        r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]'
    )
    # Substituting a space (not an empty string) keeps neighbouring words apart.
    return punct_pattern.sub(' ', text)

def rm_html(text):
    """Return ``text`` with every ``<...>`` tag-like span deleted."""
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

def rm_number(text):
    """Return ``text`` with all runs of digits deleted."""
    digits_pattern = re.compile(r'\d+')
    return digits_pattern.sub('', text)

def rm_whitespaces(text):
    """Return ``text`` with each run of space characters squeezed to one space.

    Only the space character is affected; tabs and newlines are left as they are.
    """
    space_run = re.compile(r' +')
    return space_run.sub(' ', text)

def rm_nonascii(text):
    """Return ``text`` with every character outside the 0x00-0x7f range deleted."""
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

def rm_emoji(text):
    """Return ``text`` with characters from the listed emoji code-point ranges deleted.

    Note: the last range (U+24C2 to U+1F251) is very wide and also covers many
    non-emoji characters, for example CJK ideographs.
    """
    emoji_ranges = (
        u'\U0001F600-\U0001F64F',  # emoticons
        u'\U0001F300-\U0001F5FF',  # symbols & pictographs
        u'\U0001F680-\U0001F6FF',  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF',  # flags (iOS)
        u'\U00002702-\U000027B0',
        u'\U000024C2-\U0001F251',
    )
    # Assemble one character class out of all ranges and match runs of it.
    emoji_pattern = re.compile('[' + ''.join(emoji_ranges) + ']+', flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def spell_correction(text):
    """Return ``text`` with every run of a repeated character shortened to two.

    For example ``'sooooo'`` becomes ``'soo'``; a run of exactly two is unchanged.
    """
    repeated_char = re.compile(r'(.)\1+')
    return repeated_char.sub(r'\1\1', text)

def clean_pipeline(text):
    """Run every cleaning step on ``text`` and return the lower-cased result.

    Steps, in order: remove links, remove HTML tags, replace punctuation with
    spaces, remove digits, remove non-ASCII characters, remove emoji, shorten
    repeated characters, collapse runs of spaces, strip, lower-case.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Cleaned, lower-cased text without leading/trailing whitespace and
        without consecutive spaces.
    """
    no_link = rm_link(text)
    no_html = rm_html(no_link)
    no_punct = rm_punct2(no_html)
    no_number = rm_number(no_punct)
    no_nonasci = rm_nonascii(no_number)
    no_emoji = rm_emoji(no_nonasci)
    spell_corrected = spell_correction(no_emoji)
    # Collapse spaces last: the removals above can leave gaps ("a é b" -> "a  b"),
    # and spell_correction only shortens runs of spaces to two, not one.
    no_whitespaces = rm_whitespaces(spell_corrected)
    # Punctuation at either end of the text turns into a stray space; drop it.
    clean_lowered = no_whitespaces.strip().lower()
    return clean_lowered

# Text Preprocessing

In [None]:
# preprocessing
def tokenize(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def rm_stopwords(text):
    """Return the tokens of ``text`` that are not in the module-level ``stopwords`` set.

    ``text`` is an iterable of tokens; the relative order of kept tokens is preserved.
    """
    kept_tokens = []
    for token in text:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def preprocess_pipeline(text):
    """Tokenize ``text``, drop stop words, and join the rest with single spaces."""
    return ' '.join(rm_stopwords(tokenize(text)))

In [None]:
# Clean the raw text first, then tokenize and drop stop words from the cleaned text.
# progress_apply is the tqdm-enabled variant of apply (shows a progress bar).
df['cleaned'] = df['text'].progress_apply(clean_pipeline)
df['preprocessed'] = df['cleaned'].progress_apply(preprocess_pipeline)
df.head()

100%|██████████| 8030/8030 [00:00<00:00, 44445.90it/s]
100%|██████████| 8030/8030 [00:00<00:00, 9329.28it/s]


Unnamed: 0,text,cleaned,preprocessed
0,"Grap family keep safe,;;;;",grap family keep safe,grap family keep safe
1,Key insights effect COVID - consumer behavior:...,key insights effect covid consumer behavior,key insights effect covid consumer behavior
2,Chemists selling Masks Sanitizers damn high pr...,chemists selling masks sanitizers damn high pr...,chemists selling masks sanitizers damn high pr...
3,Oil Prices Jump Over % After Top Producers Agr...,oil prices jump over after top producers agree...,oil prices jump top producers agree output cut...
4,Worldwide fuel consumption roughly % COVID- pa...,worldwide fuel consumption roughly covid pande...,worldwide fuel consumption roughly covid pande...


In [None]:
def tokenizing_each_sentence(df,column):
    """Tokenize every entry of ``df[column]`` with ``word_tokenize``.

    Returns a list with one token list per entry, in the same order as the column.
    """
    return [word_tokenize(sentence) for sentence in df[column]]

In [None]:
# `tests` holds one list of tokens per preprocessed row; it is the corpus used
# by both the LSTM sequence builder and the n-gram counts below.
tests = tokenizing_each_sentence(df,'preprocessed')

In [None]:
# Fit the Keras tokenizer on the preprocessed text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['preprocessed'])
# One extra slot on top of the fitted vocabulary (word indices start at 1).
total_words = len(tokenizer.word_index) + 1

# Turn every sentence into all of its prefixes of length >= 2:
# the last id of each prefix is the label, the ids before it are the input.
input_sequences = []
for sentence in tests:
    token_list = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Left-pad every prefix to the length of the longest one.
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')
)

# Split into predictors (all but the last id) and one-hot labels (the last id).
predictors = input_sequences[:, :-1]
label = input_sequences[:, -1]
label = to_categorical(label, num_classes=total_words)

In [None]:
# Build the next-word model: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length-1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train on every (prefix, next word) pair.
model.fit(predictors, label, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7e5024bcd060>

In [None]:
# Function to predict next word
def predict_next_word(seed_text):
    """Return the word the model considers most likely to follow ``seed_text``.

    Parameters
    ----------
    seed_text : str
        Text to continue. It is converted to ids with the fitted ``tokenizer``
        and left-padded/truncated to the model's input length.

    Returns
    -------
    str
        The vocabulary word with the highest predicted probability.

    Raises
    ------
    ValueError
        If the most likely class has no word in the tokenizer's vocabulary
        (for example class 0, which ``pad_sequences`` uses for padding).
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
    # model.predict returns one probability row per input; we passed exactly one.
    predicted_index = int(np.argmax(model.predict(padded), axis=-1)[0])
    # index_word is the tokenizer's own id -> word map, so no linear scan
    # over word_index is needed.
    word = tokenizer.index_word.get(predicted_index)
    if word is None:
        raise ValueError(
            f"Predicted class {predicted_index} does not map to a vocabulary word"
        )
    return word

In [None]:
input_kata = "i want to"
seed_text = input_kata

# Predict three words one at a time. Each predicted word is appended to the
# running seed so the next prediction is conditioned on the prompt plus
# everything generated so far (not on the generated words alone).
pred_kata = []
for _ in range(3):
    next_word = predict_next_word(seed_text)
    pred_kata.append(next_word)
    seed_text = f"{seed_text} {next_word}"

# Full text: the original prompt followed by the generated words.
predicted_text = seed_text

print(f"The predicted next 3 words for '{input_kata}' are: {pred_kata}")


The predicted next 3 words for 'i want to' are: ['go', 'school', 'today']


EVALUASI PERFORMA

dengan PERPLEXITY

In [None]:
from math import exp
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets.
# NOTE: `model` was already fitted on all of `predictors`/`label` in an earlier cell,
# so X_val/y_val are samples the model has seen during training.
X_train, X_val, y_train, y_val = train_test_split(predictors, label, test_size=0.2, random_state=42)

# Compute perplexity on the validation data
def calculate_perplexity(model, X, y):
    """Return the perplexity of ``model`` on the samples ``(X, y)``.

    Perplexity is ``exp`` of the mean loss reported by ``model.evaluate``.
    The loss is obtained with a single ``evaluate`` call over the whole set
    instead of one call per sample.

    Parameters
    ----------
    model : object
        Model exposing ``evaluate(X, y, verbose=0)`` whose first returned
        value is the loss averaged over the samples.
    X, y : sequence
        Inputs and targets; ``len(X)`` must be greater than zero.

    Returns
    -------
    float
        ``exp(mean loss)``.

    Raises
    ------
    ValueError
        If ``X`` is empty.
    """
    num_samples = len(X)
    if num_samples == 0:
        raise ValueError("Cannot compute perplexity on an empty dataset")

    # evaluate() returns [loss, *metrics]; the loss is already the sample mean.
    mean_loss = model.evaluate(X, y, verbose=0)[0]
    perplexity = exp(mean_loss)

    return perplexity

# Evaluate the trained model on the held-out split and report its perplexity.
perplexity_val = calculate_perplexity(model, X_val, y_val)
print(f"Perplexity on validation data: {perplexity_val}")


Perplexity on validation data: 2.0107801778682877


AUTOCOMPLETE DENGAN SIMPLE PROBABILITY (N-GRAM)

In [None]:
def count_words(tokenized_sentences):
    """Count how often each token occurs across all sentences.

    ``tokenized_sentences`` is an iterable of token sequences. Returns a plain
    dict mapping token -> count, keyed in order of first appearance.
    """
    tally = Counter(
        token
        for sentence in tokenized_sentences
        for token in sentence
    )
    return dict(tally)

In [None]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """Return the words that occur at least ``count_threshold`` times.

    Words are returned as a list, in the order ``count_words`` first saw them.
    """
    frequencies = count_words(tokenized_sentences)
    return [
        word
        for word, occurrences in frequencies.items()
        if occurrences >= count_threshold
    ]

# Closed vocabulary: every token that appears at least twice in the corpus.
vocabulary = get_words_with_nplus_frequency(tests, count_threshold=2)

In [None]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """Return the words whose corpus count is ``count_threshold`` or more.

    NOTE: this cell defines the same function, with the same behaviour, as the
    cell directly before it; one of the two definitions is redundant.
    """
    frequent_words = []
    for word, occurrences in count_words(tokenized_sentences).items():
        if occurrences < count_threshold:
            continue
        frequent_words.append(word)
    return frequent_words

In [None]:
# Recomputes `vocabulary` with the same call as the earlier cell (threshold 2).
vocabulary = get_words_with_nplus_frequency(tests, count_threshold=2)

In [None]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Add-k smoothed estimate of P(word | previous_n_gram).

    Computes ``(count(previous_n_gram + word) + k) /
    (count(previous_n_gram) + k * vocabulary_size)``; n-grams missing from
    the count tables are treated as having count 0.
    """
    context = tuple(previous_n_gram)
    smoothed_hits = n_plus1_gram_counts.get(context + (word,), 0) + k
    smoothed_total = n_gram_counts.get(context, 0) + k * vocabulary_size
    return smoothed_hits / smoothed_total

In [None]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """Map every candidate next word to its add-k smoothed probability.

    The candidates are ``vocabulary`` plus the special tokens ``'<e>'`` and
    ``'<unk>'``; the smoothing vocabulary size is the number of candidates.
    The caller's ``vocabulary`` list is not modified.
    """
    context = tuple(previous_n_gram)
    candidates = vocabulary + ['<e>', '<unk>']
    candidate_count = len(candidates)

    return {
        candidate: estimate_probability(candidate, context,
                                        n_gram_counts, n_plus1_gram_counts,
                                        candidate_count, k=k)
        for candidate in candidates
    }


In [None]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None, start_token='<s>'):
    """Suggest the most probable next word after ``previous_tokens``.

    Parameters
    ----------
    previous_tokens : list
        Tokens typed so far (may be shorter than the n-gram order, or empty).
    n_gram_counts : dict
        Counts of n-grams (tuple -> int); its key length defines ``n``.
    n_plus1_gram_counts : dict
        Counts of (n+1)-grams.
    vocabulary : list
        Candidate words (``'<e>'`` and ``'<unk>'`` are added by the estimator).
    k : float
        Add-k smoothing constant.
    start_with : str or None
        If given, only words starting with this prefix are considered.
    start_token : str
        Token used to pad a context shorter than ``n``; it should match the
        start token used when the count tables were built.

    Returns
    -------
    tuple
        ``(word, probability)``; ``(None, 0)`` when no candidate qualifies.

    Raises
    ------
    ValueError
        If ``n_gram_counts`` is empty, so the n-gram order cannot be inferred.
    """
    if not n_gram_counts:
        raise ValueError("n_gram_counts is empty; cannot infer the n-gram order")
    # Read the order from any one key without materialising every key.
    n = len(next(iter(n_gram_counts)))

    # Pad on the left so a short (or empty) context still forms a full n-gram
    # that can match the start-padded n-grams in the count tables.
    padded_tokens = [start_token] * n + list(previous_tokens)
    previous_n_gram = padded_tokens[-n:]

    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)
    suggestion = None
    max_prob = 0

    for word, prob in probabilities.items():
        if start_with and not word.startswith(start_with):
            continue

        # Strict '>' keeps the first word seen among equally probable candidates.
        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob


In [None]:
def count_n_grams(data, n, start_token='<s>', end_token='<e>'):
    """Count every n-gram in ``data``.

    Each sentence (a list of tokens) is padded with ``n`` start tokens on the
    left and one end token on the right before its n-grams are extracted.
    Returns a dict mapping n-gram tuple -> number of occurrences.
    """
    counts = {}

    for tokens in data:
        padded = tuple([start_token] * n + tokens + [end_token])
        last_start = len(padded) - n

        for start in range(last_start + 1):
            window = padded[start : start + n]
            counts[window] = counts.get(window, 0) + 1

    return counts


In [None]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    """Collect one suggestion per pair of consecutive count tables.

    ``n_gram_counts_list[i]`` and ``n_gram_counts_list[i + 1]`` are passed to
    ``suggest_a_word`` as the n-gram and (n+1)-gram counts. Returns the list of
    ``(word, probability)`` results, one per pair, in list order.
    """
    consecutive_tables = zip(n_gram_counts_list, n_gram_counts_list[1:])
    return [
        suggest_a_word(previous_tokens, lower_order_counts,
                       higher_order_counts, vocabulary,
                       k=k, start_with=start_with)
        for lower_order_counts, higher_order_counts in consecutive_tables
    ]

In [None]:
# Build count tables for 1-grams through 5-grams over the tokenized corpus;
# position i of the list holds the (i+1)-gram counts.
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(tests, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [None]:
# Ask every consecutive pair of count tables for its best next word after the
# example context; the result is one (word, probability) tuple per pair.
previous_tokens = ["i", "like" ,"political" ,"cartoon"]
suggest = get_suggestions(previous_tokens,n_gram_counts_list,vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(suggest)

The previous words are ['i', 'like', 'political', 'cartoon'], the suggestions are:


[('family', 0.000351493848857645),
 ('family', 0.000351493848857645),
 ('family', 0.000351493848857645),
 ('family', 0.000351493848857645)]

In [None]:
# suggest[0] is the (word, probability) pair from the first pair of count
# tables (1-gram context); take its word and append it to the typed tokens.
highest_probability_word = suggest[0][0]
autocomplete = previous_tokens + [highest_probability_word]
print(autocomplete)

['i', 'like', 'political', 'cartoon', 'family']


In [None]:
# Show the completed token list as a single space-separated sentence.
print(' '.join(autocomplete))

i like political cartoon family


evaluasi


In [None]:
import math

def calculate_perplexity(test_data, n_gram_counts_list, vocabulary, k=1.0, context_size=2):
    """Return the perplexity of an add-k smoothed n-gram model on ``test_data``.

    Every token of each sentence (and the closing ``'<e>'``) is scored given
    the ``context_size`` tokens before it. Sentences are padded on the left
    with ``context_size`` ``'<s>'`` tokens so the first words have a full
    context. Words outside the vocabulary are scored as ``'<unk>'``.

    NOTE: this definition reuses the name of the earlier
    ``calculate_perplexity(model, X, y)`` cell and replaces it once this cell runs.

    Parameters
    ----------
    test_data : list of list of str
        Tokenized sentences to score.
    n_gram_counts_list : list of dict
        Count tables where position ``i`` holds the ``(i + 1)``-gram counts.
    vocabulary : list of str
        Closed vocabulary (``'<e>'`` and ``'<unk>'`` are added internally).
    k : float
        Add-k smoothing constant.
    context_size : int
        Number of preceding tokens used as context; the tables at positions
        ``context_size - 1`` and ``context_size`` are used.

    Returns
    -------
    float
        ``2 ** (-mean log2 probability)`` over all scored tokens.

    Raises
    ------
    ValueError
        If ``test_data`` contains no sentences.
    """
    # Same candidate set and size that estimate_probabilities uses.
    known_words = set(vocabulary) | {"<e>", "<unk>"}
    vocabulary_size = len(vocabulary) + 2

    n_gram_counts = n_gram_counts_list[context_size - 1]
    n_plus1_gram_counts = n_gram_counts_list[context_size]

    total_log_prob = 0.0
    total_words = 0

    for sentence in test_data:
        padded = ["<s>"] * context_size + list(sentence) + ["<e>"]

        for position in range(context_size, len(padded)):
            # The context is strictly the tokens BEFORE the word being scored.
            context = padded[position - context_size : position]
            word = padded[position]
            if word not in known_words:
                word = "<unk>"

            word_probability = estimate_probability(word, context,
                                                    n_gram_counts, n_plus1_gram_counts,
                                                    vocabulary_size, k=k)

            total_log_prob += math.log2(word_probability)
            total_words += 1

    if total_words == 0:
        raise ValueError("Cannot compute perplexity on empty test data")

    perplexity = 2 ** (-total_log_prob / total_words)
    return perplexity

# Example usage: score one hand-written tokenized sentence with the n-gram model.
test_data = [["i", "like", "political", "cartoon", "family"]]
perplexity = calculate_perplexity(test_data, n_gram_counts_list, vocabulary, k=1.0)
print(f"Perplexity: {perplexity}")


Perplexity: 2847.6620719322145
