In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ntust-text-classification/train.csv
/kaggle/input/ntust-text-classification/test.csv


In [2]:
import pandas as pd
import re
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Remove Unicode strings and noise
def remove_unicode_noise(text):
    """Replace every run of non-ASCII characters in ``text`` with one space.

    A run of consecutive non-ASCII characters collapses to a single space;
    ASCII characters (code points 0x00-0x7F) are left untouched.
    """
    non_ascii_run = r'[^\x00-\x7F]+'
    cleaned = re.sub(non_ascii_run, ' ', text)
    return cleaned

# Strip URLs, user mentions, and hashtags
def replace_urls_usermentions_hashtags(text):
    """Delete URLs, ``@mentions`` and ``#hashtags`` from ``text``.

    The patterns are applied one after another in the order listed below;
    each match is replaced by the empty string, so surrounding whitespace
    is left as it was.
    """
    removal_patterns = (
        r"http\S+|www\S+|https\S+",  # URLs
        r"@\w+",                     # user mentions
        r"#\w+",                     # hashtags
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

# Expand slang and abbreviations
def replace_slang_abbreviations(text):
    """Expand known slang/abbreviations in ``text`` to their full phrases.

    Matching is whole-word (``\\b`` on both sides) and case-sensitive, so
    only the exact lowercase forms listed in the mapping are replaced.
    """
    # Extend this slang/abbreviation mapping as needed.
    expansions = {
        "lol": "laugh out loud",
        "brb": "be right back",
        # add more entries here if required
    }
    result = text
    for slang in expansions:
        whole_word = r"\b" + slang + r"\b"
        result = re.sub(whole_word, expansions[slang], result)
    return result

# Expand contractions
def replace_contractions(text):
    """Expand known English contractions in ``text``.

    Matching is whole-word and case-sensitive; only the contractions listed
    in the mapping below are rewritten.
    """
    # Extend this contraction mapping as needed.
    expanded_forms = {
        "ain't": "am not",
        "aren't": "are not",
        # add more entries here if required
    }
    for contraction, expansion in expanded_forms.items():
        pattern = rf"\b{contraction}\b"
        text = re.sub(pattern, expansion, text)
    return text

# Remove numbers
def remove_numbers(text):
    """Delete every run of digits from ``text``; all other characters stay."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Collapse repeated punctuation
def replace_repeated_punctuation(text):
    """Collapse runs of two or more ``!``, ``?`` or ``.`` into one character.

    The run may mix the three characters; it is replaced by the last
    character of the run (the final value captured by the group).
    """
    punctuation_run = r'([!?.]){2,}'
    keep_last_captured = r'\1'
    return re.sub(punctuation_run, keep_last_captured, text)

 
# Remove punctuation
def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters follow the regex ``\\w`` class, so underscores are kept.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

# Handle words that contain capital letters
def handle_capitalized_words(text):
    """Lowercase every whitespace-separated token that is not already lowercase.

    The text is split on whitespace and re-joined with single spaces, so
    leading/trailing whitespace is dropped and inner runs collapse to one
    space. The implementation can be adapted as needed.
    """
    normalized = []
    for token in text.split():
        if not token.islower():
            token = token.lower()
        normalized.append(token)
    return ' '.join(normalized)

# Stopword sets that have already been loaded, keyed by language name. This
# function is applied once per row, so caching avoids re-reading the NLTK
# corpus and rebuilding the set on every call.
_STOPWORDS_CACHE = {}


# Remove stopwords
def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is split on whitespace; a token is removed when its lowercase
    form is in NLTK's English stopword list. Surviving tokens keep their
    original casing and are re-joined with single spaces.

    Raises whatever ``stopwords.words('english')`` raises on the first call
    (for example when the corpus has not been downloaded); nothing is cached
    in that case, so a later call retries the load.
    """
    stop_words = _STOPWORDS_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOPWORDS_CACHE['english'] = stop_words
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Shorten elongated words
def replace_elongated_words(text):
    """Collapse a trailing run of three or more identical characters to one.

    A match must start at a word boundary and the repeated run must end at a
    word boundary, so "sooo" becomes "so" while a run in the middle of a word
    (e.g. "cooool") is left unchanged.
    """
    elongated_tail = re.compile(r'\b(\S*?)(.)\2{2,}\b')
    prefix_plus_single_char = r'\1\2'
    return elongated_tail.sub(prefix_plus_single_char, text)

# Spelling correction
def spelling_correction(text):
    """Return ``text`` after running TextBlob's ``correct()`` on it.

    The implementation can be swapped as needed (e.g. for a library such as
    autocorrect); TextBlob is used here as an example.
    """
    # Imported locally, so textblob is only required when this step is used.
    from textblob import TextBlob
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

# Part-of-speech tagging
def pos_tagging(text):
    """Tokenize ``text`` with NLTK and return the result of ``pos_tag``.

    The implementation can be swapped as needed (e.g. for spaCy); NLTK is
    used here as an example.
    """
    from nltk import pos_tag, word_tokenize
    return pos_tag(word_tokenize(text))

# Lowercase the text
def lowercase_text(text):
    """Return ``text`` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Input / output locations.
TRAIN_CSV = '/kaggle/input/ntust-text-classification/train.csv'
TEST_CSV = '/kaggle/input/ntust-text-classification/test.csv'
SUBMISSION_CSV = 'submission.csv'

# Ordered cleaning steps applied identically to the training and test reviews.
# Keeping a single list guarantees both sets go through the same pipeline.
# Lowercasing runs first because the slang/contraction lookups are
# case-sensitive and keyed on lowercase forms.
PREPROCESSING_STEPS = (
    lowercase_text,
    remove_unicode_noise,
    replace_urls_usermentions_hashtags,
    replace_slang_abbreviations,
    replace_contractions,
    remove_numbers,
    replace_repeated_punctuation,
    remove_punctuation,
    handle_capitalized_words,
    remove_stopwords,
    replace_elongated_words,
    spelling_correction,
)


def preprocess_reviews(reviews):
    """Run every step in PREPROCESSING_STEPS, in order, over a Series of texts.

    Each step is applied element-wise with ``Series.apply``; a new Series is
    returned and the input Series is not modified.
    """
    for step in PREPROCESSING_STEPS:
        reviews = reviews.apply(step)
    return reviews


# Load and preprocess the training set
train_data = pd.read_csv(TRAIN_CSV)
train_data['reviews_content'] = preprocess_reviews(train_data['reviews_content'])

# Feature extraction with TF-IDF (max_features can be tuned)
tfidf_vectorizer = TfidfVectorizer(max_features=60000)
X_train = tfidf_vectorizer.fit_transform(train_data['reviews_content'])
y_train = train_data['category']

# Train the model
svm_model = SVC(kernel='linear', C=1, gamma='auto')
svm_model.fit(X_train, y_train)

# Load and preprocess the test set with the same pipeline as the training set
test_data = pd.read_csv(TEST_CSV)
test_data['reviews_content'] = preprocess_reviews(test_data['reviews_content'])

# Predict on the test set, reusing the vectorizer fitted on the training data
X_test = tfidf_vectorizer.transform(test_data['reviews_content'])
predictions = svm_model.predict(X_test)

# Row ids are 1-based and derived from the number of predictions, so the
# submission stays valid for a test set of any size.
hasil = pd.DataFrame({'Row': range(1, len(predictions) + 1), 'Label': predictions})
hasil.to_csv(SUBMISSION_CSV, index=False)

# Read the submission back from the same path it was written to, and show it
hasil = pd.read_csv(SUBMISSION_CSV)
print(hasil)


     Row     Label
0      1  positive
1      2  positive
2      3  positive
3      4  positive
4      5  negative
..   ...       ...
495  496  positive
496  497  positive
497  498  positive
498  499  positive
499  500  positive

[500 rows x 2 columns]
