In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from yellowbrick.text import TSNEVisualizer
from sklearn.svm import SVC
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, accuracy_score
from sklearn.model_selection import learning_curve
import numpy as np
from Preprocessing import preprocess_data

In [None]:
# Load the tweet dataset from its CSV file (decoded as latin-1).
DATASET_PATH = '../Dataset/Bersih.csv'
tweets_df = pd.read_csv(DATASET_PATH, encoding='latin-1')

In [None]:
# Class balance check: number of rows per distinct value of the Labels column.
label_column = tweets_df['Labels']
label_column.value_counts()

In [None]:
# Encode the string labels as integers: positif -> 0, negatif -> 1.
LABEL_TO_ID = {'positif': 0, 'negatif': 1}

labels_raw = tweets_df['Labels']
# Only encode when the column is not already numeric 0/1. Without this guard,
# re-running the cell would map the already-encoded values to NaN, because
# 0 and 1 are not keys of LABEL_TO_ID.
if not labels_raw.isin(list(LABEL_TO_ID.values())).all():
    labels_encoded = labels_raw.map(LABEL_TO_ID)
    # Series.map turns every value missing from the dict into NaN; fail here
    # with a clear message instead of letting NaN targets reach the classifier.
    unknown_labels = labels_raw[labels_encoded.isna()].unique().tolist()
    if unknown_labels:
        raise ValueError(
            "Unexpected values in 'Labels' column: {}".format(unknown_labels)
        )
    tweets_df['Labels'] = labels_encoded

# Features (tweet text) and integer targets used by the rest of the notebook.
X = tweets_df['Text_bersih']
y = tweets_df['Labels']

In [None]:
# Optional: word clouds of the text in each class.
# Labels were encoded earlier in the notebook as 0 = positif, 1 = negatif.

def show_wordcloud(text, colormap):
    """Render `text` as a word cloud and display it.

    Parameters
    ----------
    text : str
        All documents of one class joined into a single string.
    colormap : str
        Name of the matplotlib colormap used to colour the words.

    Returns
    -------
    WordCloud
        The generated word cloud, so callers can inspect e.g. `.words_`.
    """
    cloud = WordCloud(
        width=1000,
        height=1000,
        colormap=colormap,
        mode='RGBA',
        background_color='white',
    ).generate(text)
    plt.figure(figsize=(20, 10))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)
    plt.show()
    return cloud


# Negative tweets (Labels == 1), drawn in red.
train_s0 = tweets_df[tweets_df["Labels"] == 1]
# dropna(): a missing text value is a float NaN and would make str.join raise.
all_text_s0 = ' '.join(train_s0["Text_bersih"].dropna())
wordcloud = show_wordcloud(all_text_s0, 'Reds')

# Positive tweets (Labels == 0), drawn in blue.
train_s1 = tweets_df[tweets_df["Labels"] == 0]
all_text_s1 = ' '.join(train_s1["Text_bersih"].dropna())
wordcloud = show_wordcloud(all_text_s1, 'Blues')

# Relative word frequencies of the last cloud drawn (the positive class).
wf = wordcloud.words_

In [None]:
# Hold out 10% of the dataset for testing; the remaining 90% is used for training.
# A fixed seed keeps the split identical between runs.
# NOTE(review): the split is not stratified; consider stratify=y if the classes are imbalanced.
TEST_FRACTION = 0.1
SPLIT_SEED = 13
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# TF-IDF feature extraction.
# X_train / X_test are replaced by their TF-IDF matrices below because later
# cells read those names. The raw text is therefore stashed first, but only
# while X_train is still the pandas Series produced by the split. This keeps
# the cell re-runnable: on a second run the vectorizer is fitted on the saved
# text again instead of on the already-transformed sparse matrix.
if isinstance(X_train, pd.Series):
    X_train_text, X_test_text = X_train, X_test

vectorizer = TfidfVectorizer()

# Fit the vocabulary / IDF weights on the training text only, then apply the
# fitted transformation to the held-out test text.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [None]:
# Project the TF-IDF training vectors to 2-D with t-SNE and colour them by label.
# t-SNE is stochastic; a fixed random_state makes the figure reproducible
# between runs (same seed as the train/test split).
tsne = TSNEVisualizer(random_state=13)
tsne.fit(X_train, y_train)
tsne.show()

In [None]:
# Train the linear-kernel SVM classifier on the TF-IDF training matrix.
# The original also passed gamma=1, but gamma is a kernel coefficient for the
# rbf / poly / sigmoid kernels only and is ignored by kernel="linear", so it
# has been removed to avoid suggesting it is a tuned hyper-parameter.
linear = SVC(kernel="linear", C=1)
linear.fit(X_train, y_train)

# Predicted labels for the held-out test set (0 = positif, 1 = negatif).
linear1 = linear.predict(X_test)

In [None]:
# Report F1, accuracy, precision and recall of the test-set predictions.
# Each entry pairs the message to print with the metric function that
# produces the value; the order matches the printed report.
metric_reports = (
    ("F1 score hasil prediksi adalah : ", f1_score),
    ("Accuracy score hasil prediksi adalah : ", accuracy_score),
    ("Precision score hasil prediksi adalah : ", precision_score),
    ("Recall score hasil prediksi adalah : ", recall_score),
)
for message, metric in metric_reports:
    print(message, metric(y_test, linear1))
# Trailing blank lines separate this report from the next output.
print('\n')

In [None]:
# Confusion matrix of the test-set predictions.
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from both
# y_test and the predictions; without it the four-way unpack below would fail
# on a 1x1 matrix. With this label order, ravel() yields tn, fp, fn, tp where
# "positive" means label 1 (negatif).
tn, fp, fn, tp = confusion_matrix(y_test, linear1, labels=[0, 1]).ravel()
# Correctly predicted label-1 samples
print("True Positive : ", tp)
# Correctly predicted label-0 samples
print("True Negative : ", tn)
# Label-0 samples predicted as label 1
print("False Positive : ", fp)
# Label-1 samples predicted as label 0
print("False Negative  : ", fn)
print('\n')

In [None]:
# Learning curve of the linear SVM.
# Cross-validated (20 folds) accuracy on 50 training-set sizes spread evenly
# from 1% to 100% of the available training data, computed on all CPU cores.
# gamma is omitted: it is ignored by the linear kernel.
train_sizes, train_scores, test_scores = learning_curve(
    SVC(kernel="linear", C=1),
    X_train,
    y_train,
    cv=20,
    scoring='accuracy',
    n_jobs=-1,
    train_sizes=np.linspace(0.01, 1.0, 50),
)

# Mean and standard deviation across folds, one value per training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Mean curves: dashed = training score, solid = cross-validation score.
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded bands of +/- one standard deviation around each mean curve.
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

# Titles, axis labels and legend, one call per statement.
plt.title("Learning Curve SVM Kernel Linear")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
# Hand-written example tweets used to sanity-check the trained classifier.
# Example expected to be classified as negatif (roughly: "biznet keeps having problems").
review_negatif = "biznet masalah mulu"
# Example expected to be classified as positif (roughly: "just switch to biznet instead of indihome").
review_positif = "ganti biznet aja dari pada indihome"

In [None]:
# Turn the positive example into a TF-IDF row: run it through the project's
# preprocess_data helper, then through the already-fitted vectorizer
# (which expects a list of documents, hence the one-element list).
# The classifier outputs [0] for positif and [1] for negatif.
# NOTE(review): review_positif is overwritten with the vectorized form, so
# re-run the "Example Term" cell before running this cell a second time.
review_positif = vectorizer.transform([preprocess_data(review_positif)])

In [None]:
# Predicted class of the positive example ([0] = positif, [1] = negatif).
prediction_positif = linear.predict(review_positif)
prediction_positif

In [None]:
# Turn the negative example into a TF-IDF row: run it through the project's
# preprocess_data helper, then through the already-fitted vectorizer
# (which expects a list of documents, hence the one-element list).
# The classifier outputs [0] for positif and [1] for negatif.
# NOTE(review): review_negatif is overwritten with the vectorized form, so
# re-run the "Example Term" cell before running this cell a second time.
review_negatif = vectorizer.transform([preprocess_data(review_negatif)])

In [None]:
# Predicted class of the negative example ([0] = positif, [1] = negatif).
prediction_negatif = linear.predict(review_negatif)
prediction_negatif