In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df1_temp = pd.read_csv('https://raw.githubusercontent.com/ghazafm/SocialMediaSentiment/main/Datasets/labeled%20data/Anies%20Baswedan.csv',encoding = 'unicode_escape')
df_anis = pd.read_csv('https://raw.githubusercontent.com/ghazafm/SocialMediaSentiment/main/Datasets/original%20data/Anies%20Baswedan.csv',encoding = 'unicode_escape')
df2_temp = pd.read_csv('https://raw.githubusercontent.com/ghazafm/SocialMediaSentiment/main/Datasets/labeled%20data/Prabowo%20Subianto.csv',encoding = 'unicode_escape')
df_prabowo = pd.read_csv('https://raw.githubusercontent.com/ghazafm/SocialMediaSentiment/main/Datasets/original%20data/Prabowo%20Subianto.csv',encoding = 'unicode_escape')
df3_temp = pd.read_csv('https://raw.githubusercontent.com/ghazafm/SocialMediaSentiment/main/Datasets/labeled%20data/Ganjar%20Pranowo.csv',encoding = 'unicode_escape')
df_ganjar = pd.read_csv('https://raw.githubusercontent.com/ghazafm/SocialMediaSentiment/main/Datasets/original%20data/Ganjar%20Pranowo.csv',encoding = 'unicode_escape')

In [3]:
df_anis['label'] = df1_temp['label']
df_prabowo['label'] = df2_temp['label']
df_ganjar['label'] = df3_temp['label']

df_anis = df_anis[['Tweet','label']]
df_prabowo = df_prabowo[['Tweet','label']]
df_ganjar = df_ganjar[['Tweet','label']]



In [4]:
df = pd.concat([df_anis,df_prabowo,df_ganjar])

In [5]:
df.head()

Unnamed: 0,Tweet,label
0,info anies presiden,Positive
1,politisi partai gerindra sandiaga uno menjawab...,Positive
2,lanjut pak anies kita kawal sampai jadi presiden,Positive
3,semoga allah swt menyelamatkan bangsa dan nega...,Positive
4,chotimah kasian ya pa anies makanya sudah teka...,Positive


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Fatoni Murfid
[nltk_data]     S\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
print(stopwords.words('indonesian'))

['ada', 'adalah', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 'akankah', 'akhir', 'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 'anda', 'andalah', 'antar', 'antara', 'antaranya', 'apa', 'apaan', 'apabila', 'apakah', 'apalagi', 'apatah', 'artinya', 'asal', 'asalkan', 'atas', 'atau', 'ataukah', 'ataupun', 'awal', 'awalnya', 'bagai', 'bagaikan', 'bagaimana', 'bagaimanakah', 'bagaimanapun', 'bagi', 'bagian', 'bahkan', 'bahwa', 'bahwasanya', 'baik', 'bakal', 'bakalan', 'balik', 'banyak', 'bapak', 'baru', 'bawah', 'beberapa', 'begini', 'beginian', 'beginikah', 'beginilah', 'begitu', 'begitukah', 'begitulah', 'begitupun', 'bekerja', 'belakang', 'belakangan', 'belum', 'belumlah', 'benar', 'benarkah', 'benarlah', 'berada', 'berakhir', 'berakhirlah', 'berakhirnya', 'berapa', 'berapakah', 'berapalah', 'berapapun', 'berarti', 'berawal', 'berbagai', 'berdatangan', 'beri', 'berikan', 'berikut', 'berikutnya', 'berjumlah', 'berkali-kali', 'berkata', 'berkehendak', 'berkeinginan'

In [8]:
df.dropna(inplace=True)

In [9]:
df['label'].value_counts()

label
Positive    21654
Negative     8074
Name: count, dtype: int64

In [10]:
keyword = ['anis', 'anies', 'Anies', 'anies', 'anies baswedan', 'Anies Baswedan', 'prabowo', 'Prabowo', 'Prabowo Subianto', 'prabowo subianto', 'Ganjar', 'ganjar', 'Ganjar Pranowo', 'ganjar pranowo']
for i in keyword:
    df['Tweet'] = df['Tweet'].str.replace(i,'')

In [11]:
df

Unnamed: 0,Tweet,label
0,info presiden,Positive
1,politisi partai gerindra sandiaga uno menjawab...,Positive
2,lanjut pak kita kawal sampai jadi presiden,Positive
3,semoga allah swt menyelamatkan bangsa dan nega...,Positive
4,chotimah kasian ya pa makanya sudah tekad klu...,Positive
...,...,...
9995,rt istdltras ist deltras jatim siap dukung fo...,Positive
9996,artinya pranowo presiden,Positive
9997,semakin banyak rakyat yang siap mendukung unt...,Positive
9998,rt istdltras ist deltras jatim siap dukung fo...,Positive


In [12]:
port_stem = PorterStemmer()

In [13]:
def stemming(content):
    stemmed = re.sub('[^a-z]',' ',content)
    stemmed = stemmed.lower()
    stemmed = stemmed.split()
    stemmed = [port_stem.stem(word) for word in stemmed if not word in stopwords.words('indonesian')]
    stemmed = ' '.join(stemmed)
    return stemmed

In [14]:
df['stemmed'] = df['Tweet'].apply(stemming)

In [15]:
df.head()

Unnamed: 0,Tweet,label,stemmed
0,info presiden,Positive,info presiden
1,politisi partai gerindra sandiaga uno menjawab...,Positive,politisi partai gerindra sandiaga uno diduetka...
2,lanjut pak kita kawal sampai jadi presiden,Positive,kawal presiden
3,semoga allah swt menyelamatkan bangsa dan nega...,Positive,semoga allah swt menyelamatkan bangsa negara r...
4,chotimah kasian ya pa makanya sudah tekad klu...,Positive,chotimah kasian ya pa tekad kluarga pilih pa p...


In [16]:
df['label'].value_counts()

label
Positive    21654
Negative     8074
Name: count, dtype: int64

In [17]:
x =df['stemmed']
y =df['label']

In [18]:
# Undersampling
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
x, y = rus.fit_resample(x.values.reshape(-1,1),y)

# Oversampling
# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(random_state=42)
# x, y = ros.fit_resample(x.values.reshape(-1,1),y)


In [19]:
x = x.flatten()


In [20]:
y.value_counts()

label
Negative    8074
Positive    8074
Name: count, dtype: int64

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, stratify=y, random_state=2)

In [22]:
print(x_train)

['jakartalembaga survei indonesia lsi tren elektabilita subianto mengalami penguatan dibandingkan nama pranowo pemilihan presiden'
 'presiden lanjutin jokowi rakyat mendukung'
 'loyali baswedan mundur komisari ancol jakarta penyebabnya jokowi kesambet firaun kontra kade depok eliez bonsang yati'
 ... 'foto bareng calon presiden yagesya'
 'mak selamat terima kasih membawaku dunia cinta perhatianmu kepadaku selamat'
 'selamat memperingati pahlawan nasion novemb']


In [23]:
print(x.shape, x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(16148,) (12918,) (3230,) (12918,) (3230,)


In [24]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [25]:
print(x_test)

  (0, 15393)	0.3234596971065618
  (0, 13866)	0.3191688938390311
  (0, 12998)	0.272369887271818
  (0, 12641)	0.1445098773426004
  (0, 12041)	0.05854978531557311
  (0, 11994)	0.2630667105580668
  (0, 10120)	0.2817915299754675
  (0, 8453)	0.32126660417078085
  (0, 8024)	0.33338994591250626
  (0, 4322)	0.3161840406819209
  (0, 1482)	0.34772928101117495
  (0, 265)	0.33338994591250626
  (1, 12641)	0.17391416940584445
  (1, 12041)	0.07046326153822459
  (1, 10356)	0.3391293438378589
  (1, 10110)	0.30745949310373644
  (1, 5863)	0.37394056693158717
  (1, 5711)	0.4281822306718293
  (1, 5431)	0.177493708216018
  (1, 1499)	0.3497140883534764
  (1, 460)	0.3211611433766588
  (1, 398)	0.4184838448224355
  (2, 14011)	0.24436002654713607
  (2, 13589)	0.5619035678091604
  (2, 12641)	0.27492550894351575
  :	:
  (3227, 3776)	0.45997338227085377
  (3227, 3745)	0.34639413864401425
  (3227, 964)	0.4666306945658817
  (3228, 15685)	0.1865112010749423
  (3228, 15350)	0.31545216819435495
  (3228, 13377)	0.3554028

# LogisticRegression 
score = 0.8024767801857585

In [26]:
model_logistic = LogisticRegression(max_iter=2000)

In [27]:
model_logistic.fit(x_train,y_train)

In [28]:
# Accuracy score
predict = model_logistic.predict(x_test)
accuracy = accuracy_score(y_test,predict)

In [29]:
print(accuracy)

0.7978328173374613


In [30]:
contoh = ['adapun hal itu disampaikan hashim dalam sambutannya pada acara yang diselenggarakan prabowo mania deklarasikan prabowo subianto presiden di museum joang jakarta minggu',
'presiden indonesia tahun kata dia harus prabowo subianto',
'pasti dan pedofilia akan diberantas prabowo presiden',
'dukungan terhadap prabowo subianto untuk maju sebagai calon presiden pada pemilu datang dari sulawesi utaracapres pemilu dekade terusmajubersamaprabowo mendingprabowo mendingprabowo',
'pemerintah gak becus',
'pemerintah becus',
'prabowo ga gemoy'

]
contoh = vectorizer.transform(contoh)
predict = model_logistic.predict(contoh)
print(predict)

['Negative' 'Negative' 'Negative' 'Positive' 'Negative' 'Negative'
 'Negative']


# Decision Tree
score = 0.7780185758513932

In [31]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()

tree_model = tree_model.fit(x_train, y_train)

In [32]:
from sklearn.metrics import accuracy_score

y_predict = tree_model.predict(x_test)

accuration = accuracy_score(y_predict,y_test)

print(f"Accuracy : {accuration}")

Accuracy : 0.778328173374613


In [33]:
contoh = ['adapun hal itu disampaikan hashim dalam sambutannya pada acara yang diselenggarakan prabowo mania deklarasikan prabowo subianto presiden di museum joang jakarta minggu',
'presiden indonesia tahun kata dia harus prabowo subianto',
'pasti dan pedofilia akan diberantas prabowo presiden',
'dukungan terhadap prabowo subianto untuk maju sebagai calon presiden pada pemilu datang dari sulawesi utaracapres pemilu dekade terusmajubersamaprabowo mendingprabowo mendingprabowo',
'pemerintah gak becus',
'pemerintah becus',
'prabowo ga gemoy'

]
contoh = vectorizer.transform(contoh)
prediksi = tree_model.predict(contoh)
print(prediksi)

['Negative' 'Negative' 'Negative' 'Positive' 'Negative' 'Negative'
 'Negative']


# Random Forest
score = 0.81671826625387

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [35]:
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Akurasi model Random Forest:", accuracy)

Akurasi model Random Forest: 0.8089783281733746


In [36]:
contoh = ['adapun hal itu disampaikan hashim dalam sambutannya pada acara yang diselenggarakan prabowo mania deklarasikan prabowo subianto presiden di museum joang jakarta minggu',
'presiden indonesia tahun kata dia harus prabowo subianto',
'pasti dan pedofilia akan diberantas prabowo presiden',
'dukungan terhadap prabowo subianto untuk maju sebagai calon presiden pada pemilu datang dari sulawesi utaracapres pemilu dekade terusmajubersamaprabowo mendingprabowo mendingprabowo',
'pemerintah gak becus',
'pemerintah becus',
'prabowo ga gemoy'

]
contoh = vectorizer.transform(contoh)
prediksi = random_forest.predict(contoh)
print(prediksi)

['Negative' 'Negative' 'Negative' 'Positive' 'Negative' 'Negative'
 'Negative']


# SVM
score = 0.8099071207430341

In [37]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [38]:
svm_classifier = SVC(kernel='linear', random_state=42)

svm_classifier.fit(x_train, y_train)

y_pred = svm_classifier.predict(x_test)

akurasi = accuracy_score(y_test, y_pred)
print("Akurasi model SVM:", akurasi)



Akurasi model SVM: 0.8095975232198143


In [39]:
contoh = ['adapun hal itu disampaikan hashim dalam sambutannya pada acara yang diselenggarakan prabowo mania deklarasikan prabowo subianto presiden di museum joang jakarta minggu',
'presiden indonesia tahun kata dia harus prabowo subianto',
'pasti dan pedofilia akan diberantas prabowo presiden',
'dukungan terhadap prabowo subianto untuk maju sebagai calon presiden pada pemilu datang dari sulawesi utaracapres pemilu dekade terusmajubersamaprabowo mendingprabowo mendingprabowo',
'pemerintah gak becus',
'pemerintah becus',
'prabowo ga gemoy'

]
contoh = vectorizer.transform(contoh)
prediksi = svm_classifier.predict(contoh)
print(prediksi)

['Negative' 'Negative' 'Negative' 'Positive' 'Negative' 'Negative'
 'Negative']
