In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
df = pd.read_csv('Twitter_Emotion_Dataset.csv')

In [3]:
df['tweet_length'] = df['tweet'].apply(lambda x :len(x))

In [4]:
df.head()

Unnamed: 0,label,tweet,tweet_length
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...",220
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...",235
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,116
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...",250
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...",203


In [5]:
def clean_text(x):
    letter_only = re.sub('(^a-zA-Z)',' ', x )
    return ' '.join(letter_only.split()).lower()

In [6]:
df['tweet_text'] = df['tweet'].apply(clean_text)

In [7]:
df['tweet_text_length'] = df['tweet_text'].apply(lambda x : len(x))

In [8]:
df.head()

Unnamed: 0,label,tweet,tweet_length,tweet_text,tweet_text_length
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...",220,"soal jln jatibaru,polisi tdk bs gertak gubernu...",220
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...",235,"sesama cewe lho (kayaknya), harusnya bisa lebi...",235
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,116,kepingin gudeg mbarek bu hj. amad foto dari go...,116
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...",250,"jln jatibaru,bagian dari wilayah tn abang.peng...",250
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...",203,"sharing pengalaman aja, kemarin jam 18.00 bata...",202


In [9]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [10]:
count_vectorizer_transformer = CountVectorizer().fit(df['tweet_text'])

In [11]:
count_vectorizer_res = count_vectorizer_transformer.transform(df['tweet_text'])

In [12]:
pd.DataFrame(count_vectorizer_res.todense(), columns = count_vectorizer_transformer.get_feature_names())

Unnamed: 0,00,000,01,011060039617518,013150189591518,02,02122302243,0281_bobotoh,03,05,...,zohri,zon,zona,zonasi,zone,zonk,zonkies,zonknya,zuhri,zulkifli
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
tf_idf_transformer = TfidfTransformer().fit(count_vectorizer_res)
tf_idf_res = tf_idf_transformer.transform(count_vectorizer_res)

In [14]:
df_tf_idf = pd.DataFrame(tf_idf_res.todense(), columns = count_vectorizer_transformer.get_feature_names())

In [15]:
df_tf_idf.iloc[0].sort_values(ascending = False).head(3)

polisi      0.309771
gubernur    0.307500
soal        0.270441
Name: 0, dtype: float64

In [16]:
listMeaning = []
for item in range(len(df)):
    mean_ing = list(df_tf_idf.iloc[item].sort_values(ascending = False).head(3).index)
    listMeaning.append(mean_ing)

In [17]:
df['meaningful_words'] = listMeaning

In [18]:
df.head()

Unnamed: 0,label,tweet,tweet_length,tweet_text,tweet_text_length,meaningful_words
0,anger,"Soal jln Jatibaru,polisi tdk bs GERTAK gubernu...",220,"soal jln jatibaru,polisi tdk bs gertak gubernu...",220,"[polisi, gubernur, soal]"
1,anger,"Sesama cewe lho (kayaknya), harusnya bisa lebi...",235,"sesama cewe lho (kayaknya), harusnya bisa lebi...",235,"[rasain, wajarlah, paniknya]"
2,happy,Kepingin gudeg mbarek Bu hj. Amad Foto dari go...,116,kepingin gudeg mbarek bu hj. amad foto dari go...,116,"[teman, mbarek, membayangkannya]"
3,anger,"Jln Jatibaru,bagian dari wilayah Tn Abang.Peng...",250,"jln jatibaru,bagian dari wilayah tn abang.peng...",250,"[wilayah, agr, bermnfaat]"
4,happy,"Sharing pengalaman aja, kemarin jam 18.00 bata...",203,"sharing pengalaman aja, kemarin jam 18.00 bata...",202,"[jam, twips, menitan]"


In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [20]:
X_train,X_test,y_train,y_test = train_test_split(tf_idf_res, df['label'], random_state = 101)

In [21]:
model_nlb = MultinomialNB().fit(X_train,y_train)
prediction = model_nlb.predict(X_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

       anger       0.56      0.82      0.67       284
        fear       0.96      0.14      0.24       170
       happy       0.70      0.43      0.53       269
        love       0.90      0.24      0.38       154
     sadness       0.33      0.68      0.45       224

    accuracy                           0.51      1101
   macro avg       0.69      0.46      0.45      1101
weighted avg       0.66      0.51      0.48      1101



In [22]:
mes = 'Lu gimana Sih ?'
cv = count_vectorizer_transformer.transform([mes])
tf = tf_idf_transformer.transform(cv)
model_nlb.predict(tf)

array(['anger'], dtype='<U7')

In [23]:
mes = 'Bawel lu pada !'
cv = count_vectorizer_transformer.transform([mes])
tf = tf_idf_transformer.transform(cv)
model_nlb.predict(tf)

array(['anger'], dtype='<U7')

In [24]:
mes = 'bandung hawanya panas banget ya sekarang..'
cv = count_vectorizer_transformer.transform([mes])
tf = tf_idf_transformer.transform(cv)
model_nlb.predict(tf)

array(['sadness'], dtype='<U7')

In [29]:
mes = 'gue berhasil !'
cv = count_vectorizer_transformer.transform([mes])
tf = tf_idf_transformer.transform(cv)
model_nlb.predict(tf)

array(['sadness'], dtype='<U7')