# Project 3 - Emotion Tweet Classification

### Grup 5 - AI 2

Anggota Kelompok: 
* Diah Ayu Setyaningsih
* I Nyoman Warsana
* Iman Santoso

## Training Data

In [84]:
import pandas as pd
import numpy as np

In [85]:
data_train = pd.read_csv('2018-E-c-En-train.txt', sep='\t')
data_train.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0


### Data Preprocessing

In [86]:
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

tokenizer = WordPunctTokenizer()
mention_pattern = r'@[\w]+'
http_pattern = r'(http|https)?://[^ ]+'
www_pattern = r'www.[^ ]+'
pattern = r'|'.join((mention_pattern, http_pattern))
stop_words = set(stopwords.words("english"))

# sub tweet_cleaner
def tweet_cleaner(tweet):
  soup = BeautifulSoup(tweet, 'lxml')
  text = soup.get_text()
  sub = re.sub(pattern, '', text)
  try:
    clean_text = sub.decode("utf-8-sig").replace(u"\ufffd", "?")
  except:
    clean_text = sub
  letter_text = re.sub('[^\w]', ' ', clean_text)
  letter_text = re.sub(www_pattern, '', letter_text)
  letter_lc = letter_text.lower()
  word_list = tokenizer.tokenize(letter_lc)
  clean_word_list = word_list[:]
  for word in clean_word_list:
    if word in stop_words:
      clean_word_list.remove(word)
  word = (" ".join(clean_word_list)).strip()
  return word

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
# Cleaning train tweet
clean_tweet_train = []
clean_train_df = data_train[:]
for i in range(len(data_train.index)):
  clean_tweet_train.append(tweet_cleaner(data_train['Tweet'][i]))
clean_train_df['Tweet'] = clean_tweet_train
clean_train_df.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,worry a payment a problem may never joyce meye...,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,whatever decide do make sure makes happy,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,also helps the majority nfl coaching inept of ...,1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,accept challenges that can literally even feel...,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,roommate s okay we t spell we autocorrect terr...,1,0,1,0,0,0,0,0,0,0,0


### Feature Extraction

In [88]:
from sklearn.feature_extraction.text import CountVectorizer

In [89]:
tweet = clean_train_df['Tweet']

In [90]:
count_vect = CountVectorizer()
messages_bow = count_vect.fit_transform(tweet)

In [91]:
from sklearn.feature_extraction.text import TfidfTransformer

In [92]:
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

In [93]:
col = clean_train_df.columns
x = messages_tfidf
y = clean_train_df[col[2:]]
y.head()

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,0,1,0,0,0,0,1,0,0,0,1
1,0,0,0,0,1,1,1,0,0,0,0
2,1,0,1,0,1,0,1,0,0,0,0
3,0,0,0,0,1,0,1,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0


### Training Model

In [94]:
from sklearn.model_selection import train_test_split

In [95]:
text_train, text_test, output_train, output_test = train_test_split(x, y, 
                                                                    train_size=0.75, 
                                                                    test_size=0.25, 
                                                                    random_state=0)

In [96]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [97]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.externals import joblib

In [98]:
clf = [MultinomialNB(), SVC(), LogisticRegression(), RandomForestClassifier(), DecisionTreeClassifier()]
df_accuracy = {}
best_clf = {}
for i in col[2:]:
  best_accuracy = 0
  df_accuracy[i] = []
  for classifier in clf:
    model = classifier
    model.fit(text_train, output_train[i])
    hasil = model.predict(text_test)
    score = accuracy_score(output_test[i],hasil)
    df_accuracy[i].append(score)
    joblib_file = i + "_model.pkl"
    if best_accuracy < score:
      best_accuracy = score
      best_clf.update({i:classifier})
      print('Best Model {}: '.format(i), model)
      joblib.dump(model, joblib_file)
    else:
      best_clf[i] = classifier

Best Model anger:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model anger:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model anger:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Model anticipation:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model disgust:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model disgust:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model fear:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model fear:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model fear:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Model fear:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            m



Best Model joy:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model love:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model love:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model optimism:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model optimism:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model pessimism:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model pessimism:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model sadness:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model sadness:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model sadness:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Model surprise:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




Best Model surprise:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Best Model trust:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)




### Model Evaluation

In [99]:
df_accuracy = pd.DataFrame(df_accuracy)
method = [str(classifier).split("(")[0] for classifier in clf]
df_accuracy.rename(index={i:method[i] for i in range(len(clf))}, inplace=True)
df_accuracy

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
MultinomialNB,0.735673,0.852047,0.715205,0.819298,0.760819,0.893567,0.739181,0.880117,0.735088,0.945029,0.947368
SVC,0.62807,0.852047,0.623977,0.810526,0.646784,0.892982,0.710526,0.880117,0.705848,0.945029,0.947368
LogisticRegression,0.774854,0.852047,0.740936,0.845614,0.799415,0.898246,0.762573,0.881287,0.762573,0.946199,0.947368
RandomForestClassifier,0.776608,0.842105,0.718129,0.884795,0.78655,0.897661,0.756725,0.877193,0.781287,0.944444,0.942105
DecisionTreeClassifier,0.745029,0.798246,0.678947,0.88655,0.75848,0.875439,0.729825,0.849708,0.735673,0.92924,0.911696


## Test Data

In [100]:
data_test = pd.read_csv('2018-E-c-En-test.txt', sep='\t')

### Data Preprocessing

In [101]:
# Cleaning test tweet
clean_tweet_test = []
clean_test_df = data_test[:]
for i in range(len(data_test.index)):
  clean_tweet_test.append(tweet_cleaner(data_test['Tweet'][i]))
clean_test_df['Tweet'] = clean_tweet_test
clean_test_df.shape

(3259, 13)

### Feature Extraction

In [102]:
tweet_test = clean_test_df['Tweet']
X_test_count = count_vect.transform(tweet_test)

In [103]:
X_test_tfidf = tfidf_transformer.transform(X_test_count)

### Prediction

In [104]:
col_test = clean_test_df.columns

In [105]:
file_name =[emotion+"_model.pkl" for emotion in col_test[2:]]
for file in file_name:
    loaded_model = joblib.load(file)
    output_predict = loaded_model.predict(X_test_tfidf)
    cols = str(file).split("_")[0]
    data_test[cols]=output_predict
data_test

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2018-En-01559,dont worry indian army on ways dispatch terror...,0,0,0,1,0,0,0,0,0,0,0
1,2018-En-03739,academy sciences eschews normally sober tone s...,0,0,0,0,0,0,0,0,0,0,0
2,2018-En-00385,blew opportunity __ mad,1,0,0,0,0,0,0,0,0,0,0
3,2018-En-03001,time 2 weeks will 30,0,0,0,0,0,0,0,0,0,0,0
4,2018-En-01988,deppression real partners w depressed people t...,1,0,0,1,0,0,0,0,0,0,0
5,2018-En-03463,interesting choice words you confirming govern...,1,0,0,1,0,0,0,0,0,0,0
6,2018-En-04315,cnn for sure,0,0,0,0,0,0,0,0,0,0,0
7,2018-En-01426,distance once stretched your friends impose se...,0,0,0,0,0,0,0,0,0,0,0
8,2018-En-03332,happy confident kind n n kissableslovesmshopma...,0,0,0,0,1,0,1,0,0,0,0
9,2018-En-01938,visit hospital care triggered trauma accident ...,0,0,0,0,0,0,0,0,1,0,0
