# Data reading and arranging

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
import json
from pandas.io.json import json_normalize

In [3]:
# read data files
import os
basePath = os.path.dirname(os.path.abspath("tweets_DM.json"))
print(basePath)
raw_data=pd.read_json(basePath+"/dm19-lab2-nthu/tweets_DM.json",lines=True)
tweets=json_normalize(data=raw_data['_source'])
identify=pd.read_csv(basePath+"/dm19-lab2-nthu/data_identification.csv")
emotion=pd.read_csv(basePath+"/dm19-lab2-nthu/emotion.csv")

/Users/huangmanlin/Github/DMlab2/DM19-Lab2-Homework


In [4]:
tweets.head()

Unnamed: 0,tweet.hashtags,tweet.tweet_id,tweet.text
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ..."
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #..."
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k..."
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>
4,[],0x2de201,"""Trust is not the same as faith. A friend is s..."


In [5]:
# rename column names
tweets=tweets.rename(index=str,columns={"tweet.text":"text", "tweet.tweet_id":"tweet_id",
                                       "tweet.hashtags":"hashtags"})

# add identify tags to dataframe
tweets=pd.merge(tweets,identify, on="tweet_id")

tweets.head()

Unnamed: 0,hashtags,tweet_id,text,identification
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
3,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test


In [6]:
#get training set and test set
train_df=tweets[tweets["identification"] == "train"]
test_df=tweets[tweets["identification"] == "test"]

train_df.head()
test_df.head()

Unnamed: 0,hashtags,tweet_id,text,identification
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",test
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",test
9,"[materialism, money, possessions]",0x218443,When do you have enough ? When are you satisfi...,test
30,"[GodsPlan, GodsWork]",0x2939d5,"God woke you up, now chase the day #GodsPlan #...",test
33,[],0x26289a,"In these tough times, who do YOU turn to as yo...",test


In [7]:
#add emotion column
train_df=pd.merge(train_df,emotion, on="tweet_id")
test_df["emotion"]=""

train_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,hashtags,tweet_id,text,identification,emotion
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,sadness
2,[],0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,train,fear
3,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
4,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation


In [8]:
#drop identification tags
train_df.drop(columns=["identification"],inplace=True)
test_df.drop(columns=["identification"],inplace=True)

test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,hashtags,tweet_id,text,emotion
2,[bibleverse],0x28b412,"Confident of your obedience, I write to you, k...",
4,[],0x2de201,"""Trust is not the same as faith. A friend is s...",
9,"[materialism, money, possessions]",0x218443,When do you have enough ? When are you satisfi...,
30,"[GodsPlan, GodsWork]",0x2939d5,"God woke you up, now chase the day #GodsPlan #...",
33,[],0x26289a,"In these tough times, who do YOU turn to as yo...",


In [9]:
#use tweet_id as index
train_df.set_index("tweet_id",inplace=True)
test_df.set_index("tweet_id",inplace=True)

train_df.head()


Unnamed: 0_level_0,hashtags,text,emotion
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",anticipation
0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",sadness
0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,joy
0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,anticipation


In [10]:
# save to pickle file
train_df.to_pickle("train_df.pkl")
test_df.to_pickle("test_df.pkl")

In [11]:
## load a pickle file
train_df = pd.read_pickle(basePath+"/train_df.pkl")
test_df = pd.read_pickle(basePath+"/test_df.pkl")

train_df.head()

Unnamed: 0_level_0,hashtags,text,emotion
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",anticipation
0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",sadness
0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,joy
0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,anticipation


In [12]:
test_df.head()

Unnamed: 0_level_0,hashtags,text,emotion
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",
0x2de201,[],"""Trust is not the same as faith. A friend is s...",
0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...,
0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #...",
0x26289a,[],"In these tough times, who do YOU turn to as yo...",


In [13]:
for i, text in enumerate(test_df['text'][3000:4000]):
    print(i, text)
    print()

0 No vid today but tomorrow and Saturday ... #sorry  #Forgive  <LH>

1 Wth is happening to me, I married Farkas in Skyrim and i find myself dreaming about our little wolf cubs, gahd he's to handsome help <LH>

2 I dint just read the 'discos are rejecting electricity from GenCos'.....Nigeria...whyyy.? @ikedc + others why?????? <LH> SaveNigeria

3 Enjoying The Royal British Legion Remembrance on the telly tonight despite it making my cry #RemembranceDay2017 #moving <LH>

4 within 20 minutes of being in this bar (designated driver) I already got hit on and asked to play pool by some 21+ guy <LH>

5 My life is far from perfect but i have everything i could ever ask for plus more. <LH>

6 Had the honor of drumming at a different church and I'm feeling <LH>

7 How does one sleep with a sexathon going on upstairs and construction outside your window? 😧😡😴 <LH>

8 @politico He even sucks at deportations.  <LH>

9 @BecketAdams Every time you do one of these I realize what a terrible person you a

758 So here we are 10 years later with #iOS11 and the Voice Memos app STILL doesn’t play in the background. <LH> @Apple?

759 @Devinder_Sharma Flip flop of voters be congess and BJP is not going to change any thing in india  .. <LH>

760 All these news talking heads have something to say about the Vegas shootings,k do not remember seeing any of them there saving lives. <LH>

761 Just watched Follies and I am actually in awe. Just wow. Absolute wow at the performance from every member of that cast 😍 <LH>

762 Wait... y'all ain't see Jered in the Mexican Restaurant?!?! He was sittin' in a booth w/ Lionel and Molly's denial <LH> <LH>

763 Ahh Stressful past week thanks to Hurricane Irma! Work is far from finished but the almost sleepless nights are virtually over. <LH>

764 Well? So he says--? So he says, resumed the convict I had recognized,--it was all said and black then he sent me this... <LH> #itsgold #nz

765 We're here. We're queer. Don't be afraid, To be loud, And proud! ❤🌈 #Natio

# Clean text and preprocess

In [2]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [3]:
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
s1 = '@wan: Love yaaa all, see you soon!!!!!!'
tknzr.tokenize(s1)

[':', 'Love', 'yaaa', 'all', ',', 'see', 'you', 'soon', '!', '!', '!']

## Feature engineering TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(preserve_case=False)
tfidf = TfidfVectorizer(max_features=20000, stop_words='english',
                                     tokenizer=tknzr.tokenize)

# fitting
tfidf.fit(train_df['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=20000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x2c1957390>>,
                use_idf=True, vocabulary=None)

In [17]:
# transforming training sets
X_train = tfidf.transform(train_df['text'])
X_train.shape

(1455563, 20000)

In [18]:
# transforming testing sets
X_test = tfidf.transform(test_df['text'])
X_test.shape

(411972, 20000)

In [19]:
# set pointers
y_train = train_df['emotion']
y_test = test_df['emotion']

![title](img/picture.png)

## After search for few papers and websites, it seems that Logistic Regression have a pretty good result and it will not take a long time and strict computer hardware to make the prediction. So I first decided to try this method first

In [20]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=6,n_jobs=-1,max_iter=1000)
lr.fit(X_train,y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=6, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
pred_result_lr = lr.predict(X_test)
pred_result_lr.shape

(411972,)

In [22]:
# save the result
test_df['emotion']=pred_result_lr
test_df.drop(columns=['hashtags','text'],inplace=True)
test_df.index.rename('id',inplace=True)
test_df.columns=['emotion']
test_df.to_csv('lr_tfidf.csv')

# LSTM

In [23]:
## load a pickle file
train_df = pd.read_pickle(basePath+"/train_df.pkl")
test_df = pd.read_pickle(basePath+"/test_df.pkl")

## make words to sequence

In [24]:
max_words = 20000
max_len = 300
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(train_df['text'])

In [25]:
train_seq = tok.texts_to_sequences(train_df['text'])
test_seq = tok.texts_to_sequences(test_df['text'])

train_seq_mat = sequence.pad_sequences(train_seq,maxlen=max_len)
test_seq_mat = sequence.pad_sequences(test_seq,maxlen=max_len)

print(train_seq_mat.shape)
print(test_seq_mat.shape)

(1455563, 300)
(411972, 300)


## Label one hot encoding

In [26]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

print('\n\n## After convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:4]:
 tweet_id
0x376b20    anticipation
0x2d5350         sadness
0x1cd5b0            fear
0x1d755c             joy
Name: emotion, dtype: object

y_train.shape:  (1455563,)
y_test.shape:  (411972,)


## After convert
y_train[0:4]:
 [[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]]

y_train.shape:  (1455563, 8)
y_test.shape:  (411972, 8)


In [27]:
input_shape = X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  20000
output_shape:  8


## define model

In [28]:
inputs = Input(name='inputs',shape=[max_len])
## Embedding
layer = Embedding(max_words+1,128,input_length=max_len)(inputs)
layer = LSTM(128)(layer)
layer = Dense(128,activation="relu",name="FC1")(layer)
layer = Dropout(0.5)(layer)
layer = Dense(output_shape,activation="softmax",name="FC2")(layer)
model = Model(inputs=inputs,outputs=layer)
model.summary()
model.compile(loss="categorical_crossentropy",optimizer=RMSprop(),metrics=["accuracy"])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 300, 128)          2560128   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
FC1 (Dense)                  (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
FC2 (Dense)                  (None, 8)                 1032      
Total params: 2,709,256
Trainable params: 2,

In [29]:
model_fit = model.fit(train_seq_mat,y_train,batch_size=128,epochs=3,
                      callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] )


Epoch 1/3
Epoch 2/3




Epoch 3/3


In [30]:
# predict the result using our model
pred_result_lstm = label_decode(label_encoder, model.predict(test_seq_mat, batch_size=128))
pred_result_lstm[:5]

array(['anticipation', 'anticipation', 'joy', 'joy', 'trust'],
      dtype=object)

In [31]:
# save the result
test_df['emotion']=pred_result_lstm
test_df.drop(columns=['hashtags','text'],inplace=True)
test_df.index.rename('id',inplace=True)
test_df.columns=['emotion']
test_df.to_csv('keras_tfidf.csv')