In [1]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False

In [2]:
df_train = pd.read_csv('emotions-dataset-for-nlp/train.txt', header = None, sep = ';', names = ['Input', 'Sentiment'], encoding = 'utf-8')
df_test = pd.read_csv('emotions-dataset-for-nlp/test.txt', header = None, sep = ';', names = ['Input', 'Sentiment'], encoding = 'utf-8')
df_val = pd.read_csv('emotions-dataset-for-nlp/val.txt', header = None, sep = ';', names = ['Input', 'Sentiment'], encoding = 'utf-8')

In [3]:
df_full = pd.concat([df_train, df_test, df_val], axis = 0)
df_full

Unnamed: 0,Input,Sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


In [4]:
import text_hammer as th

In [5]:
%%time 

import text_preprocessing as tp 
import text_hammer as th
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

def text_preprocessing(df, col_name):
    column = col_name
    df[column] = df[column].progress_apply(lambda x: tp.expand_contraction(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_email(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_url(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_special_character(x))
    df[column] = df[column].progress_apply(lambda x: tp.normalize_unicode(x))
    df[column] = df[column].progress_apply(lambda x: tp.remove_stopword(x))
    df[column] = df[column].progress_apply(lambda x: tp.stem_word(x))
    
    return df

CPU times: user 129 ms, sys: 14.2 ms, total: 143 ms
Wall time: 272 ms


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ekanshgupta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
import text_preprocessing as tp

In [7]:
df_cleaned = text_preprocessing(df_full, 'Input')

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

In [8]:
df_cleaned.head()

Unnamed: 0,Input,Sentiment
0,"[feel, humili]",sadness
1,"[go, feel, hopeless, damn, hope, around, someo...",sadness
2,"[grab, minut, post, feel, greedi, wrong]",anger
3,"[ever, feel, nostalg, fireplac, know, still, p...",love
4,"[feel, grouchi]",anger


In [9]:
df_cleaned = df_cleaned.copy()

In [10]:
df_cleaned['num_words'] = df_cleaned.Input.apply(lambda x: len(x))

In [11]:
df_cleaned['Sentiment'] = df_cleaned.Sentiment.astype('category')
df_cleaned.Sentiment.cat.codes

0       4
1       4
2       0
3       3
4       0
       ..
1995    4
1996    2
1997    2
1998    2
1999    2
Length: 20000, dtype: int8

In [19]:
df_cleaned.Sentiment.unique()

['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
Categories (6, object): ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [20]:
df_cleaned.Sentiment

0       sadness
1       sadness
2         anger
3          love
4         anger
         ...   
1995    sadness
1996        joy
1997        joy
1998        joy
1999        joy
Name: Sentiment, Length: 20000, dtype: category
Categories (6, object): ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [21]:
encoded_dict  = {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}

In [22]:
df_cleaned['Sentiment'] = df_cleaned.Sentiment.cat.codes
df_cleaned.Sentiment

0       4
1       4
2       0
3       3
4       0
       ..
1995    4
1996    2
1997    2
1998    2
1999    2
Name: Sentiment, Length: 20000, dtype: int8

In [23]:
df_cleaned.head()

Unnamed: 0,Input,Sentiment,num_words
0,"[feel, humili]",4,2
1,"[go, feel, hopeless, damn, hope, around, someo...",4,9
2,"[grab, minut, post, feel, greedi, wrong]",0,6
3,"[ever, feel, nostalg, fireplac, know, still, p...",3,7
4,"[feel, grouchi]",0,2


In [24]:
df_cleaned.num_words.max()

35

In [25]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(df_cleaned, test_size=0.3, random_state=42, stratify=df_cleaned.Sentiment)

In [26]:
data_train.head().Input

7181                               [feel, sure, go, busi]
9479                 [feel, uptight, wonder, know, right]
812     [allow, eat, food, know, bother, sinc, feel, a...
8670    [happi, progress, also, begin, feel, littl, ho...
1014    [alon, apart, get, overwhelm, feel, like, watc...
Name: Input, dtype: object

In [27]:
data_train.shape

(14000, 3)

In [28]:
data_test.shape

(6000, 3)

In [29]:
data_train.head()

Unnamed: 0,Input,Sentiment,num_words
7181,"[feel, sure, go, busi]",2,4
9479,"[feel, uptight, wonder, know, right]",1,5
812,"[allow, eat, food, know, bother, sinc, feel, a...",4,16
8670,"[happi, progress, also, begin, feel, littl, ho...",4,7
1014,"[alon, apart, get, overwhelm, feel, like, watc...",4,8


In [30]:
from tensorflow.keras.utils import to_categorical

In [33]:
to_categorical(data_train.Sentiment)

array([[0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]], dtype=float32)

In [34]:
from transformers import AutoTokenizer, TFBertModel
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [35]:
tokenizer.save_pretrained('bert-tokenizer')
bert.save_pretrained('bert-model')

In [36]:
import shutil
shutil.make_archive('bert-tokenizer', 'zip', 'bert-tokenizer')
shutil.make_archive('bert-model', 'zip', 'bert-model')

'/Users/ekanshgupta/StockSentiment/bert-model.zip'

In [37]:
from transformers import BertTokenizer, TFBertModel, BertConfig, TFDistilBertModel, DistilBertTokenizer, DistilBertConfig
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [38]:
x_train = tokenizer(
    text=data_train.Input.tolist(),
    is_split_into_words=True,
    add_special_tokens=True,
    max_length=36,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)
x_test = tokenizer(
    text=data_test.Input.tolist(),
    is_split_into_words=True,
    add_special_tokens=True,
    max_length=36,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

In [39]:
x_train
x_test

{'input_ids': <tf.Tensor: shape=(6000, 36), dtype=int32, numpy=
array([[ 101, 1474, 1271, ...,    0,    0,    0],
       [ 101, 1631, 1176, ..., 1403,  102,    0],
       [ 101, 2647, 1631, ...,    0,    0,    0],
       ...,
       [ 101, 1631, 2393, ...,    0,    0,    0],
       [ 101, 1180, 1631, ...,    0,    0,    0],
       [ 101, 5178, 5427, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(6000, 36), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [40]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [41]:
import tensorflow as tf
tf.config.experimental.list_physical_devices('GPU')


[]

In [42]:
max_len = 36
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
   
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids,attention_mask = input_mask)[0]


embeddings = bert(input_ids,attention_mask = input_mask)[0] #(0 is the last hidden states,1 means pooler_output)
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation='relu')(out)

y = Dense(6,activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

In [43]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 36)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 36)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 36,                                            

In [44]:
tf.keras.utils.plot_model(model, show_shapes=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [45]:
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0
)

# set loss and metrics
loss = CategoricalCrossentropy(from_logits=True)
metric = CategoricalAccuracy('balanced_accuracy')
# compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [46]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 36)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 36)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 36,                                            

In [47]:
tf.config.experimental_run_functions_eagerly(True)
tf.config.run_functions_eagerly(True)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [48]:
train_history = model.fit(
    x = {'input_ids':x_train['input_ids'], 'attention_mask':x_train['attention_mask']},
    y = to_categorical(data_train.Sentiment),
    validation_data = (
        {'input_ids': x_test['input_ids'], 'attention_mask':x_test['attention_mask']}, to_categorical(data_train.Sentiment)
    ),
    epochs = 1,
    batch_size = 36
)

2023-04-15 15:05:25.970169: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




ValueError: Data cardinality is ambiguous:
  x sizes: 6000, 6000
  y sizes: 14000
Make sure all arrays contain the same number of samples.

In [49]:
predicted_raw = model.predict({'input_ids' : x_test['input_ids'], 'attention_mask': x_test['attention_mask']})



In [50]:
predicted_raw[0]

array([0.9249266 , 0.79025155, 0.3010679 , 0.03423624, 0.23898873,
       0.28776443], dtype=float32)

In [51]:
from pandas import np
y_predicted = np.argmax(predicted_raw, axis=1)

In [52]:
from sklearn.metrics import classification_report
classification_report(data_test.Sentiment, y_predicted)

'              precision    recall  f1-score   support\n\n           0       0.89      0.87      0.88       813\n           1       0.90      0.80      0.85       712\n           2       0.86      0.91      0.89      2028\n           3       0.75      0.62      0.68       492\n           4       0.91      0.94      0.93      1739\n           5       0.77      0.71      0.74       216\n\n    accuracy                           0.87      6000\n   macro avg       0.85      0.81      0.83      6000\nweighted avg       0.87      0.87      0.87      6000\n'

In [67]:
#import from Tweet.ipynb
%store -r tweets_simplify
%store -r body
%store -r ticker
%store -r stock_price

In [68]:
#texts = ["i think this stock has great potential"]

x_val = tokenizer(
    text = body,
    add_special_tokens=True,
    max_length = 36,
    truncation = True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True
)

validation = model.predict({'input_ids':x_val['input_ids'],'attention_mask':x_val['attention_mask']})*100
validation



array([[42.530777 , 19.809439 , 88.05628  , 37.564724 , 41.909397 ,
        14.473362 ],
       [42.530777 , 19.809439 , 88.05628  , 37.564724 , 41.909397 ,
        14.473362 ],
       [43.271942 , 35.537632 , 62.97938  , 30.342516 , 80.75502  ,
        12.181705 ],
       [44.801712 , 15.168844 , 94.86046  , 32.72899  , 33.637608 ,
        14.825275 ],
       [41.93366  , 38.6899   , 75.14487  , 24.16026  , 47.19691  ,
        30.67519  ],
       [48.329533 , 39.418404 , 81.320694 , 29.387987 , 63.207924 ,
        18.214079 ],
       [45.09989  , 34.412354 , 63.447697 , 25.810505 , 79.02218  ,
        13.131216 ],
       [25.348902 , 16.199373 , 92.95924  , 41.826202 , 40.570496 ,
        16.56161  ],
       [37.68558  , 19.203468 , 89.26843  , 44.82439  , 34.320812 ,
        18.64868  ],
       [37.68558  , 19.203468 , 89.26843  , 44.82439  , 34.320812 ,
        18.64868  ],
       [29.618484 , 16.897245 , 90.52772  , 39.612522 , 42.79575  ,
        18.471832 ],
       [40.452797 , 2

In [69]:
count = 0
for i in validation:
    print('Tweet: ', body[count])
    for key,value in zip(encoded_dict.keys(), i):
        print(key,value)
    print('Stock: ', ticker[count])
    print('Stock Price: ', stock_price[count], '\n')
    count+=1

Tweet:  "http://iphone.appleinsider.com/articles/14/12/30/editorial-the-world-revolved-around-apple-inc-in-2014…" .@DanielEran Awesome journalism as usual. #mustread $GOOG $AAPL
anger 42.530777
fear 19.809439
joy 88.05628
love 37.564724
sadness 41.909397
surprise 14.473362
Stock:  AAPL
Stock Price:  2.66 

Tweet:  "http://iphone.appleinsider.com/articles/14/12/30/editorial-the-world-revolved-around-apple-inc-in-2014…" .@DanielEran Awesome journalism as usual. #mustread $GOOG $AAPL
anger 42.530777
fear 19.809439
joy 88.05628
love 37.564724
sadness 41.909397
surprise 14.473362
Stock:  GOOG
Stock Price:  4.69 

Tweet:  Jeff Bezos lost $7.4 billion in Amazon's worst year since 2008: http://on.wsj.com/1Acvuo6 $AMZN
anger 43.271942
fear 35.537632
joy 62.97938
love 30.342516
sadness 80.75502
surprise 12.181705
Stock:  AMZN
Stock Price:  0.14 

Tweet:  $MU $AAPL $SUNE Propel Greenlight To 9% Return In 2014 http://valuewalk.com/2014/12/apple-micron-sune-greenlight/… $MRVL $GLD
anger 44.801712
f