In [5]:
from google.cloud import language_v2

client =  language_v2.LanguageServiceClient()
doc = language_v2.Document.Type.PLAIN_TEXT
language_code = "zh-Hant"
document = {
    "content": "今天天氣真好。有花在綻放、鳥在唱歌，但身邊的學生卻苦命的要上學。",
    "type": doc,
    "language_code": language_code
}

encoding_type = language_v2.EncodingType.UTF8

response = client.analyze_sentiment(request={'document': document, 'encoding_type': encoding_type})
print(f'Document sentiment score: {response.document_sentiment.score}')
print(f'Document sentiment magnitude: {response.document_sentiment.magnitude}')

for sentence in response.sentences:
    print(f'Sentence text: {sentence.text.content}')
    print(f'Sentence sentiment score: {sentence.sentiment.score}')
    print(f'Sentence sentiment magnitude: {sentence.sentiment.magnitude}')

print(f'Lanuage of the document: {response.language_code}')

Document sentiment score: 0.41200000047683716
Document sentiment magnitude: 1.7869999408721924
Sentence text: 今天天氣真好。
Sentence sentiment score: 0.9369999766349792
Sentence sentiment magnitude: 0.9879999756813049
Sentence text: 有花在綻放、鳥在唱歌，但身邊的學生卻苦命的要上學。
Sentence sentiment score: -0.11100000143051147
Sentence sentiment magnitude: 0.800000011920929
Lanuage of the document: zh-Hant


!pip install "numpy<2.0.0" --force-reinstall

!pip install numba --upgrade
!pip install scipy --upgrade
!pip install nltk --upgrade
!pip install numpy --upgrade

!python3 -m spacy download zh_core_web_trf

!pip install "numpy<2.0.0" --force-reinstall

In [6]:
import math
import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers

In [7]:
nlp_zh = spacy.load("zh_core_web_trf")
doc = nlp_zh("台灣是一個位於亞洲東部的島嶼國家。")
doc

台灣是一個位於亞洲東部的島嶼國家。

In [8]:
columns = ['text','pos_','tag_','dep_','is_alpha','is_stop']
dim = list(map(lambda x: [x.text, x.pos_, x.tag_, x.dep_, x.is_alpha, x.is_stop], doc))
pd.DataFrame(dim, columns=columns)

Unnamed: 0,text,pos_,tag_,dep_,is_alpha,is_stop
0,台灣,PROPN,NR,nsubj,True,False
1,是,VERB,VC,cop,True,True
2,一,NUM,CD,nummod,True,True
3,個,NUM,M,mark:clf,True,False
4,位於,VERB,VV,acl,True,False
5,亞洲,PROPN,NR,nmod:assmod,True,False
6,東部,NOUN,NN,dobj,True,False
7,的,PART,DEC,mark,True,True
8,島嶼,NOUN,NN,compound:nn,True,False
9,國家,NOUN,NN,ROOT,True,False


In [9]:
# visualize the dependency tree
from spacy import displacy
options = {"compact": True,
           "bg":'black',
           'color':'white',
           'font':'',
           'distance':80}
displacy.render(doc, style='dep', options=options)

In [10]:
for e in doc.ents:
    print(e.text, e.label_)
displacy.render(doc, style='ent', jupyter=True)

台灣 GPE
亞洲 LOC


##### "國家" should be ORG, but it is not recognized by the model

In [11]:
'''因為國家訓練效果不佳，所以透過多個例句建立訓練資料，重新訓練“國家”這個實體類別'''
# suggestion: use ChatGPT to generate more sentences
training_data = [
    ("台灣的風景都非常美麗",[(0,2,"GPE")]),
    ("亞洲東部國家的一些特色包括擁有豐富多彩的文化遺產和傳統習俗。",[(0,4,"LOC"),(4,6,"ORG")]),
    ("在亞洲東部國家，食品文化有著獨特的地位，其美食吸引著眾多遊客前來品嚐。",[(1,5,"LOC"),(5,7,"ORG")]),
    ("亞洲東部國家的經濟以製造業和出口為主，是全球經濟中的關鍵角色。",[(0,4,"LOC"),(4,6,"ORG")]),
    ("在亞洲東部國家，教育非常重要，其高水平的教育體系吸引著世界各地的學生前來留學。",[(3,5,"LOC"),(5,7,"ORG")]),
    # ("亞洲東部國家的一些城市，如東京、首爾和上海等，擁有先進的科技和發達的城市建設，是現代化的代表。",[(2,4,"LOC"),(4,6,"ORG"),(13,15,"ORG"),(16,18,"ORG"),(19,21,"ORG")]),
]


In [12]:
# train the NER model with training_data
from spacy.tokens import DocBin
from tqdm import tqdm
nlp = spacy.blank("zh")
# DocBin is a container for Doc objects
db = DocBin()
for text, annot in tqdm(training_data):
    _doc = nlp(text)
    ents = []
    for start, end, label in annot:
        span = _doc.char_span(start, end, label=label)
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    _doc.ents = ents
    db.add(_doc)
db.to_disk("./train.spacy")
db.to_disk("./dev.spacy")

100%|██████████| 5/5 [00:00<00:00, 2016.10it/s]


### retrain spacy model
!python3 -m spacy init fill-config base_config.cfg config.cfg
!python3 -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

In [23]:
best_model = spacy.load("./output/model-best")
new_doc = best_model("台灣是一個位於亞洲東部的島嶼國家。")
displacy.render(new_doc, style='ent', jupyter=True)

### sentiment Analysis with Transformers
#### follow [guide](<https://towardsdatascience.com/sentiment-analysis-with-transformers-a-complete-deep-learning-project-pt-i-d4ca7e47d676>)

!pip install keras
!pip install keras-preprocessing

In [3]:
import math, nltk, spacy, numpy as np, pandas as pd, tensorflow as tf, matplotlib.pyplot as plt, transformers
from tokenizers import BertWordPieceTokenizer
from tqdm.auto import tqdm
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.metrics import Precision, Recall, AUC
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, LearningRateScheduler, CallbackList, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from keras.saving import register_keras_serializable
from tensorflow.keras.layers import Layer, Dense
from transformers import TFDistilBertModel, DistilBertConfig
from tensorflow.keras.metrics import Precision, Recall, AUC
import warnings
warnings.filterwarnings("ignore")

In [4]:
train_data = pd.read_csv('Train_data.txt', header=None,delimiter='\t')
test_data = pd.read_csv('Test_data.txt', header=None,delimiter='\t')


In [5]:
train_data.columns = ['sentiment','text']
test_data.columns = ['sentiment','text']
print(train_data.shape)
print(test_data.shape)

(149985, 2)
(61998, 2)


In [6]:
print(train_data['sentiment'].value_counts())
print(test_data['sentiment'].value_counts())

0    75019
1    74966
Name: sentiment, dtype: int64
1    31029
0    30969
Name: sentiment, dtype: int64


!python3 -m spacy download en_core_web_md -q
!pip install "numpy==1.26.0" --force-reinstall -q

In [7]:
nlp = spacy.load("en_core_web_md")
nlp

<spacy.lang.en.English at 0x32b83f280>

In [8]:
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower().strip() for token in doc if not token.is_stop]
    return " ".join(tokens)

In [9]:
preprocess_text("I am running in the park and fell down.")

'run park fall .'

In [None]:
tqdm.pandas()
train_data['processed_text'] = train_data['text'].progress_apply(preprocess_text)
test_data['processed_text'] = test_data['text'].progress_apply(preprocess_text)
train_data.head()

  0%|          | 0/149985 [00:00<?, ?it/s]

In [9]:
train_data.to_pickle('train_data.pkl')
test_data.to_pickle('test_data.pkl')

In [None]:
# downsize in case of memory error
train_data = pd.read_pickle('train_data.pkl')[:20000]
test_data = pd.read_pickle('test_data.pkl')[:20000]

In [4]:
# first version - using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
train_tfidf = tfidf_vectorizer.fit_transform(train_data['processed_text'])
test_tfidf = tfidf_vectorizer.transform(test_data['processed_text'])
train_tfidf.shape, test_tfidf.shape

((20000, 7505), (20000, 7505))

In [5]:
type(train_tfidf)

scipy.sparse._csr.csr_matrix

In [6]:
X_train_array = train_tfidf.toarray()
X_test_array = test_tfidf.toarray()
X_train_array.shape, X_test_array.shape

((20000, 7505), (20000, 7505))

In [7]:
del train_tfidf, test_tfidf

In [8]:
y_train_le = train_data.sentiment
y_test_le = test_data.sentiment
y_train_le.shape, y_test_le.shape

((20000,), (20000,))

In [9]:
del train_data, test_data

In [10]:
# imbalance class handling - assign greater weight to minority class
class_weights = compute_class_weight('balanced', classes=y_train_le.unique(), y=y_train_le)
class_weights

array([0.99681021, 1.00321027])

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_train_array, y_train_le, test_size=0.2, random_state=42, stratify=y_train_le)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((16000, 7505), (4000, 7505), (16000,), (4000,))

In [14]:
y_train_encoded = to_categorical(y_train)
y_val_encoded = to_categorical(y_val)
y_test_encoded = to_categorical(y_test_le)
y_train_encoded.shape, y_val_encoded.shape, y_test_encoded.shape

((16000, 2), (4000, 2), (20000, 2))

In [15]:
# TODO: Convelutional, Bidirectional, attention, transformer

In [21]:
# fully connected neural network
model_v1 = Sequential()
model_v1.add(Dense(4096, input_dim=X_train.shape[1], 
                   activation='selu', kernel_initializer='lecun_normal', kernel_regularizer=tf.keras.regularizers.l2(0.1)))
model_v1.add(Dense(2048, activation='selu', kernel_initializer='lecun_normal',
                    kernel_regularizer=tf.keras.regularizers.l2(0.1)))
model_v1.add(Dense(1024, activation='selu', kernel_initializer='lecun_normal',
                    kernel_regularizer=tf.keras.regularizers.l2(0.1)))
model_v1.add(Dense(64, activation='selu'))
# add output layer
model_v1.add(Dense(2, activation='softmax'))
# set up specific weight for each class
model_v1.layers[-1].bias.assign(class_weights)

# compile the model
model_v1.compile(optimizer='Adam', loss='categorical_crossentropy',
                 metrics=['accuracy', Precision(), Recall(), AUC()])


model_v1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 4096)              30744576  
                                                                 
 dense_6 (Dense)             (None, 2048)              8390656   
                                                                 
 dense_7 (Dense)             (None, 1024)              2098176   
                                                                 
 dense_8 (Dense)             (None, 64)                65600     
                                                                 
 dense_9 (Dense)             (None, 2)                 130       
                                                                 
Total params: 41299138 (157.54 MB)
Trainable params: 41299138 (157.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# learning rate scheduler
def step_decay(epoch):
    initial_lrate = 0.001
    drop = 0.5
    epochs_drop = 10.0
    lrate = initial_lrate * math.pow(drop, math.floor((1+epoch)/epochs_drop))
    return lrate
lr_scheduler = LearningRateScheduler(step_decay)

In [23]:
# early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

In [24]:
# deployment
'''as for the epoch, batch size, find the balance between the training time and the model performance, and hardware capability'''
num_epochs = 20
batch_size = 256


In [26]:
%%time
history = model_v1.fit(X_train, y_train_encoded, validation_data=(X_val, y_val_encoded),
                        epochs=num_epochs, batch_size=batch_size, verbose=1,
                        callbacks=[lr_scheduler, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping
CPU times: user 13min 24s, sys: 1min 4s, total: 14min 29s
Wall time: 2min 32s


In [1]:
pred=model_v1.predict(X_test_array)

NameError: name 'model_v1' is not defined

In [2]:
print(classification_report(y_test_encoded.argmax(axis=1), pred.argmax(axis=1)))
print(confusion_matrix(y_test_encoded.argmax(axis=1), pred.argmax(axis=1)))
print(accuracy_score(y_test_encoded.argmax(axis=1), pred.argmax(axis=1)))

NameError: name 'classification_report' is not defined