In [23]:
import nltk
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# データセットのダウンロード
データセットとして、13種類の感情ラベルが付与された40,000件のツイートを用いる.

In [1]:
!wget -P data https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv
!wget -P data https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv
!ls -lah data

--2022-03-10 09:39:36--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/train_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2479133 (2.4M) [text/plain]
Saving to: ‘data/train_data.csv’


2022-03-10 09:39:37 (16.3 MB/s) - ‘data/train_data.csv’ saved [2479133/2479133]

--2022-03-10 09:39:37--  https://raw.githubusercontent.com/practical-nlp/practical-nlp/master/Ch4/Data/Sentiment%20and%20Emotion%20in%20Text/test_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 

In [3]:
# Load the downloaded training CSV into a DataFrame.
filepath = "data/train_data.csv"
df = pd.read_csv(filepath)
# Print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

(30000, 2)


Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
# The target column holds 13 distinct emotion labels; count the tweets per label.
df.sentiment.value_counts()

worry         7433
neutral       6340
sadness       4828
happiness     2986
love          2068
surprise      1613
hate          1187
fun           1088
relief        1021
empty          659
enthusiasm     522
boredom        157
anger           98
Name: sentiment, dtype: int64

In [6]:
# Restrict the data to three of the 13 labels: neutral, happiness and worry.
# NOTE: these are not the three most frequent labels -- the value_counts()
# output ranks `sadness` ahead of `happiness`.
shortlist = [
    "neutral",
    "happiness",
    "worry",
]
df_subset = df.loc[df["sentiment"].isin(shortlist)]
df_subset.shape

(16759, 2)

# 前処理
前処理として、メンション(@ユーザー名)などの個人情報を削除し、テキストを小文字に変換する. さらに、ストップワードと数字を削除する. 句読点や顔文字は感情を表す手がかりになるため、残しておく.

In [9]:
# TweetTokenizer options used here:
#   strip_handles=True  -> remove @username handles (personal information)
#   preserve_case=False -> lowercase the text
tweeter = TweetTokenizer(strip_handles=True, preserve_case=False)

# English stop words. On a fresh environment the NLTK "stopwords" corpus is not
# installed and the lookup raises LookupError, so download it once and retry
# instead of failing on Restart & Run All.
try:
    mystopwords = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords")
    mystopwords = set(stopwords.words("english"))

# Preprocessing pipeline: tokenize each text, then filter the tokens.
def preprocess_corpus(texts, tokenizer=None, stop_words=None):
    """Tokenize every text and drop stop words and all-digit tokens.

    Args:
        texts: Iterable of raw strings (e.g. a pandas Series of tweets).
        tokenizer: Object exposing ``tokenize(text)`` that returns a list of
            string tokens. Defaults to the notebook-level ``tweeter``.
        stop_words: Container of tokens to discard. Defaults to the
            notebook-level ``mystopwords``.

    Returns:
        A list holding one list of surviving tokens per input text, in input
        order. An empty input yields an empty list.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if tokenizer is None:
        tokenizer = tweeter
    if stop_words is None:
        stop_words = mystopwords

    def remove_stops_digits(tokens):
        # Nested helper: drop stop words and purely numeric tokens.
        # str.isdigit() is True only when every character is a digit, so "42"
        # is removed while tokens such as "3.30" are kept.
        return [tok for tok in tokens if tok not in stop_words and not tok.isdigit()]

    # Post-process the tokenizer output of every text with the helper above.
    return [remove_stops_digits(tokenizer.tokenize(content)) for content in texts]

# Tokenized tweets (features) and their emotion labels (targets) for the label subset.
mydata = preprocess_corpus(df_subset['content'])
mycats = df_subset['sentiment']
# Sanity check: both must have the same length.
print(len(mydata), len(mycats))

16759 16759


# Doc2Vecの学習

In [13]:
# Hold out part of the corpus for evaluation; the fixed random_state keeps the
# split identical across runs.
train_data, test_data, train_cats, test_cats = train_test_split(
    mydata, mycats, random_state=1234
)

# Wrap each tokenized training tweet in the TaggedDocument format Doc2Vec
# expects; the tag is the tweet's position in train_data, as a string.
train_doc2vec = [
    TaggedDocument(words=tokens, tags=[str(idx)])
    for idx, tokens in enumerate(train_data)
]

# Train the Doc2Vec model.
# seed fixes the random initialisation and workers=1 removes thread-scheduling
# jitter, so repeated runs produce the same vectors (gensim additionally
# requires a fixed PYTHONHASHSEED for identical results across interpreter
# launches). Single-threaded training is slower; raise `workers` if speed
# matters more than exact repeatability.
model = Doc2Vec(
    vector_size=50,
    alpha=0.025,
    min_count=5,
    dm=1,
    epochs=100,
    seed=1234,
    workers=1,
)
model.build_vocab(train_doc2vec)
model.train(train_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)
# Persist the trained model so later cells can reload it without retraining.
model.save("d2v.model")
print("Model Saved")

Model Saved


In [15]:
# Peek at the first five TaggedDocument entries to check tokens and tags.
train_doc2vec[:5]

[TaggedDocument(words=["caaaaan't", 'sleep', '...', '3.30', '!', 'wahhhh', '...', 'wanna', 'cry'], tags=['0']),
 TaggedDocument(words=['based', 'future', 'forgetting', '/', 'ignoring', 'present', ',', 'best', 'keeper', 'according', 'dhoni', 'parthiv'], tags=['1']),
 TaggedDocument(words=['good', 'morning', '!', 'early', 'bad', 'conscience', ',', 'trying', 'make', 'taking', 'day', 'yesterday', ',', '?', ':p'], tags=['2']),
 TaggedDocument(words=['hahaha', "chivalry's", 'dead', ',', 'rare'], tags=['3']),
 TaggedDocument(words=['joining', 'twitter'], tags=['4'])]

# 分類モデルの学習

In [22]:
# Reload the Doc2Vec model that was written to disk.
model = Doc2Vec.load("d2v.model")

# Infer an embedding for every tokenized tweet in both splits.
train_vectors = [model.infer_vector(tokens, epochs=50) for tokens in train_data]
test_vectors = [model.infer_vector(tokens, epochs=50) for tokens in test_data]

# Fit a class-weighted logistic regression on the training embeddings, then
# predict labels for the held-out embeddings.
logreg = LogisticRegression(class_weight="balanced").fit(train_vectors, train_cats)
preds = logreg.predict(test_vectors)

In [24]:
# Per-class precision, recall and F1 on the held-out test split.
print(classification_report(test_cats, preds))

              precision    recall  f1-score   support

   happiness       0.35      0.54      0.42       713
     neutral       0.47      0.54      0.50      1595
       worry       0.61      0.40      0.48      1882

    accuracy                           0.48      4190
   macro avg       0.47      0.49      0.47      4190
weighted avg       0.51      0.48      0.48      4190

