<a href="https://colab.research.google.com/github/coraldx5/generativeai_intro_book/blob/master/chap04_movie_review_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 書分類問題を解いてみよう
## 映画レビューの良し/悪しを分類するモデルを作成します
- 本Notebookのゴール感
  - Collaboratory上でのPythonの動かし方を体験する
  - コードを動かしながら自然言語処理に対する理解を深める

## 必要なライブラリのインストール

In [20]:
!pip install janome==0.5.0
# PIPというライブラリ管理ツールを用いて簡単にインストールできます



In [21]:
# GithubからCSVを持ってくる
!wget https://raw.githubusercontent.com/coraldx5/generativeai_intro_book/master/movie_review_jpn.csv

--2024-04-29 14:17:56--  https://raw.githubusercontent.com/coraldx5/generativeai_intro_book/master/movie_review_jpn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7975 (7.8K) [text/plain]
Saving to: ‘movie_review_jpn.csv.1’


2024-04-29 14:17:56 (60.5 MB/s) - ‘movie_review_jpn.csv.1’ saved [7975/7975]



In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

spam_df = pd.read_csv("movie_review_jpn.csv", header=None)  # データの読み込み

# ラベルと文章を分ける
labels = spam_df[0].values
sentences = spam_df[1].values

label_dic = {'good': 1, 'bad': 0}
label_dic_inv = {v: k for k, v in label_dic.items()}
label_ids = [label_dic[i] for i in labels]

# 7:3に学習データとテストデータを分割する
train_sentence, test_sentence, y_train, y_test = train_test_split(sentences, label_ids, test_size=0.3, random_state=0, stratify=label_ids)

In [4]:
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *
from janome.charfilter import *

my_analyzer = Analyzer(char_filters=[
                                    UnicodeNormalizeCharFilter(),
                                    RegexReplaceCharFilter(r"[IiⅠｉ?.*/~=()〝 <>:：《°!！!？（）-]+", "")
                                    ],
                       tokenizer=Tokenizer(),
                       token_filters=[POSKeepFilter(["名詞","形容詞","動詞"]), LowerCaseFilter()]
                      )

t = Tokenizer()
def janome_analyzer(x):
    return [token.surface for token in my_analyzer.analyze(x)]

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(analyzer=janome_analyzer)
# vectorizer = TfidfVectorizer(analyzer=janome_analyzer)
X_train = vectorizer.fit_transform(train_sentence)
X_test = vectorizer.transform(test_sentence)

In [6]:
#Trainの上から10件だけ可視化
vector_array = X_train[:10].toarray()
df = pd.DataFrame(data=vector_array,columns = vectorizer.get_feature_names_out())
print(df)

   あり  い  おり  くれ  さ  させ  し  せ  たち  つまらなかっ  ...  雰囲気  非常  音楽  響く  騗  驚き  驚く  \
0   0  0   0   0  1   0  0  0   0       0  ...    0   0   0   0  0   0   0   
1   0  0   0   0  1   0  0  0   0       0  ...    0   0   0   0  0   0   0   
2   0  0   0   0  0   0  0  0   1       0  ...    0   0   0   0  0   0   0   
3   0  0   0   0  0   0  0  0   0       0  ...    0   0   0   0  0   0   0   
4   0  0   0   0  0   0  1  0   0       1  ...    0   0   0   0  0   0   0   
5   0  0   0   0  0   0  0  0   0       0  ...    0   0   0   0  0   0   0   
6   0  0   0   0  0   0  1  0   1       0  ...    0   0   0   0  0   0   0   
7   0  0   0   0  0   0  0  0   0       0  ...    0   0   0   0  0   0   0   
8   0  0   0   1  0   0  1  0   0       0  ...    0   0   0   0  0   0   0   
9   0  0   0   0  1   0  0  0   0       0  ...    0   0   0   0  0   0   0   

   高め  魅了  魅力  
0   0   0   0  
1   0   0   0  
2   0   0   0  
3   0   0   0  
4   0   0   0  
5   0   0   0  
6   0   0   0  
7   0   0   0

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

lr = LogisticRegression(random_state=0, n_jobs=-1)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

pd.DataFrame(cm,columns=['Predicted good', 'Predicted bad'], index=['Actual good', 'Actual bad'])

Unnamed: 0,Predicted good,Predicted bad
Actual good,12,0
Actual bad,2,10


In [8]:
from IPython.display import display, HTML

# 赤くハイライトする
def highlight_r(word, attn):
  html_color = '#%02X%02X%02X' % (255, int(255*(1 - attn)), int(255*(1 - attn)))
  return '<span style="background-color: {}">{}</span>'.format(html_color, word)

# 青くハイライトする
def highlight_b(word, attn):
  html_color = '#%02X%02X%02X' % (int(255*(1 - attn)), int(255*(1 - attn)), 255)
  return '<span style="background-color: {}">{}</span>'.format(html_color, word)

def show_lr_explaination(check_idx):
    # 単語と説明変数の値の辞書
    coef_dic = {j: i for i, j in zip(lr.coef_[0], vectorizer.get_feature_names_out())}

    # 対象の文章の単語の説明変数の値を確認していく
    texts = janome_analyzer(test_sentence[check_idx])
    scores = []
    for w in texts:
        try:
            s = coef_dic[w]
        except KeyError:
            s = 0  # 対象外の単語は0を割り当てる
        scores.append(s)

    # 文章をハイライトしていく
    html_outputs = []
    for word, attn in zip(texts, scores):
        if attn < 0:
            html_outputs.append(highlight_b(word, attn*-1))
        else:
            html_outputs.append(highlight_r(word, attn))

    # 結果を表示
    display(HTML(' '.join(html_outputs)))

In [9]:
show_lr_explaination(0)

In [10]:
show_lr_explaination(1)

In [11]:
show_lr_explaination(2)

In [12]:
show_lr_explaination(3)

In [13]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.28.3-py3-none-any.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.110.2-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.9/91.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.16.0 (from gradio)
  Downloading gradio_client-0.16.0-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.4/314.4 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [14]:
import gradio as gr

def greet(name):
    return "Hello " + name + "!"

demo = gr.Interface(fn=greet, inputs="text", outputs="text")

demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://4e7a019b7c42fbcd1a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




## Word2vec

In [15]:
from gensim.models import Word2Vec

model = Word2Vec(
                # [['ストーリー', '行事' ,'イベント']],
                [janome_analyzer(''.join(sentences))],
                sg=1,
                vector_size=10,
                min_count=1,
                window=5,
                hs=1,
                seed=0)



In [16]:
model.wv.most_similar(positive=['退屈'], topn=10)

[('し', 0.9279454350471497),
 ('瞬間', 0.9225526452064514),
 ('不快', 0.8891574740409851),
 ('視覚', 0.8800266981124878),
 ('キャラクター', 0.8778711557388306),
 ('演技', 0.875684380531311),
 ('感動', 0.8718224167823792),
 ('驚き', 0.8622995615005493),
 ('希望', 0.8548675775527954),
 ('絆', 0.8525833487510681)]

## Doc2vec

In [17]:
import gensim
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

documents=[]
for stc in sentences.tolist():
    tokens = janome_analyzer(stc)
    documents.append(tokens)

documents = [TaggedDocument(tags=[i],words=doc) for i, doc in enumerate(documents)]
model = Doc2Vec(documents=documents, vector_size=400, window=3, min_count=1, dm=1)

In [18]:
#データフレームにDocVectorを追記
spam_df['DocVector'] =  [model.infer_vector(janome_analyzer(doc_words)) for doc_words in spam_df[1]]

In [19]:
train_sentence, test_sentence, y_train, y_test = train_test_split(spam_df['DocVector'].to_list(), label_ids, test_size=0.3, random_state=SEED, stratify=label_ids)

NameError: name 'SEED' is not defined

In [None]:
import numpy as np
X_train = np.array([np.array(v) for v in train_sentence])
X_test = np.array([np.array(v) for v in test_sentence])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

lr = LogisticRegression(random_state=0, n_jobs=-1)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

pd.DataFrame(cm,columns=['Predicted good', 'Predicted bad'], index=['Actual good', 'Actual bad'])