<a href="https://colab.research.google.com/github/s528661/nlp100_with_alpha/blob/main/nlp100_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第8章: ニューラルネット
第6章で取り組んだニュース記事のカテゴリ分類を題材として，ニューラルネットワークでカテゴリ分類モデルを実装する．なお，この章ではPyTorch, TensorFlow, Chainerなどの機械学習プラットフォームを活用せよ．

In [1]:
!pip install mecab-python3

Collecting mecab-python3
  Downloading mecab_python3-1.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (581 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m581.7/581.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.8


In [2]:
import gc
import re,sys,os

import pickle
import time
import copy

import math
import random
import string

from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import scipy as scp

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn import feature_extraction, preprocessing

from sklearn import svm
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, f1_score
from sklearn.metrics import accuracy_score,  precision_score, recall_score, confusion_matrix, roc_auc_score

# import dask.array as da

import lightgbm as lgb

# Neural Language Processing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest

from sklearn.linear_model import RidgeClassifier
import MeCab

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# 70. 単語ベクトルの和による特徴量

In [3]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip --no-check-certificate
!unzip ./NewsAggregatorDataset.zip

--2023-11-16 11:03:35--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘NewsAggregatorDataset.zip’

NewsAggregatorDatas     [     <=>            ]  27.87M  28.5MB/s    in 1.0s    

2023-11-16 11:03:37 (28.5 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203]

Archive:  ./NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [4]:
data = pd.read_csv('newsCorpora.csv',delimiter='\t',header=None,
                 names=["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"])

In [5]:
data

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
3,4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
4,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027
...,...,...,...,...,...,...,...,...
422414,422933,Surgeons to remove 4-year-old's rib to rebuild...,http://www.cbs3springfield.com/story/26378648/...,WSHM-TV,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.cbs3springfield.com,1409229190251
422415,422934,Boy to have surgery on esophagus after battery...,http://www.wlwt.com/news/boy-to-have-surgery-o...,WLWT Cincinnati,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wlwt.com,1409229190508
422416,422935,Child who swallowed battery to have reconstruc...,http://www.newsnet5.com/news/local-news/child-...,NewsNet5.com,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.newsnet5.com,1409229190771
422417,422936,Phoenix boy undergoes surgery to repair throat...,http://www.wfsb.com/story/26368078/phoenix-boy...,WFSB,m,dpcLMoJD69UYMXMxaoEFnWql9YjQM,www.wfsb.com,1409229191071


In [6]:
fix_data = data[data['PUBLISHER'].isin(["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"])]

In [7]:
def preprocessing(text):
  text = "".join([i for i in text if i not in string.punctuation])
  text = text.lower()
  text = re.sub("[0-9]+", "", text)
  return text

In [8]:
fix_data['pTITLE'] = fix_data['TITLE'].apply(lambda x: preprocessing(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fix_data['pTITLE'] = fix_data['TITLE'].apply(lambda x: preprocessing(x))


In [9]:
vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1, 2)) # 1-gram, 2-gramでTfidfを計算
X = vectorizer.fit_transform(fix_data['pTITLE']).toarray()
X = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())

In [19]:
X_train_all, X_test = train_test_split(X, test_size=0.2, shuffle=True)

In [21]:
X_train, X_val = train_test_split(X_train_all, test_size=0.1, shuffle=True)

In [22]:
y_train = fix_data.iloc[X_train.index]['CATEGORY'].map({'b': 0, 'e': 1, 't': 2, 'm': 3})
y_val = fix_data.iloc[X_val.index]['CATEGORY'].map({'b': 0, 'e': 1, 't': 2, 'm': 3})
y_test = fix_data.iloc[X_test.index]['CATEGORY'].map({'b': 0, 'e': 1, 't': 2, 'm': 3})

In [28]:
!rm -rf data
!mkdir data

!mkdir data/train
!mkdir data/val
!mkdir data/test

In [29]:
X_train.to_pickle('data/train/X_train.pkl')
X_val.to_pickle('data/val/X_val.pkl')
X_test.to_pickle('data/test/X_test.pkl')

In [30]:
y_train.to_pickle('data/train/y_train.pkl')
y_val.to_pickle('data/val/y_val.pkl')
y_test.to_pickle('data/test/y_test.pkl')

# 71. 単層ニューラルネットワークによる予測

In [77]:
class Net(nn.Module):
  def __init__(self,n_input,n_hidden,n_output):
    super(Net,self).__init__()
    self.input = nn.Linear(n_input,n_hidden,bias=True)
    self.hidden = nn.Linear(n_hidden,n_hidden,bias=True)
    self.output = nn.Linear(n_hidden,n_output,bias=False)

  def forward(self, x):
    x = self.input(x)
    x = F.relu(x)
    x = self.hidden(x)
    x = F.relu(x)
    x = self.output(x)
    x = F.softmax(x)
    return x

In [47]:
X_train = pd.read_pickle('data/train/X_train.pkl')
X_val = pd.read_pickle('data/val/X_val.pkl')
X_test = pd.read_pickle('data/test/X_test.pkl')

In [None]:
y_train = pd.read_pickle('data/train/y_train.pkl')
y_val = pd.read_pickle('data/val/y_val.pkl')
y_test = pd.read_pickle('data/test/y_test.pkl')

In [92]:
tensor_X_train = torch.tensor(X_train.values.astype('float32'))
tensor_X_val = torch.tensor(X_val.values.astype('float32'))
tensor_X_test = torch.tensor(X_test.values.astype('float32'))

tensor_y_train = torch.tensor(y_train.values.astype('float32'),dtype=torch.long)
tensor_y_val = torch.tensor(y_val.values.astype('float32'),dtype=torch.long)
tensor_y_test = torch.tensor(y_test.values.astype('float32'),dtype=torch.long)

In [74]:
n_input = X_train.shape[1]
n_hidden = 32
n_output = 4

In [93]:
model = Net(n_input,n_hidden,n_output)

In [94]:
y_train_pred = model(tensor_X_train)

  x = F.softmax(x)


In [95]:
y_train_pred

tensor([[0.2566, 0.2398, 0.2490, 0.2546],
        [0.2565, 0.2399, 0.2493, 0.2544],
        [0.2563, 0.2401, 0.2487, 0.2549],
        ...,
        [0.2566, 0.2397, 0.2490, 0.2547],
        [0.2560, 0.2403, 0.2492, 0.2545],
        [0.2568, 0.2392, 0.2496, 0.2543]], grad_fn=<SoftmaxBackward0>)

# 72. 損失と勾配の計算

In [96]:
loss_fn = nn.CrossEntropyLoss()

In [97]:
loss_fn(y_train_pred,tensor_y_train)

tensor(1.3873, grad_fn=<NllLossBackward0>)

# 73. 確率的勾配降下法による学習

確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，行列W
を学習せよ．なお，学習は適当な基準で終了させればよい（例えば「100エポックで終了」など）．

In [99]:
optimizer = optim.SGD(model.parameters(),lr=0.001)

In [101]:
train_dataset = torch.utils.data.TensorDataset(tensor_X_train, tensor_y_train)

In [105]:
epochs = 50
for epoch in range(epochs):
  for x, y in train_dataset:
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred,y)
    loss.backward()
    optimizer.step()

  x = F.softmax(x)


# 74. 正解率の計測

問題73で求めた行列を用いて学習データおよび評価データの事例を分類したとき，その正解率をそれぞれ求めよ．