### tfidf->TruncatedSVDの実験をするnotebook

* SVDとPCAは大体同じ、数値的にはSVDのほうが安定
    * https://qiita.com/horiem/items/71380db4b659fb9307b4

* n_iterは（このデータ数なら）二桁で十分そう

In [1]:
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

import MeCab
import re
from config import *
from bert_utils import *

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

****** SEED fixed : 42 ******




In [2]:
def wakati_clear(text):
    text = re.sub(r'、', '', text)
    text = re.sub(r'。', '', text)
    text = re.sub(r'\n', '', text)
    return text


def wakatier(text, tagger=MeCab.Tagger(f"-Owakati -d {dic_neologd}")):
    return wakati_clear(tagger.parse(text))


def calc_tfidf(text_list: list) -> pd.DataFrame:
    bow = CountVectorizer()
    tfidf = TfidfTransformer(smooth_idf=False)

    count = bow.fit_transform(text_list)
    array_bow = count.toarray() # BoW: 出現回数[dim] --
    # cf.) array_tf = array_bow / array_bow.shape[1]  # 出現確率[undim]... terms frequency, tf --
    df_tfidf = pd.DataFrame(tfidf.fit_transform(array_bow).toarray(), columns=bow.get_feature_names_out())
    df_bow = pd.DataFrame(array_bow, columns=bow.get_feature_names_out())
    return df_tfidf, df_bow

### 結果のdf

In [3]:
log_df = pd.DataFrame(columns=["sample", "cols", "n_components", "n_iter", "explained_variance_ratio_sum", "time"])

### n_components, n_iterに対してexplained_variance_ratio_の収束性 --

In [4]:
svd_components = 256

singular_values = []
explained_variance_ratio = []
for svd_components in [1, 256, 1024, 4096, 6144]:
    for n_iter in [1, 32]:
    
        start_time = time.time()
    
        # prepare data --
        df, train_shape = prepare_dataframe(train_data="raw")
        df["clean_text"] = df["text"].map(lambda x: clean_text(x))
        text_list = df["clean_text"].values
        for i in range(len(text_list)):
            text_list[i] = wakatier(text_list[i])
        
        # tfidf -> SVD --
        df_tfidf, df_bow = calc_tfidf(text_list)
        df_tfidf_sparse = csr_matrix(df_tfidf)
        svd = TruncatedSVD(n_components=svd_components, n_iter=n_iter, random_state=SEED)
        df_tfidf_svd = pd.DataFrame(svd.fit_transform(df_tfidf_sparse), columns=[f"svd_{str(i)}" for i in range(svd_components)])
    
        singular_values.append(svd.singular_values_)
        explained_variance_ratio.append(svd.explained_variance_ratio_)
    
        print("explained_variance_ratio_.sum()", svd.explained_variance_ratio_.sum())
    
        end_time = time.time()
        time_elapsed = end_time - start_time
        time_elapsed_str = "{:.0f}h {:.0f}m {:.0f}s".format(time_elapsed//3600, (time_elapsed%3600)//60, (time_elapsed%3600)%60)
    
        # save exp result --
        log_df = pd.concat([log_df, pd.DataFrame(pd.Series({
            "sample": df_bow.shape[0],
            "cols": df_bow.shape[1],
            "n_components": svd_components,
            "n_iter": n_iter,
            "explained_variance_ratio_sum": svd.explained_variance_ratio_.sum(),
            "time": time_elapsed_str,
        })).T])

explained_variance_ratio_.sum() 0.002804907417058345
explained_variance_ratio_.sum() 0.0027693729455987855
explained_variance_ratio_.sum() 0.18485468039329048
explained_variance_ratio_.sum() 0.220501545715091
explained_variance_ratio_.sum() 0.40159570679723067
explained_variance_ratio_.sum() 0.4521029488056133
explained_variance_ratio_.sum() 0.7881391364120922
explained_variance_ratio_.sum() 0.8251719661805887


ValueError: Shape of passed values is (8479, 8479), indices imply (8479, 10240)

In [None]:
log_df.to_csv("./experiment/train_test_bow_svd_experiment.csv", index=False)