In [1]:
import pandas as pd
import numpy as np
import time
import sys
import pickle

## Mecab part

In [2]:
import MeCab
m = MeCab.Tagger ("-Ochasen")

In [3]:
from pykakasi import kakasi
kakasi = kakasi()
kakasi.setMode('H', 'a')
kakasi.setMode('K', 'a')
kakasi.setMode('J', 'a')
conv = kakasi.getConverter()

def mecab_list(text,ctr=0):
    tagger = MeCab.Tagger("-Ochasen")
    tagger.parse('')
    node = tagger.parseToNode(text)
    word_class = []
    result = []
    while node:
        word = node.surface
        wclass = node.feature.split(',')
        if (conv.do(word).isalpha() == True or conv.do(wclass[6]).isalpha() == True) and \
            (wclass[0] == '名詞' or (wclass[0] == '動詞' and wclass[1] == '自立') or wclass[0] == '形容詞' or wclass[0] == '`副詞'):
            p1 = ''
            p2 = ''
            if wclass[0] == '名詞':
                p1 = 'no'
                p2 = 'g'
            elif wclass[0] == '動詞':
                p1 = 've'
                if wclass[1] == '自立':
                    p2 = 'i'
            elif wclass[0] == '形容詞':
                p1 = 'aj'
                p2 = 'g'
            else:
                p1 = 'av'
                p2 = 'g'
                
            try:
                word_class.append((word,wclass[0],wclass[1],wclass[6],wclass[7],conv.do(wclass[6])))
                result.append(conv.do(wclass[6]) + '_' + p1 + '_' + p2 + '_' + wclass[6])
            except:
                word_class.append((word,wclass[0],wclass[1],wclass[6],word,conv.do(word)))
                result.append(conv.do(word) + '_' + p1 + '_' + p2 + '_' + wclass[6])
        node = node.next
    if ctr == 0:
        return result
    elif ctr == 1:
        return word_class

## TF-IDF part

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
def attach_tfidf_vector(path):
    df = pd.read_csv(path,
                    sep = ",", 
                    encoding = "utf-16", 
                    error_bad_lines = 0,
    )
#     df = df.iloc[:9000,] ## Reducing data table since this is experimental coding.
    print ("There are " + str(df.shape[0]) + " candidates.")

    print("Preprocessing using MeCab...")
    time0 = time.time()
    df["question_clean"] = df["question"].str.split('って英語でなんて言うの', expand=True)[0]
    df['words'] = df['question_clean'].apply(mecab_list)
    df['question_cd'] = df['words'].apply(' '.join)
    df = df[df['words'].apply(len)!=0].reset_index(drop=True)
    df['words_count'] = df['words'].apply(len)
    time1 = time.time()
    print("Preprocessing time: ",time1 - time0)    

    corpus = [document.replace('*','_') for document in df['question_cd']]
    vectorizer = TfidfVectorizer(lowercase=False)
    
    time0 = time.time()
    corpus_vectorized = vectorizer.fit_transform(corpus)
    time1 = time.time()
    time1 - time0
    print("TF_IDF processing time: ",time1 - time0)
    
    return df[['url','question_clean']], vectorizer, corpus_vectorized

In [7]:
df_clean, vectorizer, corpus_vectorized  = attach_tfidf_vector("C:/Project/DMM_uKnow_Scraping/output.csv")
pickle.dump(df_clean, open('./df_clean.pkl', 'wb')), print("Saved data frame with clean question lines to df_clean.pkl")
pickle.dump(vectorizer, open('./vectorizer.pkl', 'wb')), print("Saved trained vectorizer to vectorizer.pkl")
pickle.dump(corpus_vectorized, open('./corpus_vectorized.pkl', 'wb')), print("Saved vectorized corpus to corpus_vectorized.pkl")

b'Skipping line 5339: expected 3 fields, saw 4\n'


There are 9949 candidates.
Preprocessing using MeCab...
Preprocessing time:  30.7064425945282
TF_IDF processing time:  0.11222171783447266
Saved data frame with clean question lines to df_clean.pkl
Saved trained vectorizer to vectorizer.pkl
Saved vectorized corpus to corpus_vectorized.pkl


(None, None)

## Generating Similarity

In [9]:
from collections import Counter
# import scipy.spatial.distance as spdis

df_clean = pickle.load(open('./df_clean.pkl', 'rb'))
vectorizer = pickle.load(open('./vectorizer.pkl', 'rb'))
corpus_vectorized = pickle.load(open('./corpus_vectorized.pkl', 'rb'))

In [52]:
def generate_similarity(sample,disnum=5):
    sample_mecabed = mecab_list(sample)
    sample_vec = vectorizer.transform([' '.join(sample_mecabed).replace('*','_')]).toarray()

    df_res = df_clean.copy()
    df_res['similarity'] = np.dot(corpus_vectorized.toarray(), sample_vec.T) 
    # Cosine similarity = cosine values since the tf-idf vectors are l2 normed.

    df_res.columns = ['DMM uknow URL','Question Line', 'Similarity']
    
    return df_res[df_res['Similarity']>0].sort_values(by='Similarity', ascending=False).head(disnum)

In [53]:
# sample = '急に必要になる時だってあります'
sample = '天気予報を見てから出かける'
# sample = '無料枠があるので、今回はそれを使っていきたいと思います'

In [54]:
time0 = time.time()
top5res = generate_similarity(sample,5)
time1 = time.time()
print("processing time: ", time1-time0)

processing time:  0.16954636573791504


In [55]:
top5res

Unnamed: 0,DMM uknow URL,Question Line,Similarity
4146,https://eikaiwa.dmm.com/uknow/questions/53369/,天気予報の精度がとても高い,0.578216
1241,https://eikaiwa.dmm.com/uknow/questions/46524/,出かける,0.509022
2844,https://eikaiwa.dmm.com/uknow/questions/33921/,天気,0.509022
2780,https://eikaiwa.dmm.com/uknow/questions/54222/,〇〇の天気はどうですか？,0.509022
7638,https://eikaiwa.dmm.com/uknow/questions/28819/,出かけなければいけなくなりました,0.450107
