In [None]:
import ast
import pickle

import numpy as np
import pandas as pd
from bert_serving.client import BertClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the IMDB movie table from the CSV file and show its first rows.
IMDB = pd.read_csv(filepath_or_buffer='data/IMDB_movie_V1_clear.csv')
IMDB.head()

In [None]:
# Build one space-joined document per movie from its keywords and genres.
# Both columns are parsed from text and concatenated with `+`, so each cell is
# expected to hold a Python list literal of strings (TODO confirm against the CSV).
# ast.literal_eval only accepts literals, so a crafted CSV cell cannot execute
# arbitrary code the way eval() would.
segments = [
    ' '.join(ast.literal_eval(keywords) + ast.literal_eval(genres))
    for keywords, genres in IMDB[['keywords', 'Genres']].values.tolist()
]

In [None]:
def _feature_names(fitted_vectorizer):
    """Return the fitted vectorizer's feature names as a plain list.

    scikit-learn 1.0 added ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``. Prefer the new accessor and fall back to the old
    one so this cell runs on installations from either side of that change.
    """
    if hasattr(fitted_vectorizer, 'get_feature_names_out'):
        # .tolist() keeps the printed form a list of str, as before.
        return fitted_vectorizer.get_feature_names_out().tolist()
    return fitted_vectorizer.get_feature_names()


def _fit_and_report(model, documents):
    """Fit ``model`` on ``documents``, print its vocabulary summary, return it."""
    print('vectorizer', model)

    model.fit(documents)
    print('Number of vocabulary', len(model.vocabulary_))
    print('Vocabulary', model.vocabulary_)
    print('feature name', _feature_names(model))
    return model


# Bag-of-words model: binary=True records presence/absence of each term.
vectorizer = _fit_and_report(CountVectorizer(binary=True), segments)

# TF: binary=False keeps the raw term counts.
vectorizer = _fit_and_report(CountVectorizer(binary=False), segments)

# Term Frequency inverse Document Frequency (no normalisation, unsmoothed idf).
# `vectorizer` and `X` keep their names because later cells persist them.
vectorizer = TfidfVectorizer(use_idf=True, norm=None, smooth_idf=False)
# fit_transform learns the vocabulary and builds the matrix in a single pass.
X = vectorizer.fit_transform(segments).toarray()
print(_feature_names(vectorizer))
print(vectorizer.idf_)
print('word by doc.:')
print(X.transpose())

# Cosine similarity: rows of X are documents, rows of X.transpose() are terms.
print('by documents')
print(cosine_similarity(X, X))
print('by words')
# NOTE(review): this result has one row and column per vocabulary term, so it
# can be large for a big vocabulary.
print(cosine_similarity(X.transpose(), X.transpose()))

In [None]:
# Persist the fitted vectorizer. The with-block closes the file handle as soon
# as the dump finishes instead of leaving it open until garbage collection.
with open("vectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer, f)

In [None]:
# Write the dense TF-IDF matrix to disk in NumPy's .npy format.
with open('TFIDF.npy', mode='wb') as tfidf_file:
    np.save(tfidf_file, X)

In [None]:
# Get the word vectors
bc = BertClient()  # acquire the BERT server resource
print('BERT')
# Pull the 'StoryLine' column out as a plain list before sending it for encoding.
story_lines = IMDB['StoryLine'].tolist()
sents_enc = bc.encode(story_lines)

In [None]:
# Write the BERT encodings to disk in NumPy's .npy format.
with open('BERT.npy', mode='wb') as bert_file:
    np.save(bert_file, sents_enc)