In [1]:
import sys
import os
# Resolve the parent of the current working directory to an absolute path so
# that packages living one level above this notebook can be imported
# (presumably the local `TextSimila` package imported in the next cell).
module_path = os.path.abspath(os.path.join('..'))
# Guard the append: re-running this cell must not stack duplicate entries
# onto sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from TextSimila.text_sim_reco import Text_sim_reco

# Example data (English ver.)

In [3]:
# Sample item titles (English) handed to Text_sim_reco as `Items`.
Items = [
    'Netflix movie',
    'Netflix party',
    'Netflix top',
    'Netflix ratings',
    'rotten tomatoes ratings',
    'IMDb Top 250 Movie ratings',
]

# Extra text for each item, handed to Text_sim_reco as `related_to_Items`.
# Presumably aligned by index with `Items` (both lists have six entries) --
# confirm against the Text_sim_reco documentation.
related_to_Items = [
    ["movie top", "Netflix"],
    ["party pricing", "Netflix"],
    # Fixed: a stray `',` had leaked into this string literal.
    ["top TV shows", "Netflix"],
    ["ratings"],
    ['tomatoes'],
    ['ratings'],
]

## Predict with newly-trained model

In [4]:
# Keyword-argument groups that the next cell unpacks into Text_sim_reco(...).

# Language of the data, number of recommendations per item, and `ratio`.
params_vid = dict(lang='en', reco_Item_number=3, ratio=0.2)

# Tokenizer / noun-extraction settings.
params_tok = dict(
    stopwords=None,
    extranouns=None,
    verbose=False,
    min_noun_frequency=1,
    max_noun_frequency=80,
    max_frequency_for_char=20,
    min_noun_score=0.1,
    extract_compound=False,
)

# Embedding settings. NOTE(review): these names look like gensim Word2Vec
# keyword arguments -- confirm against Text_sim_reco.
params_emb = dict(
    vector_size=15,
    window=3,
    min_count=1,
    workers=4,
    sg=1,
)

In [5]:
# Build a recommender with both pretrain flags off, then train it.
# saved=True presumably persists the fitted models: the recorded output of
# this cell reports files being written under "./model".
text_sim_reco = Text_sim_reco(
    Items=Items,
    related_to_Items=related_to_Items,
    saved=True,
    pretrain_tok=False,
    pretrain_emb=False,
    **params_vid,
    **params_tok,
    **params_emb,
)
text_sim_reco.train()

# Print each item followed by its numbered recommendations (1-based) and a
# blank separator line.
pred = text_sim_reco.predict()
for source_item, recommendations in zip(text_sim_reco.Items, pred):
    print(source_item)
    for rank, recommended in enumerate(recommendations, start=1):
        print(f'{rank}: {recommended}')
    print()


NLTK data punkt has been saved in "./model"

NLTK data averaged_perceptron_tagger has been saved in "./model"

The embedded_model has been saved as "embedded_model.pickle" in "./model"

Netflix movie
1: rotten tomatoes ratings
2: Netflix ratings
3: Netflix top

Netflix party
1: Netflix movie
2: rotten tomatoes ratings
3: Netflix top

Netflix top
1: IMDb Top 250 Movie ratings
2: Netflix ratings
3: Netflix movie

Netflix ratings
1: IMDb Top 250 Movie ratings
2: Netflix movie
3: Netflix top

rotten tomatoes ratings
1: Netflix movie
2: Netflix party
3: Netflix top

IMDb Top 250 Movie ratings
1: Netflix ratings
2: Netflix top
3: Netflix movie



## Predict with Pre-trained model
Since we will use pre-trained models, we need to specify which models to use:
- `model_name_emb`

You don't need to specify the tokenization model name (`model_name_tok`) when using an **English** custom dataset.

In [6]:
# Keyword-argument groups that the next cell unpacks into Text_sim_reco(...).

# Language of the data, number of recommendations per item, and `ratio`.
params_vid = dict(lang='en', reco_Item_number=3, ratio=0.2)

# Tokenizer / noun-extraction settings.
params_tok = dict(
    stopwords=None,
    extranouns=None,
    verbose=False,
    min_noun_frequency=1,
    max_noun_frequency=80,
    max_frequency_for_char=20,
    min_noun_score=0.1,
    extract_compound=False,
)

# Embedding settings plus the name of the saved embedding model to load.
params_emb = dict(
    vector_size=15,
    window=3,
    min_count=1,
    workers=4,
    sg=1,
    # The earlier English training cell reported saving
    # "embedded_model.pickle"; the name is given here without the extension.
    model_name_emb='embedded_model',
)

In [7]:
# Build a recommender with both pretrain flags on; the recorded output of this
# cell shows the embedding model being loaded rather than fitted.
text_sim_reco = Text_sim_reco(
    Items=Items,
    related_to_Items=related_to_Items,
    pretrain_tok=True,
    pretrain_emb=True,
    **params_vid,
    **params_tok,
    **params_emb,
)
text_sim_reco.train()

# Print each item followed by its numbered recommendations (1-based) and a
# blank separator line.
pred = text_sim_reco.predict()
for source_item, recommendations in zip(text_sim_reco.Items, pred):
    print(source_item)
    for rank, recommended in enumerate(recommendations, start=1):
        print(f'{rank}: {recommended}')
    print()

NLTK data punkt has been already saved in "./model"

NLTK data averaged_perceptron_tagger has been already saved in "./model"

Load pre-trained model for embedding...

Netflix movie
1: rotten tomatoes ratings
2: Netflix ratings
3: IMDb Top 250 Movie ratings

Netflix party
1: Netflix movie
2: rotten tomatoes ratings
3: Netflix top

Netflix top
1: Netflix movie
2: Netflix ratings
3: IMDb Top 250 Movie ratings

Netflix ratings
1: IMDb Top 250 Movie ratings
2: Netflix movie
3: rotten tomatoes ratings

rotten tomatoes ratings
1: Netflix movie
2: Netflix ratings
3: IMDb Top 250 Movie ratings

IMDb Top 250 Movie ratings
1: Netflix ratings
2: Netflix movie
3: rotten tomatoes ratings



# Example data (Korean ver.)

In [8]:
# Sample item titles (Korean) handed to Text_sim_reco as `Items`.
Items = [
    "아이유 막방",
    "아이유 앵콜",
    "아이유 공연",
    "아이유 기타",
    "기타로 연주",
    "초보자를 위한 기타 레슨",
]

# Extra text for each item, handed to Text_sim_reco as `related_to_Items`.
# Presumably aligned by index with `Items` (both lists have six entries).
related_to_Items = [
    ["막방 공연", "아이유"],
    ["앵콜 실수", "아이유"],
    ["공연 콘서트 정보", "아이유"],
    ["기타"],
    ["연주"],
    ["기타"],
]

## Predict with newly-trained model

In [9]:
# Keyword-argument groups that the next cell unpacks into Text_sim_reco(...).

# Language of the data (Korean here), number of recommendations per item,
# and `ratio`.
params_vid = dict(lang='ko', reco_Item_number=3, ratio=0.2)

# Tokenizer / noun-extraction settings.
params_tok = dict(
    stopwords=None,
    extranouns=None,
    verbose=False,
    min_noun_frequency=1,
    max_noun_frequency=80,
    max_frequency_for_char=20,
    min_noun_score=0.1,
    extract_compound=False,
)

# Embedding settings. NOTE(review): these names look like gensim Word2Vec
# keyword arguments -- confirm against Text_sim_reco.
params_emb = dict(
    vector_size=15,
    window=3,
    min_count=1,
    workers=4,
    sg=1,
)

In [10]:
# Build a recommender for the Korean data with both pretrain flags off, then
# train it. The recorded output of this cell reports a tokenizer file and an
# embedding file being written under "./model" (saved=True).
text_sim_reco = Text_sim_reco(
    Items=Items,
    related_to_Items=related_to_Items,
    saved=True,
    pretrain_tok=False,
    pretrain_emb=False,
    **params_vid,
    **params_tok,
    **params_emb,
)
text_sim_reco.train()

# Print each item followed by its numbered recommendations (1-based) and a
# blank separator line.
pred = text_sim_reco.predict()
for source_item, recommendations in zip(text_sim_reco.Items, pred):
    print(source_item)
    for rank, recommended in enumerate(recommendations, start=1):
        print(f'{rank}: {recommended}')
    print()


The tokenized_nouns has been saved as "tokenized_nouns.pickle" in "./model"

The embedded_model has been saved as "embedded_model_(2).pickle" in "./model"

아이유 막방
1: 아이유 공연
2: 아이유 앵콜
3: 아이유 기타

아이유 앵콜
1: 아이유 막방
2: 아이유 공연
3: 아이유 기타

아이유 공연
1: 아이유 막방
2: 아이유 앵콜
3: 아이유 기타

아이유 기타
1: 기타로 연주
2: 초보자를 위한 기타 레슨
3: 아이유 앵콜

기타로 연주
1: 아이유 기타
2: 초보자를 위한 기타 레슨
3: 아이유 앵콜

초보자를 위한 기타 레슨
1: 아이유 기타
2: 기타로 연주
3: 아이유 앵콜



## Predict with Pre-trained model
Since we will use pre-trained models, we need to specify which models to use:
- `model_name_tok`
- `model_name_emb`

In [11]:
# Keyword-argument groups that the next cell unpacks into Text_sim_reco(...).

# Language of the data (Korean here), number of recommendations per item,
# and `ratio`.
params_vid = dict(lang='ko', reco_Item_number=3, ratio=0.2)

# Tokenizer / noun-extraction settings plus the saved tokenizer to load.
params_tok = dict(
    stopwords=None,
    extranouns=None,
    verbose=False,
    min_noun_frequency=1,
    max_noun_frequency=80,
    max_frequency_for_char=20,
    min_noun_score=0.1,
    extract_compound=False,
    # The Korean training cell reported saving "tokenized_nouns.pickle";
    # the name is given here without the extension.
    model_name_tok='tokenized_nouns',
)

# Embedding settings plus the saved embedding model to load.
params_emb = dict(
    vector_size=15,
    window=3,
    min_count=1,
    workers=4,
    sg=1,
    # The Korean training cell reported saving "embedded_model_(2).pickle";
    # the name is given here without the extension.
    model_name_emb='embedded_model_(2)',
)

In [12]:
# Build a recommender for the Korean data with both pretrain flags on; the
# recorded output of this cell shows the tokenization and embedding models
# being loaded rather than fitted.
text_sim_reco = Text_sim_reco(
    Items=Items,
    related_to_Items=related_to_Items,
    pretrain_tok=True,
    pretrain_emb=True,
    **params_vid,
    **params_tok,
    **params_emb,
)
text_sim_reco.train()

# Print each item followed by its numbered recommendations (1-based) and a
# blank separator line.
pred = text_sim_reco.predict()
for source_item, recommendations in zip(text_sim_reco.Items, pred):
    print(source_item)
    for rank, recommended in enumerate(recommendations, start=1):
        print(f'{rank}: {recommended}')
    print()

Load pre-trained model for tokenization...

Load pre-trained model for embedding...

아이유 막방
1: 아이유 공연
2: 아이유 앵콜
3: 아이유 기타

아이유 앵콜
1: 아이유 막방
2: 아이유 공연
3: 아이유 기타

아이유 공연
1: 아이유 막방
2: 아이유 앵콜
3: 아이유 기타

아이유 기타
1: 기타로 연주
2: 초보자를 위한 기타 레슨
3: 아이유 앵콜

기타로 연주
1: 아이유 기타
2: 초보자를 위한 기타 레슨
3: 아이유 앵콜

초보자를 위한 기타 레슨
1: 아이유 기타
2: 기타로 연주
3: 아이유 앵콜

