# Model Evaluation

This notebook runs train.py for different config.yaml parameters such as model type and text preprocessing. The results of each run are saved in the 'runs' folder. The results are compiled and saved as a pandas dataframe to be used by the streamlit app. 

In [1]:
import pickle
import json
import yaml
import pandas as pd

from os import listdir
from os.path import isfile, join

In [2]:
# Clear the runs folder
!rm ./runs/*

In [3]:
# Functions to manipulate the config yaml
def reset_config():
    """Build a fresh default configuration dictionary.

    Every call returns a new dict with an empty ``model`` name and an
    ``args`` section in which all text-preprocessing flags are off, model
    saving is off, and validation saving is on (directory ``./runs/``).

    Returns:
        dict: ``{"model": "", "args": {...}}`` with the default values.
    """
    default_args = dict(
        data_path="./data/raw/sample_10000.csv",
        lowercase=False,
        remove_punctuation=False,
        remove_stopwords=False,
        lemmatize=False,
        save_model=False,
        model_dir="./models/",
        save_validation=True,
        validation_dir="./runs/",
    )
    return {"model": "", "args": default_args}

def set_model(config, model):
    """Store ``model`` under the ``'model'`` key of ``config``.

    The dictionary is modified in place and also returned, so the call can
    be used either for its side effect or in an assignment.

    Args:
        config: configuration dictionary to update.
        model: value to store as the model name.

    Returns:
        The same ``config`` object that was passed in.
    """
    config.update(model=model)
    return config
    
def set_nlp_args(config, preprocess):
    """Set all four text-preprocessing flags in ``config['args']`` at once.

    ``lowercase``, ``remove_punctuation``, ``remove_stopwords`` and
    ``lemmatize`` are all assigned the same ``preprocess`` value. The
    dictionary is modified in place and also returned.

    Args:
        config: configuration dictionary containing an ``'args'`` dict.
        preprocess: value assigned to every preprocessing flag.

    Returns:
        The same ``config`` object that was passed in.
    """
    nlp_flags = ('lowercase', 'remove_punctuation', 'remove_stopwords', 'lemmatize')
    args = config['args']
    for flag in nlp_flags:
        args[flag] = preprocess
    return config

In [4]:
# Run training for all models with/without processing
models = ['tfidf', 'doc2vec', 'bert']
for m in models:
    print('Running model ' + m)
    config = reset_config()
    config = set_model(config, m)
    config = set_nlp_args(config, False)

    with open('./config.yaml', "w") as ff:
        yaml.dump(config, ff, default_flow_style=False)

    !python3 train.py -y './config.yaml'
    
    config = reset_config()
    config = set_model(config, m)
    config = set_nlp_args(config, True)

    with open('./config.yaml', "w") as ff:
        yaml.dump(config, ff, default_flow_style=False)

    !python3 train.py -y './config.yaml'

Running model tfidf
[nltk_data] Downloading package stopwords to /Users/elmi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/elmi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2020-06-11 16:00:24,569 gensim.corpora.dictionary INFO     adding document #0 to Dictionary(0 unique tokens: [])
2020-06-11 16:00:24,570 gensim.corpora.dictionary INFO     built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-06-11 16:00:25,068 transformers.file_utils INFO     PyTorch version 1.5.0 available.
2020-06-11 16:00:25,214 root         INFO     {'data_path': './data/raw/sample_10000.csv', 'lemmatize': False, 'lowercase': False, 'model_dir': './models/', 'remove_punctuation': False, 'remove_stopwords': False, 'save_model': False, 'save_validation': True, 'validation_dir': './runs/'}
2020-06-11 16:00:25,214 root         INFO  

In [5]:
# Read output pickle files and store in dictionary.
# Two kinds of files in ./runs are skipped on purpose:
#   * hidden files (names starting with '.'), which a shell glob like
#     `rm ./runs/*` does not remove and which are not run results;
#   * graph_data.pkl, the compiled DataFrame that this notebook itself saves
#     into the same folder, so re-running this cell does not try to read it
#     as a run result.
# The names are sorted so the collected lists have a reproducible order
# (listdir returns entries in arbitrary order).
files = sorted(
    f for f in listdir('./runs')
    if isfile(join('./runs', f))
    and not f.startswith('.')
    and f != 'graph_data.pkl'
)
config_list = []

graph = {'model': [], 'processed': [], 'similarity': []}
for fx in files:
    f = './runs/' + fx
    # NOTE(review): pickle.load can execute arbitrary code; the files in ./runs
    # are expected to come from the training runs above - do not point this at
    # untrusted data.
    with open(f, "rb") as file:
        config = pickle.load(file)

    graph['model'].append(config['model'])
    graph['similarity'].append(config['output'])
    # The 'lowercase' flag is used as the marker for "preprocessing was on"
    graph['processed'].append(bool(config['args']['lowercase']))

    config_list.append(config)

graph

{'model': ['tfidf', 'bert', 'tfidf', 'doc2vec', 'doc2vec', 'bert'],
 'processed': [False, True, True, True, False, False],
 'similarity': [0.38998098552654425,
  0.8900391459465027,
  0.32663569612868903,
  0.4791419357061386,
  0.40171354711055757,
  0.8834213316440582]}

In [6]:
# Build a graph-friendly DataFrame from the collected results, ordered by
# model name in descending order, and save it as a pickle
graph_df = pd.DataFrame(graph).sort_values(by='model', ascending=False)

with open('runs/graph_data.pkl', "wb") as out_file:
    pickle.dump(graph_df, out_file)