In [13]:
import pickle
import pandas as pd
import csv
# tf-idf model and cosine similarity querying
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import gensim.corpora as corpora
from gensim.models import TfidfModel


def train_tfidf_model(path, output_dir=None):
    """Train TF-IDF representations from a CSV of tokenised dreams and pickle them.

    Each CSV row is treated as one document and each field as one token.
    From those rows the function builds:
        - texts: a list of lists of words as strings
        - corpus: a gensim bag-of-words list of lists of (id, count) tuples
        - stringified_corpus: a list of strings, one space-joined string per
          document
        - tf_idf_model: a gensim TfidfModel trained on the corpus
        - a scikit-learn TF-IDF document-term matrix fitted on
          stringified_corpus

    Three pickle files are written into ``output_dir``. The file names are
    kept as they were for compatibility with existing loaders, even though
    they do not describe the contents precisely:
        - tfid_vectorizer.sav: the transformed document-term matrix (the
          return value of ``fit_transform``), NOT the fitted vectorizer
        - tfidf_model.sav: the gensim TfidfModel
        - bow_corpus.sav: stringified_corpus (joined strings, not the
          bag-of-words corpus)

    NOTE(review): the fitted TfidfVectorizer itself is never persisted, so
    new query text cannot be projected into the saved matrix's vocabulary
    from these files alone -- confirm whether that is intended.

    Args:
        path: Path to the processed-dreams CSV. The path used for this
            project is "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_dreams.csv".
        output_dir: Directory that receives the pickle files. Defaults to the
            project's original hard-coded ``pickled_model_output/tfidf``
            directory. It is created if it does not exist.

    Returns:
        A tuple ``(texts, tf_idf_model, stringified_corpus)``.
    """
    import os  # local import: only needed to build the output file paths

    if output_dir is None:
        output_dir = "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/src/models/pickled_model_output/tfidf"
    # Fail early (before the training work) if the directory cannot be created.
    os.makedirs(output_dir, exist_ok=True)

    # The csv module documentation asks for newline='' so that the reader,
    # not the text layer, interprets line endings inside quoted fields.
    with open(path, 'r', newline='') as f:
        texts = list(csv.reader(f))

    # Map every distinct token to an integer id.
    id2word = corpora.Dictionary(texts)
    # Bag-of-words representation of each document.
    corpus = [id2word.doc2bow(text) for text in texts]
    # Space-joined documents, the input form TfidfVectorizer expects.
    stringified_corpus = [' '.join(tokens) for tokens in texts]

    # gensim TF-IDF model over the bag-of-words corpus.
    tf_idf_model = TfidfModel(corpus)

    # scikit-learn TF-IDF matrix fitted on the entire corpus.
    tfidf = TfidfVectorizer()
    queryTFIDF = tfidf.fit_transform(stringified_corpus)

    # Persist the artifacts; `with` guarantees each file is flushed and closed.
    artifacts = (
        ("tfid_vectorizer.sav", queryTFIDF),
        ("tfidf_model.sav", tf_idf_model),
        ("bow_corpus.sav", stringified_corpus),
    )
    for filename, artifact in artifacts:
        with open(os.path.join(output_dir, filename), 'wb') as out:
            pickle.dump(artifact, out)

    print(tf_idf_model)
    return texts, tf_idf_model, stringified_corpus
    
# Train on the processed dreams CSV. The call is left as the final expression
# so the returned (texts, tf_idf_model, stringified_corpus) tuple is echoed.
processed_dreams_csv = '/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_dreams.csv'
train_tfidf_model(processed_dreams_csv)



TfidfModel(num_docs=4572, num_nnz=261826)


([['stuck',
   'facility',
   'people',
   'mental',
   'issue',
   'meet',
   'girl',
   'starnge',
   'thing',
   'begin',
   'happen',
   'meet',
   'form',
   'group',
   'find',
   'fall',
   'love',
   'girl',
   'like',
   'show',
   'maybe',
   'like',
   'stick',
   'together',
   'leave',
   'reject',
   'demon',
   'monster',
   'shape',
   'shift',
   'black',
   'dog',
   'sacrifice',
   'people',
   'nowhere',
   'capture',
   'demon',
   'girl',
   'away',
   'release',
   'stumble',
   'city',
   'fight',
   'sure',
   'girl',
   'dad',
   'find',
   'bring',
   'sit',
   'strange',
   'room',
   'big',
   'warehouse',
   'discuss',
   'happen',
   'feeling',
   'stay',
   'attempt',
   'leave',
   'learn',
   'somehow',
   'florence',
   'italy',
   'far',
   'american',
   'home',
   'return',
   'facility',
   'find',
   'girl',
   'dad',
   'figure',
   'everyone',
   'leave',
   'dad',
   'longer',
   'work',
   'leave',
   'doctor',
   'assistant',
   'girl',
   '