In [9]:
import pandas as pd
import csv
#tf-idf model and cosine similary querying
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import gensim.corpora as corpora
from gensim.models import TfidfModel


def train_tfidf_model(path):
    '''This function reads in the list of processed dreams and creates:
            - texts: a list of lists of words as strings
            - corpus: a bag-of-words list of lists of tuples
            - stringified_corpus: a bag-of-words of list of strings
            - tf_idf_model: a tf idf model trained on the corpus
            - The path I'm using for this project is "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_dreams.csv"'''
   
    with open(path, 'r') as f:
        reader = csv.reader(f)
        data_words_nostops = list(reader)
    
    # Create Dictionary
    id2word = corpora.Dictionary(data_words_nostops)
    # Create Corpus
    texts = data_words_nostops
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    # convert corpus into string for fit_transform later
    stringified_corpus = [' '.join(i) for i in texts]
    # Build tf-idf model
    tf_idf_model = TfidfModel(corpus)
    
    return texts, tf_idf_model, stringified_corpus
    
texts, tf_idf_model, stringified_corpus = train_tfidf_model('/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_dreams.csv'
)


def bow_sample_dreams(path):
    '''This function reads in a single processed dream and returns the dream as a list of strings for querying.
    Paths that can be used are:
    - "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_sample_dream.csv"
    - "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_sample_dream_2.csv"'''

    with open(path, "r") as f:
        reader = csv.reader(f)
        a = list(reader)
        sample_dream = []
        for i in a:
            i = ''.join(i)
            sample_dream.append(i)
        
        processed_sample_dream = ' '.join(sample_dream)
        return processed_sample_dream

processed_sample_dream = bow_sample_dreams("/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/processed_data/processed_sample_dream.csv"
)


def return_similar_docs(training_corpus, new_doc):

    # create tfidf vectorizer
    tfidf = TfidfVectorizer()
    # fit tfidf vectorizer on entire corpus
    queryTFIDF = tfidf.fit_transform(training_corpus)
    # get tfidf weights for the new_doc terms
    sample_query = tfidf.transform([new_doc])
    # create array of cosine similarities of the new_doc terms with each document from entire corpus
    cosine_similarities = linear_kernel(sample_query, queryTFIDF).flatten()
    # return index position of top 5 most similar documents in corpus to the new_doc
    related_docs_indices = print(f'The 5 most similar documents are: {cosine_similarities.argsort()[:-5:-1]}')
    # return cosine similiarty of top 5 most similar documents in corpus to the new_doc
    related_docs_similiarity = print(f'The cosine similarites of the 5 most similar documents are: {cosine_similarities[cosine_similarities.argsort()[:-5:-1]]}')

return_similar_docs(stringified_corpus, processed_sample_dream)

The 5 most similar documents are: [2825  830  861 2889]
The cosine similarites of the 5 most similar documents are: [0.28592152 0.2677321  0.24444793 0.23231523]
(4572, 19087)


In [2]:
processed_sample_dream

'home city water tidal wave started crashing street climbed top tree held tight wave continued crashing saw people swalled current break wave jumped swam shore climbed bridge ran away safe vantage point zoomed space saw one big wave crash whole pacific coast surprise went cascade across rockies great plain appalachian eventually atlantic ocean whole country submerged water'

### Sanity check

Let's read in the original sample dream text file...

In [2]:
path_sample_dream_1 = "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/external/sample_dream.csv"

with open(path_sample_dream_1, 'r') as f:
    reader = csv.reader(f)
    sample_dream_1 = list(reader)

sample_dream_1

[['I was in my home city and right by the water when tidal waves started crashing over the streets. I climbed to the top of a tree and held on as tight as I could but the waves continued crashing and I saw many people swalled up in the current. When there was a break in the waves, I jumped down and swam to shore, where I climbed up a bridge and ran away. Once I was safe, my vantage point zoomed out to space where I saw one big wave crash over the whole pacific coast and to my surprise, went over the cascades, across the rockies, great plains, appalachians, and eventually into the atlantic ocean. The whole country was submerged in water.']]

And compare it to the top most similar dream in the corpus.

In [3]:
path_to_raw_dreams = "/Users/cmeaton/Documents/code/ds/METIS/sea19_ds7_workingdir/project_4/data/raw/final_dataframe.csv"
df = pd.read_csv(path_to_raw_dreams)
raw_dreams = df.selftext.values.tolist()


In [4]:
raw_dreams[2825]

"Last night I dreamt I was visiting a friends house on the beach. We were watching the waves and they were huge. I was excited to play in them. Then we were inside the house and I looked out and the waves were right at the window and started crashing against the house. All of a sudden the house just tilted. We had to lay against the wall as the house started rocking back and forth from the waves. I told her I didn't think this was okay and she said it happened all the time and that it was okay. The whole house was rocking and water was leaking in but I trusted the girl who was telling me that it was totally normal. I feel like the waves were rocking my foundation? But I'm not sure."