In [None]:
from __future__ import print_function
from parse import *
from docx import Document
from functools import reduce
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
    
def modify_working_dir(dir_name):
    """Make ``dir_name`` the current working directory of the process.

    Point it at whichever folder the next step should read from or write to
    (for example the folder that holds the docx files).
    """
    target_dir = dir_name
    os.chdir(target_dir)

def filenames():
    """Return the names of the ``.docx`` files in the current working directory.

    The working directory is printed first, for reference. Names are returned
    in the order ``os.listdir`` reports them.
    """
    print(os.getcwd())
    entries = os.listdir(str(os.getcwd()))
    return [entry for entry in entries if entry.endswith(".docx")]

def word_doc(file):
    """Return the text of a .docx file as one continuous string.

    Args:
        file: Whatever ``Document`` accepts to open a document (the callers
            in this file pass a filename).

    Returns:
        The text of every paragraph, in document order, joined by single
        spaces. A document without any paragraph yields an empty string.
    """
    document = Document(file)
    # str.join is linear in the total text size and, unlike reduce() without
    # an initial value, does not raise on an empty paragraph list.
    return ' '.join(para.text for para in document.paragraphs)

def parse_text(text, filename):
    """Extract the fields named in the global ``list_headings`` from one document.

    Each heading is looked up case-insensitively with the search pattern
    ``"<heading>: {}:"``. From the captured text, everything after the last
    double space is dropped (presumably the label of the following field --
    verify against the document layout).

    Args:
        text: Full text of the document.
        filename: Name of the file the text came from.

    Returns:
        A dict with one entry per heading (the string "None" when the heading
        could not be extracted) plus ``"filename"``, which always holds the
        ``filename`` argument.
    """
    profile = {}
    second = ": {}:"
    lowered = text.lower()  # lower-case once instead of once per heading
    for heading in list_headings:
        try:
            match = search(heading.lower() + second, lowered)
            profile[heading] = (list(match)[0]).rsplit("  ", 1)[0]
        except Exception:
            # Best effort: a heading that is absent (search() returned None)
            # or cannot be extracted is recorded as the string "None".
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
            profile[heading] = "None"
    # Record the source file unconditionally. This used to happen only when a
    # heading lookup failed, so a fully parsed document could end up without
    # its filename (or with text parsed from the document in its place).
    profile["filename"] = filename
    return profile

def get_parsed():
    """Parse every docx file in the working directory into a single dataframe.

    Returns:
        A two-item list: the dataframe restricted to (and ordered by) the
        columns named in the global ``list_headings``, followed by the tag
        string "dataframe".
    """
    profiles = [parse_text(word_doc(str(name)), name) for name in filenames()]
    parsed = pd.DataFrame(profiles)
    selected = parsed[list_headings]
    return [selected, "dataframe"]

def create_corpus():
    """Collect the full text of every docx file in the working directory.

    Returns:
        A two-item list: the list of document texts (one string per file, in
        ``filenames()`` order), followed by the tag string "corpus".
    """
    texts = [word_doc(str(name)) for name in filenames()]
    return [texts, "corpus"]

def modeling():
    """Fit an LDA topic model on the pickled corpus and compute document similarity.

    Reads ``stopwords_custom.txt`` from the current working directory, then
    switches to the global ``result_dir`` and loads ``corpus_phrack.p`` from
    there. The number of topics comes from the global ``topics_for_lda``.

    Returns:
        ``[lda, tf_feature_names, doc_topic_dist, cs]`` where ``lda`` is the
        fitted model, ``tf_feature_names`` the vocabulary of the count
        vectorizer, ``doc_topic_dist`` a DataFrame of per-document topic
        percentages (each row sums to 100) and ``cs`` a DataFrame holding the
        pairwise TF-IDF cosine similarity of the documents.
    """
    # Read stopwords (whitespace-separated) before leaving the current directory
    with open("stopwords_custom.txt", 'r') as f:
        stopwords = f.read().split()

    # Read corpus; the context manager closes the pickle file deterministically
    modify_working_dir(result_dir)
    with open("corpus_phrack.p", 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)

    # Count term frequency over unigrams and bigrams
    tf_vectorizer = CountVectorizer(stop_words=stopwords, min_df=5, max_df=10, ngram_range=(1, 2))
    tf = tf_vectorizer.fit_transform(corpus)
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # its replacement and fall back for the old releases that lack it.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    print(tfidf_matrix.shape)

    # Calculate cosine similarity of every document against every document
    cs = cosine_similarity(tfidf_matrix[0:], tfidf_matrix)
    cs = pd.DataFrame(cs)

    # LDA: the topic-count keyword was renamed from n_topics to n_components,
    # so try the current name first and fall back to the legacy one.
    lda_options = dict(max_iter=500, learning_method='online', random_state=0)
    try:
        lda = LatentDirichletAllocation(n_components=topics_for_lda, **lda_options)
    except TypeError:
        lda = LatentDirichletAllocation(n_topics=topics_for_lda, **lda_options)
    lda.fit(tf)
    doc_topic = np.asarray(lda.transform(tf))

    # Normalize every document row to sum to 1; keepdims keeps the row sums
    # as a column so the division broadcasts per row.
    doc_topic_dist = doc_topic / doc_topic.sum(axis=1, keepdims=True)

    doc_topic_dist = pd.DataFrame(doc_topic_dist * 100)
    return [lda, tf_feature_names, doc_topic_dist, cs]

def print_top_words(model):
    """Print the top words of each topic and write the result CSV files.

    Args:
        model: The list returned by ``modeling()``:
            ``[lda, feature_names, doc_topic_dist, cosine_similarity]``.

    Side effects:
        Reads ``handles.csv`` from the current working directory, switches to
        the global ``result_dir`` and writes
        ``topic distribution for prophiles.csv``,
        ``topic-word distribution.csv`` and ``cosine similarity.csv``.
    """
    lda = model[0]
    feature_names = model[1]
    topic_dist = model[2]
    cs = model[3]
    n_top_words = 20

    list_topic_words = []
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so walk it backwards to get the
        # n_top_words highest-weighted terms first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        words_for_topic = " ".join(feature_names[i] for i in top_indices)
        print(words_for_topic)
        list_topic_words.append({topic_idx: words_for_topic})

    # Read file for filenames and handle names
    handles_df = pd.read_csv('handles.csv', sep=',')

    # Get topic distribution across the documents
    list_topic_dist_docs = []
    for doc_idx in range(len(topic_dist.index)):
        topic_for_doc = list(topic_dist.iloc[doc_idx, :])

        # Defaults for documents without a usable row in handles.csv. The
        # filename default prevents the NameError / stale value the lookup
        # failure used to cause; the handle falls back to the row number.
        filename = "None"
        handle = doc_idx
        try:
            filename = handles_df.iloc[doc_idx, 1].strip()
            handle = handles_df.iloc[doc_idx, 2].strip()
        except (IndexError, AttributeError):
            # Row/column missing or the cell is not a string: keep whatever
            # was resolved before the failure.
            pass

        dict_topic_dist = {'filename': filename, 'handle': handle}
        # One column per topic, numbered from 1
        for topic_number, share in enumerate(topic_for_doc, start=1):
            dict_topic_dist['topic_' + str(topic_number)] = share
        dict_topic_dist['top_topic'] = int(topic_for_doc.index(max(topic_for_doc))) + 1

        # Append dictionary to the final list for dataframe creation
        list_topic_dist_docs.append(dict_topic_dist)

    # Sort and generate topic distribution file
    # (DataFrame.sort was removed from pandas; sort_values is its replacement)
    df = pd.DataFrame(list_topic_dist_docs).sort_values(by="top_topic", ascending=True)
    modify_working_dir(result_dir)
    df.to_csv("topic distribution for prophiles.csv")

    # Generate topics and their words distribution
    df = pd.DataFrame(list_topic_words)
    df.to_csv("topic-word distribution.csv")

    # Generate cosine similarity csv
    cs.to_csv("cosine similarity.csv")
        
        
def create_file(list):
    """Write a tagged payload into ``result_dir`` and return a status message.

    Args:
        list: Two-item sequence ``[data, kind]`` as produced by
            ``get_parsed()`` or ``create_corpus()``. With kind "dataframe",
            ``data`` is written to ``prophiles.csv`` and its filename/Handle
            columns to ``handles.csv``; with kind "corpus", ``data`` is
            pickled to ``corpus_phrack.p``. Any other kind writes nothing.
            (The parameter name shadows the builtin; it is kept so existing
            callers keep working.)

    Returns:
        The string ``"<kind> created"``.

    The working directory is switched to ``result_dir`` for the writes and is
    always set to ``data_dir`` afterwards, even if a write fails.
    """
    data, kind = list[0], list[1]
    modify_working_dir(result_dir)
    try:
        if kind == "dataframe":
            data.to_csv('prophiles.csv')
            data[['filename', 'Handle']].to_csv("handles.csv")
        elif kind == "corpus":
            # Close the pickle file explicitly so the data is flushed before
            # a later step reads it back.
            with open("corpus_phrack.p", 'wb') as corpus_file:
                pickle.dump(data, corpus_file)
    finally:
        modify_working_dir(data_dir)
    return (str(kind) + " created")

if __name__ == '__main__':
    # Script entry point: configures the run, then parses the docx profiles,
    # builds the pickled corpus and runs the topic modeling, each step being
    # switchable through the flags below.

    # Add/remove the headings to be parsed, in the order wanted for the final parsed dataframe
    # NOTE(review): "Height & Weight" is listed twice, which selects that
    # column twice in get_parsed() -- confirm whether this is intended.
    list_headings = ["filename", "Handle", "Past handles", "Handle origin", "Call him", "Call her", "Age of your body",
         "Date of Birth", "Produced in", "Height", "weight", "Height & Weight", "Eye color", "Hair Color", "Computers",
         "Admin of", "Sites Frequented", "URLs", "Women", "Cars", "Foods", "Height & Weight", "Projects", "Alcohol",
         "Books & Authors", "Authors", "Music", "Drugs", "I like", "I dislike"]

    # Variables
    topics_for_lda = 10

    create_csv = True
    create_doc_corpus = True
    create_topics = True

    # Working directories
    os.chdir("C:\\Users\\gaura\\Documents\\MIM - UMCP\\Data Science Resources\\Python\\Projects\\phrack")
    script_dir = str(os.getcwd())
    result_dir = (script_dir + '\\result')
    # Backslash escaped explicitly: '\d' is an invalid escape sequence that
    # only worked by accident (same resulting path as before).
    data_dir = (script_dir + '\\data')

    # Modify working directory to the folder where you keep all files relative to the script
    modify_working_dir(data_dir)

    if create_csv:
        print(create_file(get_parsed()))
        print("csv of parsed prophiles created")
    if create_doc_corpus:
        print(create_file(create_corpus()))
    if create_topics:
        print_top_words(modeling())

In [1]:
# Notebook cell: set up logging, report the temp folder and load the pickled
# corpus written by the previous cell.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import os
import pickle  # imported here so the cell does not rely on an earlier cell's import
import tempfile
TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

from gensim import corpora

# corpus_phrack.p is looked up in the current working directory; the context
# manager closes the file once the corpus is loaded.
with open("corpus_phrack.p", 'rb') as corpus_file:
    documents = pickle.load(corpus_file)
print(documents)

Folder "C:\Users\gaura\AppData\Local\Temp" will be used to save temporary dictionary and corpus.


ImportError: No module named 'gensim'