In [19]:
import sqlite3
import json
from datetime import datetime
import itertools


from gensim.models.doc2vec import Doc2Vec

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [6]:
model = Doc2Vec.load("../data/models/wikifr_doc2vec.model")

2020-02-10 15:55:51,741 : INFO : loading Doc2Vec object from ../data/models/wikifr_doc2vec.model
2020-02-10 15:56:05,031 : INFO : loading vocabulary recursively from ../data/models/wikifr_doc2vec.model.vocabulary.* with mmap=None
2020-02-10 15:56:05,032 : INFO : loading trainables recursively from ../data/models/wikifr_doc2vec.model.trainables.* with mmap=None
2020-02-10 15:56:05,032 : INFO : loading syn1neg from ../data/models/wikifr_doc2vec.model.trainables.syn1neg.npy with mmap=None
2020-02-10 15:56:05,456 : INFO : loading wv recursively from ../data/models/wikifr_doc2vec.model.wv.* with mmap=None
2020-02-10 15:56:05,457 : INFO : loading vectors from ../data/models/wikifr_doc2vec.model.wv.vectors.npy with mmap=None
2020-02-10 15:56:05,863 : INFO : loading docvecs recursively from ../data/models/wikifr_doc2vec.model.docvecs.* with mmap=None
2020-02-10 15:56:05,863 : INFO : loading vectors_docs from ../data/models/wikifr_doc2vec.model.docvecs.vectors_docs.npy with mmap=None
2020-02-10

In [20]:
def accounts_iterator():
    db = sqlite3.connect("../data/datasets/verbatim.sqlite")
    with db:
        cr = db.cursor()
        accounts = cr.execute("SELECT id, refid, date_entered, date_modified, name, type, industry, description FROM account").fetchall()
    
    print(len(accounts))
    for a in accounts:
        _id, refid, date_entered, date_modified, name, atype, industry, description = a
        data = {
            'id': _id,
            'refid': refid,
            'date_entered': date_entered,
            'date_modified': date_modified,
            'name': name,
            'type': atype,
            'industry': industry,
            'description': ' '.join(description.split()) if description else '',
            
        }
        yield data

def log(x):
    index, data = x
    if index % 100 == 0: print(index)
    return data
        
def preprocess(data):
    new_data = data.copy()
    if new_data['type'] in ['X', '']:
        new_data['type'] = None
    if new_data['industry'] in ['']:
        new_data['industry'] = None

    return new_data


def feature_engineering(data):
    new_data = data.copy()
    
    date_entered = new_data.pop('date_entered', None)
    if date_entered:
        date = datetime.fromisoformat(date_entered)
        y = date.year
        m = date.month
        d = date.day
    else:
        y = m = d = 0
    new_data['year_entered'] = y
    new_data['month_entered'] = m
    new_data['day_entered'] = d
    
    date_modified = new_data.pop('date_modified', None)
    if date_entered:
        date = datetime.fromisoformat(date_modified)
        y = date.year
        m = date.month
        d = date.day
    else:
        y = m = d = 0
    new_data['year_modified'] = y
    new_data['month_modified'] = m
    new_data['day_modified'] = d

    return new_data


def embed_verbatim(data):
    new_data = data.copy()
    db = sqlite3.connect("../data/datasets/verbatim.sqlite")
    with db:
        cr = db.cursor()
        verbatims = cr.execute("SELECT title, content FROM verbatim WHERE parent_refid = '{}'".format(new_data['refid'])).fetchall()
        verbatim_str = ' '.join(map(lambda x: x[0] + ' ' + x[1], verbatims))
        verbatim_vec = model.infer_vector(verbatim_str.split())
        for i, v in enumerate(verbatim_vec):
            new_data['verbatim_{}'.format(i)] = v
            
        desc_vec = model.infer_vector(new_data.pop('description').split())
        for i, v in enumerate(desc_vec):
            new_data['description_{}'.format(i)] = v

    return new_data


industry_set = [
    None,
    'Apparel',
    'Banking',
    'Biotechnology',
    'Chemicals',
    'Communications',
    'Construction',
    'Consulting',
    'Education',
    'Electronics'
    'Energy',
    'Engineering',
    'Entertainment',
    'Environmental',
    'Finance',
    'Government',
    'Healthcare',
    'Hospitality',
    'Insurance',
    'Machinery',
    'Manufacturing',
    'Media',
    'Not For Profit',
    'Other',
    'Recreation',
    'Retail',
    'Shipping',
    'Technology',
    'Telecommunications',
    'Transportation',
    'Utilities',
]
type_set = [
    None,
    'Analyst',
    'Competitor',
    'Customer',
    'Integrator',
    'Other',
    'Partner',
    'Press',
    'Prospect',
    'Provider',
    'Reseller',
    'Supplier'
]

def categorize(data):
    new_data = data.copy()
    
    type_cat = new_data.pop('type')
    for i, key in enumerate(type_set):
        new_data['type_{}'.format(key)] = 0
    new_data['type_{}'.format(type_cat)] = 1
    
    industry_cat = new_data.pop('industry')
    for i, key in enumerate(industry_set):
        new_data['industry_{}'.format(key)] = 0
    new_data['industry_{}'.format(industry_cat)] = 1

    return new_data

def remove_feature(data):
    new_data = data.copy()
    
    new_data.pop('id')    
    new_data.pop('refid')    
    new_data.pop('name')

    return new_data


def dataset_iterator():
    it = accounts_iterator()
    it = map(log, enumerate(it))
    it = map(preprocess, it)
    it = map(feature_engineering, it)
    it = map(embed_verbatim, it)
    it = map(categorize, it)
    it = map(remove_feature, it)
    return it

In [21]:
import pandas as pd

dt = pd.DataFrame(dataset_iterator())

22167
0


AttributeError: type object 'datetime.datetime' has no attribute 'fromisoformat'

In [16]:
dt.to_csv("../data/datasets/verbatim_feature_engineered_dataset.csv")

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# calculate the correlation matrix
corr = dt.corr()
plt.subplots(figsize=(300,300))

# plot the heatmap
heatmap = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
heatmap.figure.savefig("output.png")

ModuleNotFoundError: No module named 'seaborn'