In [8]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from utils import random_cv_split, int_col_to_string, string_col_to_int, load_gensim_model

# Load Data

Read the data into a pandas DataFrame.

Build training data using the not null data and a prediction set using the null data.

Then split up the training data into a training set and a cross validation set by randomly splitting items up by their program priority code.

In [12]:
# Read the budget spreadsheet into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_excel('data/LA_Budget_Data.xlsx')

In [46]:
# Labelled rows: Expense_Type is present and no other column is missing.
full_data = df[df["Expense_Type"].notnull()].dropna(how='any')
# Unlabelled rows: Expense_Type is missing.
empty_data = df[df["Expense_Type"].isnull()]


# Split the unlabelled rows by whether Program_Name is present.
single_empty = empty_data[empty_data["Program_Name"].notnull()]
all_empty = empty_data[empty_data["Program_Name"].isnull()]

# Distinct values of each categorical column within the labelled rows.
priorities, programs, expenses, departments = (
    full_data[column].unique()
    for column in ("Program_Priority", "Program_Name", "Expense_Type", "Dept_Code")
)

# Convert Strings to Ints

Before trying to run word2vec, try just running random forest on data giving each string a unique int id

In [14]:
# Pass each categorical column through string_col_to_int together with the
# distinct values collected for it, in the same order as before.
# NOTE(review): full_data is rebound on every pass, so re-running this cell
# feeds already-converted data back in -- confirm string_col_to_int copes.
for column, values in (
    ("Program_Priority", priorities),
    ("Expense_Type", expenses),
    ("Program_Name", programs),
    ("Dept_Code", departments),
):
    full_data = string_col_to_int(column, values, full_data)

Other than Dept_Code and Program name, there don't appear to be any great correlations with Expense type. I'm not too optimistic about this approach, but let's see how it goes.

In [15]:
# Pairwise correlation between the columns of full_data; as the last
# expression in the cell, the resulting table is displayed.
full_data.corr()

Unnamed: 0,Dept_Code,Program_Name,Program_Priority,Appropriation,Fiscal_Year,Expense_Type
Dept_Code,1.0,0.421186,0.028695,0.03936,0.125308,0.270991
Program_Name,0.421186,1.0,0.200173,0.0331,0.073229,0.050132
Program_Priority,0.028695,0.200173,1.0,0.104004,0.062546,0.013889
Appropriation,0.03936,0.0331,0.104004,1.0,0.01738,0.091814
Fiscal_Year,0.125308,0.073229,0.062546,0.01738,1.0,0.127806
Expense_Type,0.270991,0.050132,0.013889,0.091814,0.127806,1.0


In [16]:
# Feature columns the notebook treats as always available.
guaranteed = ["Dept_Code", "Appropriation"]
# Feature columns that may be missing on some rows.
not_guaranteed = ["Program_Name", "Program_Priority"]
# Every candidate feature column, guaranteed ones first.
all_keys = [*guaranteed, *not_guaranteed]


def train_and_score_rfc(data, train_keys,  iterations=1):
    """
    Fit a 10-tree random forest on the given feature columns and print
    its accuracy on a held-out cross-validation split.

    Parameters
    ----------
    data : DataFrame handed to random_cv_split; it must contain the
        columns in ``train_keys`` plus an "Expense_Type" target column.
    train_keys : list of column names used as features.
    iterations : number of fit/score rounds. Every round refits the same
        estimator on the same train/cv split and prints the score.

    Returns
    -------
    The RandomForestClassifier as left by the last round.
    """
    # Split the frame that was passed in. The previous version always split
    # the global `full_data`, silently ignoring the `data` argument.
    train, cv = random_cv_split(data)

    rfc_low = RandomForestClassifier(n_estimators=10)

    for _ in range(iterations):
        rfc_low.fit(train[train_keys], train["Expense_Type"])
        score = rfc_low.score(cv[train_keys], cv["Expense_Type"])
        print("Keys: ", train_keys)
        print(" Acc: ", score )

    return rfc_low
    
# Score each column on its own, then the guaranteed and the not-guaranteed
# groups. `rfc` ends up holding the model from the last feature set.
for feature_set in (
    ["Dept_Code"],
    ["Appropriation"],
    ["Program_Name"],
    ["Program_Priority"],
    guaranteed,
    not_guaranteed,
):
    rfc = train_and_score_rfc(full_data, feature_set)

Keys:  ['Dept_Code']
 Acc:  0.573002109210218
Keys:  ['Appropriation']
 Acc:  0.6322314049586777
Keys:  ['Program_Name']
 Acc:  0.5889246323529411
Keys:  ['Program_Priority']
 Acc:  0.49446749654218536
Keys:  ['Dept_Code', 'Appropriation']
 Acc:  0.7438646652370741
Keys:  ['Program_Name', 'Program_Priority']
 Acc:  0.5792183477650362


Interesting results here: the best individual indicator is simply Appropriation, which makes sense. Program name comes next, with Dept code not far behind. Program priority is last, which also makes sense because it is a complex free-text description. This is good news: for about 200 rows the only information we have is the Dept code and Appropriation, so for those rows, which we can't run embeddings on, we can still get decent results, given the ~75% accuracy of the guaranteed keys alone.

Now I'm going to try a few combinations just to get a better feel for data

In [17]:
# Try a few feature combinations; `rfc` keeps the model trained on all keys.
for feature_set in (
    ["Program_Name", "Appropriation"],
    ["Program_Priority", "Appropriation"],
    ["Program_Name", "Appropriation", "Dept_Code"],
    all_keys,
):
    rfc = train_and_score_rfc(full_data, feature_set)

Keys:  ['Program_Name', 'Appropriation']
 Acc:  0.757351183361224
Keys:  ['Program_Priority', 'Appropriation']
 Acc:  0.6038794110773545
Keys:  ['Program_Name', 'Appropriation', 'Dept_Code']
 Acc:  0.7877647058823529
Keys:  ['Dept_Code', 'Appropriation', 'Program_Name', 'Program_Priority']
 Acc:  0.7599906520215004


So the best combination is essentially all the keys minus program priority which is something to keep in mind for later. I'm interested to see how the random forest results differ with embedding as opposed to unique values.

Random Forest was my first instinct to try on this data, but I'm going to try an SVM as well in case the data is closer to being linearly separable.

In [11]:
from sklearn import svm


def train_and_score_svc(data, iterations=1):
    """
    Fit two default SVCs -- one on the guaranteed columns, one on the
    guaranteed plus not-guaranteed columns -- and print the accuracy of
    each on a held-out cross-validation split.

    Parameters
    ----------
    data : DataFrame handed to random_cv_split; it must contain the
        columns named in the module-level ``guaranteed`` and
        ``not_guaranteed`` lists plus an "Expense_Type" target column.
    iterations : number of fit/score rounds. Every round refits the same
        two estimators on the same train/cv split.

    Returns
    -------
    (svc_g, svc_ng) : the guaranteed-only model and the all-columns model.
    """
    # Split the frame that was passed in. The previous version always split
    # the global `full_data`, silently ignoring the `data` argument.
    train, cv = random_cv_split(data)

    svc_g = svm.SVC()
    svc_ng = svm.SVC()

    every_key = guaranteed + not_guaranteed

    for _ in range(iterations):

        svc_g.fit(train[guaranteed], train["Expense_Type"])
        score = svc_g.score(cv[guaranteed], cv["Expense_Type"])
        print("SVC guaranteed data Score: ", score)

        svc_ng.fit(train[every_key], train["Expense_Type"])
        score = svc_ng.score(cv[every_key], cv["Expense_Type"])
        print("SVC not guaranteed data Score: ", score)

    return svc_g, svc_ng

# Train and score both SVCs on the labelled data; keep the fitted models.
svc_g, svc_ng = train_and_score_svc(full_data)


SVC guaranteed data Score:  0.6964368347987043
SVC not guaranteed data Score:  0.6881073577047663


The results are not as good as Random Forest, but it might scale better. With a smaller dataset I might use an SVM, but because around 80% of the data is already filled in and 20% is not, I'm leaning towards the random forest.

Also, the SVM performs worse when given the program name and ID, probably because they just add unnecessary complexity to the fit.

# NLP

A major idea in NLP is that words can be encoded into a vector space. Something interesting about this vector space is that the difference in values (i.e. the distance) between similar words and phrases will be smaller than the difference between dissimilar words. I'm going to try to leverage these encodings to see whether there is a broader relationship between program priorities or program names and the expense types.

In [234]:
import gensim
from gensim.models import word2vec
import logging, urllib.request, zipfile

Download and load training data

# Train Gensim

We have to first train Gensim on data in order to load the word2vec of our data. These are some functions to read the zipfile, extract the zipfile, train gensim, and finally save the model under 'mymodel' so we don't have to retrain every time

I packaged all of this into a load_gensim_model function, which takes a root path in which to search for the data and the model. If it finds the model it loads it; otherwise it reads the data, trains the model, saves the model and returns it. You can read the code in my utils.py file.

In [19]:
# Load the word2vec model, using ./data/ under the current working directory
# as the root path. load_gensim_model is defined in utils.py; per the notes
# above, it trains and saves a model when none is found.
model = load_gensim_model(os.getcwd()+"/data/")

Found and verified text8.zip


In [67]:
# Length of the zero vector returned when no word of a sentence is known.
# NOTE(review): assumed to equal the vector size of the loaded word2vec
# model -- confirm against load_gensim_model.
dim = 300

def sentence2vec(words, vectors=None):
    """
    Convert a string of whitespace-separated words into a single vector
    by averaging the word vectors of the words that have an embedding.

    For this dataset no tf-idf weighting is applied, because no word
    appears twice in a single sentence.

    Parameters
    ----------
    words : str, the sentence to embed.
    vectors : optional mapping-like object supporting ``word in vectors``
        and ``vectors[word]``. Defaults to the notebook's ``model.wv``.

    Returns
    -------
    1-D numpy array: the mean of the known word vectors, or ``dim`` zeros
    when the sentence is empty or none of its words is known.
    """
    if vectors is None:
        vectors = model.wv

    known = [vectors[w] for w in words.split() if w in vectors]
    if not known:
        # Fall back to a zero vector sized by `dim`; the previous version
        # mixed `dim` with a hard-coded 300 in its reshape.
        return np.zeros(dim)

    return np.mean(known, axis=0)


def add_vector_column(column_name, keys, df, vectorizer=None):
    """
    Add a ``<column_name>_Vectors`` column holding one embedding vector
    per row, looked up from the row's value in ``column_name``.

    The previous version allocated a 1-D float column and used np.place,
    which scatters successive scalar components of the vector over the
    matching rows, so no row ever received an actual vector.

    Parameters
    ----------
    column_name : name of the column whose values are embedded.
    keys : iterable of the distinct values to embed.
    df : DataFrame to extend. It is modified in place and also returned.
    vectorizer : optional callable mapping a key to its vector. Defaults
        to the notebook's ``sentence2vec``.

    Returns
    -------
    ``df`` with the new object-dtype column. Rows whose value is not in
    ``keys`` get a zero vector. Rows with the same key share one array
    object, so the stored vectors should not be mutated in place.
    """
    if vectorizer is None:
        vectorizer = sentence2vec

    # Embed each distinct key once.
    lookup = {key: vectorizer(key) for key in keys}

    # Zero vector for unmatched rows, shaped like the real embeddings.
    if lookup:
        fallback = np.zeros_like(next(iter(lookup.values())))
    else:
        fallback = np.zeros(dim)

    # A 1-D object array keeps each cell as a whole vector instead of
    # letting numpy/pandas broadcast it into scalars or a 2-D block.
    vectors = np.empty(df.shape[0], dtype=object)
    for row, value in enumerate(df[column_name]):
        vectors[row] = lookup.get(value, fallback)

    df[column_name + "_Vectors"] = vectors

    return df

# Attach a Program_Name_Vectors column to full_data.
# NOTE(review): this rebinds `df`, which earlier held the raw spreadsheet, so
# re-running the data-split cell afterwards would start from this frame.
# NOTE(review): `programs` was collected before full_data["Program_Name"] was
# passed through string_col_to_int; if that helper replaces the strings, no
# row will match these keys -- confirm the intended cell order.
df = add_vector_column("Program_Name", programs, full_data)
    

    
    