In [65]:
import os
import glob
import pandas as pd
import numpy as np
import pickle

import matplotlib
import matplotlib.pyplot as plt


import seaborn as sns

%matplotlib inline

from math import sqrt
from sklearn.linear_model import Ridge, RANSACRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler 
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans

In [66]:
def train_test_measure_reg(df, model, l_col, f_cols, scale_f=False, poly_f=False, picklize=False, save_dir="/"):
    """
    In: df        - DataFrame object that we want to learn from
        model     - sklearn ML object
        l_col     - label column name
        f_cols    - list of feature column names
        
    Out: tuple of mse and pred-true pairing DataFrame
    """
    
    X = df[f_cols].values
    y = df[l_col].tolist()
    
    X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.7, random_state=42)
    
    
    if poly_f:
        X_train, X_test = _transform(X_train, X_test, PolynomialFeatures())
        
    if scale_f:
        X_train, X_test = _transform(X_train, X_test, StandardScaler())
    
    reg = model
    reg.fit(X_train, y_train)
    
    if picklize:
        pickle.dump(reg, open(save_dir + "reg_model.p", "wb"))
        
    
    pred = reg.predict(X_test)
    
    
    pred_df = pd.DataFrame(data = {'pred': pred, 'true': y_test}, columns = ['pred', 'true'])
    mse = mean_squared_error(y_test, pred)
    return sqrt(mse), pred_df


def train_cl(df, model, f_cols, picklize=False, save_dir="/"):
    """
    In: df        - DataFrame object that we want to learn from
        model     - sklearn ML object
        l_col     - label column name
        f_cols    - list of feature column names
    Out: tuple of mse and pred-true pairing DataFrame
    """
    
    X = df[f_cols].values
    
    cl = model
    cl.fit(X)
    
    if picklize:
        pickle.dump(model, open(save_dir + 'cl_model.p', "wb"))
        
    pred = cl.predict(X)

    df['pred'] = pred
    
    return df


def _transform(X_train, X_test, transformer):
    tr = transformer.fit(X_train)
    
    X_train = tr.transform(X_train)
    X_test = tr.transform(X_test)
    
    return X_train, X_test 


def predict(model, features):
    """
    IN:
    reg: sklearn prediction model (pickle file)
    features: feature column vector

    OUT: float of predicted tax deduction
    """
    preds = model.predict(features)
    return preds[0]

def load_model(path):
    """
    Loads pickle model for prediction stuff
    """
    return pickle.load(open(path, "rb"))

## 1.0 Getting data

In [67]:
fname = "/Users/yan/Documents/govhack/data/Allyearssamplefile/all_data/2014-15.txt"
df = pd.read_csv(fname, index_col=None, header=0)

In [68]:
# TODO: dummy variables: Occ_code, Partner_status, Region

reg_cols = ['Gender', 'age_range', 'Sw_amt']
cl_cols = ['Gender', 'age_range', 'Sw_amt']

## 2.0 Training models

In [75]:
save_dir = "/Users/yan/Documents/govhack/Lambda/predstuff/models/"

mse, pred_df = train_test_measure_reg(df=df, 
                                      model=Ridge(), 
                                      l_col='Tot_ded_amt', 
                                      f_cols=reg_cols, 
                                      scale_f=False, 
                                      poly_f=False, 
                                      picklize=True,
                                      save_dir=save_dir)

print "got regression model with mse = {}".format(mse)

cl_df = train_cl(df=df, 
                 model=KMeans(), 
                 f_cols=cl_cols, 
                 picklize=True,
                 save_dir=save_dir)

print "got clustering model"

got regression model with mse = 7490.86494976
got clustering model




## 3.0 Getting models

In [76]:
models_dir = "/Users/yan/Documents/govhack/Lambda/predstuff/models/"

In [77]:
reg_model = load_model(models_dir + "reg_model.p")
print reg_model

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [78]:
cl_model = load_model(models_dir + "cl_model.p")
print cl_model

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)


## 4.0 Processing flow example

In [80]:
#'Gender', 'age_range', 'Sw_amt'

f_vector_reg = [1, 10, 200000]
pred = predict(reg_model, f_vector_reg) # Getting regression prediction
print pred

f_vector_cl = [1, 10, 200000]
cl = predict(cl_model, f_vector_reg)
print cl

5584.41040012
7




## 5.0 Taking look at the clusters