# In [None]:  (cell marker left over from the notebook export)
import gzip, ujson, requests, urllib2, wget, re, dill
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
from math import sqrt
from sklearn import cross_validation, grid_search
from sklearn.linear_model import LinearRegression, ElasticNetCV, ElasticNet, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfTransformer
import collections

#Download data
#the url for this dataset is purposefully omitted
url = '...'
filename = wget.download(url)

#decompress the archive
#binary modes are used for the copy so the bytes are written back unchanged
#NOTE(review): the archive name below is hard-coded instead of using
#`filename` (the path wget reports) -- confirm the two always agree
with gzip.open('yelp_train_academic_dataset_business.json.gz', 'rb') as f:
    file_content = f.read()

#save as json file
with open('yelp_academic_dataset_business.json', 'wb') as f:
    f.write(file_content)
#one JSON document per line
with open('yelp_academic_dataset_business.json', 'r') as f:
    data = f.readlines()

#read in as list of dictionaries, keeping only the columns of interest
#(this step used to be repeated twice back to back; once is enough)
keys = ['name','full_address','city','state','latitude', 'longitude','categories','attributes','stars']
list_dict = []
for entry in data:
    item = ujson.loads(entry)
    list_dict.append({key: item[key] for key in keys})

#convert to pandas dataframe
df = pd.DataFrame(list_dict)
#clean up the addresses: multi-line addresses become a single line
df['full_address'] = df.full_address.str.replace('\n',' ')
#clean up the city names: collapse runs of whitespace and trim the ends
#(str.split() with no argument already strips each token)
df['city'] = df.city.map(lambda x: " ".join(x.split()))
#target
y = df['stars']


#City Model
class city_model(BaseEstimator, RegressorMixin):
    """Predict the target as the mean training target of the record's city.

    Parameters
    ----------
    key : str
        Name of the DataFrame column (or dict field) holding the city name.

    Attributes set by ``fit``
    -------------------------
    dic : dict
        Maps each city seen in training to the mean target of its rows.
    mean : float
        Mean target over all training rows; used for cities not in ``dic``.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y):
        """Learn the per-city mean of ``y``.

        X is indexed with ``X[self.key]`` to obtain the city of every row;
        y holds one numeric target per row, in the same order.
        Returns ``self``.
        """
        # Work positionally so y may be a Series, list or array, and so the
        # grouping does not depend on the target being named 'stars' or the
        # key being 'city'.
        cities = np.asarray(X[self.key])
        target = pd.Series(np.asarray(y, dtype=float))
        # Only the learned lookup table and the fallback are kept; the
        # training rows themselves are not stored on (or pickled with) the
        # model.
        self.dic = target.groupby(cities).mean().to_dict()
        self.mean = target.mean()
        return self

    def predict(self, X):
        """Return the learned city mean for the given record(s).

        For a single dict record a scalar is returned. For a DataFrame a
        Series aligned with ``X`` is returned. In both cases a city that was
        not seen during ``fit`` gets the overall training mean.
        """
        if isinstance(X, dict):
            # O(1) dictionary lookup with the global mean as the fallback
            return self.dic.get(X[self.key], self.mean)
        # Map the rows of the frame that was passed in (not the training
        # data); unseen cities map to NaN and are filled with the fallback.
        return X[self.key].map(self.dic).fillna(self.mean)

#initialize, fit and save model to "city_model"
citymodel = city_model('city')
citymodel.fit(df, y)
#smoke-test the fitted model on the training frame
#(the previous argument, `test`, is not defined anywhere in this script)
citymodel.predict(df)
#pickles are byte streams: write in binary mode and close the file promptly
with open('city_model', 'wb') as f:
    dill.dump(citymodel, f)


#lat_long_model
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    """Select a fixed list of columns/fields and return them as a 2-D table.

    Parameters
    ----------
    key : list of str
        Names of the columns (DataFrame input) or fields (dict input) to
        keep, in output order.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return the selected values with shape n_samples x n_features.

        A single dict record yields a one-row list of lists; a DataFrame
        yields the array of the selected columns.
        """
        if isinstance(X, dict):
            # one json record -> a single row
            return [[X[k] for k in self.key]]
        # `.values` is available on every pandas version, whereas
        # `as_matrix()` has been removed from recent releases
        return X[self.key].values
    
class ll_model(BaseEstimator, RegressorMixin):
    """Random-forest regressor tuned with a small, fixed grid search."""

    def __init__(self):
        """Nothing to configure; the search grid is fixed inside ``fit``."""

    def fit(self, X, y):
        """Grid-search a RandomForestRegressor on (X, y) and return ``self``.

        The grid covers ``n_estimators`` in range(10, 50, 20) and
        ``min_samples_leaf`` in range(30, 100, 20); the fitted search object
        is kept in ``self.random_forest_cv``.
        """
        search_space = {
            "n_estimators": range(10, 50, 20),
            "min_samples_leaf": range(30, 100, 20),
        }
        forest = RandomForestRegressor(warm_start=True)
        self.random_forest_cv = grid_search.GridSearchCV(
            forest,
            param_grid=search_space,
            scoring='mean_squared_error',
        )
        self.random_forest_cv.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the fitted search object.

        A single prediction is unwrapped to a scalar; anything longer is
        returned as the array produced by the search object. The raw array is
        left in ``self.tmp`` and the returned value in ``self.result``.
        """
        predictions = self.random_forest_cv.predict(X)
        self.tmp = predictions
        # one input record -> plain number, otherwise keep the array form
        self.result = predictions[0] if len(predictions) == 1 else predictions
        return self.result

#select the coordinates, then fit the grid-searched random forest on them
latln_model = Pipeline([('trans', ColumnSelectTransformer(['latitude','longitude'])),
                        ('est', ll_model())
                      ])

#fit and save the model
latln_model.fit(df, y)
#pickles are byte streams: write in binary mode and close the file promptly
with open('latln_model', 'wb') as f:
    dill.dump(latln_model, f)



#Category model
class CategoryTransformer(BaseEstimator, TransformerMixin):
    """Turn a list-of-categories field into indicator dictionaries.

    Parameters
    ----------
    key : str
        Name of the column (DataFrame input) or field (dict input) whose
        value is a list of category names.

    Attributes set by ``fit``
    -------------------------
    categories : list
        Sorted list of every category seen in the training frame.
    """
    def __init__(self, key):
        # initialization with key
        self.key = key

    def _learn_categories(self, X):
        # Collect the distinct categories across all rows of the frame.
        self.categories = sorted({item for sublist in X[self.key] for item in sublist})

    def fit(self, X, y=None):
        """Record the category vocabulary of a training frame; returns ``self``.

        The vocabulary is learned here rather than in ``transform`` so that
        transforming a different frame later (e.g. at predict time) does not
        overwrite it. A single dict record carries no vocabulary and is
        ignored.
        """
        if not isinstance(X, dict):
            self._learn_categories(X)
        return self

    def transform(self, X):
        """Encode the categories as ``{category: indicator}`` dictionaries.

        For a single dict record, returns one dictionary with an explicit
        1/0 for every category in the learned vocabulary. For a DataFrame,
        returns a list with one dictionary per row holding a 1 for each
        category present in that row.
        """
        if isinstance(X, dict):
            # a set makes each membership test O(1) instead of a list scan
            present = set(X[self.key])
            return {item: int(item in present) for item in self.categories}

        # Backward compatibility: a frame transformed without a prior fit
        # still defines the vocabulary, as it used to.
        if not hasattr(self, 'categories'):
            self._learn_categories(X)
        return X[self.key].map(lambda x: {item: 1 for item in x}).tolist()

class lr_model(BaseEstimator, RegressorMixin):
    """Elastic-net linear regression with cross-validated regularisation."""
    def __init__(self):
        pass

    def fit(self, X, y):
        """Fit an ElasticNetCV on (X, y) and return ``self``.

        The fitted model is kept in ``self.linear_regression_cv``.
        """
        ratios = [.1,.2,.3,.5,.9]

        # Pass the fold count as an integer. The former
        # `cross_validation.KFold(5)` meant "5 samples" (with the default
        # number of folds) in the legacy API, so cross-validation only ever
        # touched the first five rows. An integer `cv` asks ElasticNetCV for
        # 5-fold CV over all samples.
        self.linear_regression_cv = ElasticNetCV(n_alphas=5, l1_ratio=ratios, cv=5, n_jobs=-1)
        self.linear_regression_cv.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the fitted model.

        A single prediction is unwrapped to a scalar; anything longer is
        returned as the array produced by the model.
        """
        predictions = self.linear_regression_cv.predict(X)
        if len(predictions) == 1:
            return predictions[0]
        return predictions
    
#categories -> indicator dicts -> sparse matrix -> TF-IDF weights -> elastic net
cate_model = Pipeline([('trans', CategoryTransformer('categories')),
                       ('vect', DictVectorizer()),
                       ('TF-IDF', TfidfTransformer()),         # adding the TF-IDF
                       ('est', lr_model())
                      ])

#fit and save the model
cate_model.fit(df,y)
#pickles are byte streams: write in binary mode and close the file promptly
with open('cate_model', 'wb') as f:
    dill.dump(cate_model, f)


#Attribute model
class AttributeTransformer(BaseEstimator, TransformerMixin):
    """Flatten a nested attributes dictionary into a single-level one.

    Parameters
    ----------
    key : str
        Name of the column (DataFrame input) or field (dict input) holding
        the possibly nested dictionary, e.g. 'attributes'.
    """
    def __init__(self, key):
        # initialization with key ('attributes')
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def flattern(self, d, parent_key="", sep="_"):
        """Flatten ``d`` into a list of single-entry dictionaries.

        Keys of nested dictionaries are joined to their parent key with
        ``sep``; e.g. ``{'a': {'b': 1}}`` gives ``[{'a_b': 1}]``.
        """
        flat_dict = []
        for k, v in d.items():
            new_key = parent_key + sep + k if parent_key else k
            if isinstance(v, dict):
                # propagate the caller's separator to the nested levels
                # (it used to be reset to "_" on recursion)
                flat_dict.extend(self.flattern(v, new_key, sep))
            else:
                flat_dict.append({new_key: v})
        return flat_dict

    def _flat_record(self, attributes):
        # Merge the single-entry dictionaries from `flattern` into one dict.
        merged = {}
        for pair in self.flattern(attributes):
            merged.update(pair)
        return merged

    def transform(self, X):
        """Return the flattened attributes.

        A single dict record yields one flat dictionary; a DataFrame yields a
        list with one flat dictionary per row.
        """
        if isinstance(X, dict):
            # for json format, flatten the dictionary for attributes
            return self._flat_record(X[self.key])
        # for pandas (used for training), build a list of dictionaries
        return X[self.key].map(self._flat_record).tolist()
    

#attributes -> flat dicts -> sparse matrix -> elastic net
att_model = Pipeline([('trans', AttributeTransformer('attributes')),
                      ('vect', DictVectorizer()),
                      ('est', lr_model())
                     ])

#fit and save model
att_model.fit(df,y)
#pickles are byte streams: write in binary mode and close the file promptly
with open('att_model', 'wb') as f:
    dill.dump(att_model, f)


#Full model
#A transformer that wraps an estimator (such as the pipelines built above)
#and exposes the estimator's predictions as a single feature column
class com_Transformer(BaseEstimator, TransformerMixin):
    """Expose an estimator's predictions as a one-column feature matrix.

    Parameters
    ----------
    estimator : object
        Any object with ``fit(X, y)`` and ``predict(X)``.
    """
    def __init__(self, estimator):
        # The attribute carries the same name as the constructor parameter so
        # that BaseEstimator.get_params() (and therefore clone()) can find it.
        self.estimator = estimator

    @property
    def est(self):
        """Backward-compatible alias for ``estimator``."""
        return self.estimator

    @est.setter
    def est(self, value):
        self.estimator = value

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y); returns ``self``."""
        self.estimator.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped estimator's predictions with shape (n, 1).

        The prediction is first converted to a float array so that scalars
        (single-record predictions) and pandas Series are reshaped the same
        way as numpy arrays.
        """
        predictions = np.asarray(self.estimator.predict(X), dtype=float)
        return predictions.reshape(-1, 1)

#feature union
#feature union: one prediction column per sub-model
features = FeatureUnion(transformer_list =
                        [('city', com_Transformer(city_model('city'))),
                         ('latln', com_Transformer(latln_model)),
                         ('cate', com_Transformer(cate_model)),
                         ('att', com_Transformer(att_model))
                        ])

#pipeline the features with the linear estimator built above
fullmodel = Pipeline([
        ('featureunion', features),
        ('est', lr_model())
    ])

#fit and save full model
fullmodel.fit(df,y)
#pickles are byte streams: write in binary mode and close the file promptly
with open('full_model', 'wb') as f:
    dill.dump(fullmodel, f)