In [1]:
import sys
import json
import csv
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from collections import Counter
from sklearn.base import TransformerMixin
from scipy.sparse import *

In [None]:
businessjson = 'dataset/business.json'

# Load Business data.
# The file is treated as JSON Lines: every non-blank line is parsed as one
# JSON document. The `with` block closes the file handle as soon as parsing is
# finished, and blank lines are skipped so they cannot make json.loads() fail.
with open(businessjson) as business_file:
    business_json_lines = [
        json.loads(line.strip()) for line in business_file if line.strip()
    ]

# Flatten the parsed records into a DataFrame; later cells address the
# nested fields through dotted column names such as 'attributes.BYOB'.
df = json_normalize(business_json_lines)

In [None]:
# Get records from Yelp set for only restaurants.
# astype(str) turns a missing category into the text 'nan', so the mask is
# always a plain boolean Series and never contains NaN.
is_restaurant = df['categories'].astype(str).str.lower().str.contains('restaurant')
df = df[is_restaurant].reset_index(drop=True)

# Optional stricter filter on review volume (currently disabled):
#df = df[df['review_count']>=35]
#df = df.reset_index(drop=True)

# Remove columns of least significance.
columns_to_drop = [
    'address',
    'attributes.AcceptsInsurance',
    'attributes.AgesAllowed',
    'attributes.BYOB',
    'attributes.ByAppointmentOnly',
    'attributes.Corkage',
    'attributes.DietaryRestrictions.dairy-free',
    'attributes.DietaryRestrictions.gluten-free',
    'attributes.DietaryRestrictions.halal',
    'attributes.DietaryRestrictions.kosher',
    'attributes.DietaryRestrictions.soy-free',
    'attributes.DietaryRestrictions.vegan',
    'attributes.DietaryRestrictions.vegetarian',
    'attributes.HairSpecializesIn.africanamerican',
    'attributes.HairSpecializesIn.asian',
    'attributes.HairSpecializesIn.coloring',
    'attributes.HairSpecializesIn.curly',
    'attributes.HairSpecializesIn.extensions',
    'attributes.HairSpecializesIn.kids',
    'attributes.HairSpecializesIn.perms',
    'attributes.HairSpecializesIn.straightperms',
    'attributes.Open24Hours',
    'attributes.RestaurantsCounterService',
    'business_id',
    'categories',
    'hours.Friday',
    'hours.Monday',
    'hours.Saturday',
    'hours.Sunday',
    'hours.Thursday',
    'hours.Tuesday',
    'hours.Wednesday',
    'latitude',
    'longitude',
    'name',
    'neighborhood',
    'state',
    'city',
    'postal_code',
]
df = df.drop(columns_to_drop, axis=1)
# Work on an independent copy so adding the target column below does not
# write into a slice of the original frame.
df = df.copy()

# Classify the records: 1 when the business has at least 3.5 stars AND at
# least 30 reviews, otherwise 0. The comparison is vectorized, so it does not
# depend on the row labels being 0..n-1 and avoids a per-row Python loop.
df['target'] = ((df['stars'] >= 3.5) & (df['review_count'] >= 30)).astype(int)
df.to_csv('business_csv_filter.csv', encoding='utf-8', index=False)



In [None]:
# One-hot encode every string-typed (object dtype) column, then replace any
# remaining missing values with 0.
headers = df.select_dtypes(include=['object']).columns.tolist()
df = pd.get_dummies(data=df, columns=headers).fillna(0)


#print df.head(10)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every string-typed (object dtype) column in place. Missing
# values are first replaced by the placeholder 'Dummy' so that they receive
# a category code of their own.
# NOTE(review): if the one-hot cell above has already replaced the object
# columns, this loop has nothing left to encode -- confirm which of the two
# encoding cells is meant to run.
headers = df.select_dtypes(include=['object']).columns.tolist()

lb_make = LabelEncoder()
for col in headers:
    print(col)
    filled = df[col].fillna('Dummy')
    # fit_transform refits the encoder, so codes are assigned per column.
    df[col] = lb_make.fit_transform(filled)
    print(df[col].head(5))

# Spot-check one encoded column.
print(df['attributes.Ambience.casual'])

In [None]:
# Segregate the data into positive and negative buckets, then split each
# bucket into a train and a test portion for the binary task.
pos_bucket = df.loc[df['target'] == 1].reset_index(drop=True)
neg_bucket = df.loc[df['target'] == 0].reset_index(drop=True)

print(len(pos_bucket))
print(len(neg_bucket))

# Label of the last training row in each bucket. After reset_index the row
# labels run 0..n-1, and a .loc slice includes BOTH of its endpoints.
POS_LAST_TRAIN_ROW = 11588
NEG_LAST_TRAIN_ROW = 11884

# The test slices begin one row after the training slices end. Using the same
# boundary label for both slices would place that row in train AND test.
df_pos_train = pos_bucket.loc[:POS_LAST_TRAIN_ROW]
df_pos_test = pos_bucket.loc[POS_LAST_TRAIN_ROW + 1:]
df_neg_train = neg_bucket.loc[:NEG_LAST_TRAIN_ROW]
df_neg_test = neg_bucket.loc[NEG_LAST_TRAIN_ROW + 1:]

# Stack positives on top of negatives and renumber the rows.
df_train = pd.concat([df_pos_train, df_neg_train], axis=0).reset_index(drop=True)
df_test = pd.concat([df_pos_test, df_neg_test], axis=0).reset_index(drop=True)

# Only the training split is written out; the test export is disabled.
df_train.to_csv('business_binary_le_final_train.csv', encoding='utf-8', index=False)
#df_test.to_csv('business_binary_final_test.csv', encoding='utf-8', index=False)

In [2]:
# Read the training rows: in every row the last field is the label and all
# preceding fields are the features. Only training containers are assigned
# here, so re-running this cell leaves already-loaded test data alone.
# NOTE(review): no visible cell writes this file -- confirm where it is produced.
with open('business_multi_le_finally_train.csv', 'r') as f:
    # csv.reader yields an empty list for a blank line; those are dropped so
    # the label lookup below cannot raise IndexError.
    your_list = [row for row in csv.reader(f) if row]

print(your_list[0])
dataset = [row[0:-1] for row in your_list]
labels = [row[-1] for row in your_list]

['2', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '2', '2', '2', '2', '2', '2', '1', '2', '1', '0', '1', '0', '0', '0', '0', '2', '2', '2', '2', '1', '0', '0', '0', '1', '0', '0', '2', '1', '2', '2', '2', '2', '2', '1', '2', '1', '0', '1', '0', '1', '2', '1', '1', '1', '0', '1', '2', '1', '3']


In [3]:
# Read the test rows: in every row the last field is the label and all
# preceding fields are the features.
# NOTE(review): no visible cell writes this file -- confirm where it is produced.
with open('business_multi_le_finally_test.csv', 'r') as f:
    # csv.reader yields an empty list for a blank line; those are dropped so
    # the label lookup below cannot raise IndexError.
    your_list = [row for row in csv.reader(f) if row]

dataset_test = [row[0:-1] for row in your_list]
labels_test = [row[-1] for row in your_list]
print(len(dataset_test))

3495


In [4]:
# Convert each training row from text to integers (an empty field counts as 0)
# and pack the result into a CSR sparse matrix.
docs = [[int(cell) if cell != '' else 0 for cell in row] for row in dataset]
dataset_csr_train = csr_matrix(docs)

In [5]:
print(len(dataset_test))

# Convert each test row from text to integers (an empty field counts as 0)
# and pack the result into a CSR sparse matrix.
docs_test = [[int(cell) if cell != '' else 0 for cell in row] for row in dataset_test]
dataset_csr_test = csr_matrix(docs_test)
print(dataset_csr_test.shape)

3495
(3495, 60)


In [6]:
from sklearn import preprocessing

# Apply L2 normalization to the training matrix first, then the test matrix.
mat_train, mat_test = (
    preprocessing.normalize(matrix, norm='l2')
    for matrix in (dataset_csr_train, dataset_csr_test)
)

In [7]:

from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the 50 features with the highest chi-squared score against the
# training labels. The selector is fitted on the training split only and then
# applied unchanged to both splits.
ch2_model = SelectKBest(chi2, k=50)
ch2_model.fit(mat_train, labels)

X_chi2 = ch2_model.transform(mat_train)
X_chi2_test = ch2_model.transform(mat_test)

In [8]:
from sklearn.metrics import f1_score

# Multi-layer perceptron with two hidden layers (12 units, then 7 units),
# optimized with L-BFGS; random_state is fixed so runs are repeatable.
clf = MLPClassifier(
    hidden_layer_sizes=(12, 7),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)

In [13]:
# Turn the labels of both splits into rounded floats.
labels, labels_test = (
    [np.round(float(raw)) for raw in split]
    for split in (labels, labels_test)
)

In [14]:
# Train once, then score the same fitted model on both splits. Refitting on
# identical data before the test prediction would only repeat the training
# work: with a fixed random_state and no warm start it yields the same model.
clf.fit(X_chi2, labels)

# Micro-averaged F1 on the training split.
y_predict = clf.predict(X_chi2)
print(f1_score(labels, y_predict, average='micro'))

# Micro-averaged F1 on the held-out test split.
y_predict_test = clf.predict(X_chi2_test)
print(f1_score(labels_test, y_predict_test, average='micro'))

0.759318881019
0.750214592275
