# YELP DATASET

In [25]:
import pandas as pd
import numpy as np

In [26]:
business_df = pd.read_csv('dataset/business_las_vegas_restaurant_dataset.csv', sep=',', header=0)
original_df = pd.DataFrame(data=business_df)

In [27]:
original_df.head()

Unnamed: 0,business_id,is_open,latitude,longitude,postal_code,review_count,stars
0,vJIuDBdu01vCA8y1fwR1OQ,0,36.192284,-115.159272,89106.0,3,1.5
1,kgffcoxT6BQp-gJ-UQ7Czw,1,36.201794,-115.281981,89128.0,13,2.5
2,0jtRI7hVMpQHpUVtUy4ITw,1,36.20199,-115.283122,89128.0,242,4.0
3,JJEx5wIqs9iGGATOagE8Sg,0,36.271169,-115.267759,89149.0,4,2.0
4,zhxnD7J5_sCrKSw5cwI9dQ,1,36.17314,-115.077945,89110.0,16,1.5


In [28]:
from sklearn.model_selection import train_test_split

X = list(zip(original_df['stars'], original_df['review_count']))
Y = np.array(original_df['is_open'])

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=12)

In [29]:
# Gridsearch for KNN
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
params = dict(n_neighbors=range(10,100), weights=['uniform', 'distance'])
grid = GridSearchCV(KNeighborsClassifier(), params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=70, p=2,
           weights='uniform')


## KNN - K Neigbors Classifier

* Numero de vizinhos: 69
* Função de peso utilizada: uniform

In [30]:
 from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=69, p=2,
           weights='uniform')
model.fit(x_train, y_train)

predicted = model.predict(x_test)

print(accuracy_score(y_test, predicted), "\n")
print(confusion_matrix(y_test, predicted),"\n")
print(classification_report(y_test, predicted))

0.675514626219 

[[ 217  447]
 [ 152 1030]] 

             precision    recall  f1-score   support

          0       0.59      0.33      0.42       664
          1       0.70      0.87      0.77      1182

avg / total       0.66      0.68      0.65      1846



In [31]:
# Gridsearch for Linear SVC
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
params = dict(max_iter=[1500,1600,1700,1800,1900,2000], loss=['hinge', 'squared_hinge'])
grid = GridSearchCV(LinearSVC(), params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_estimator_)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1600,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


## Linear SVC - Linear Support Vector Classification

* Número de interações: 1500
* Função loss: squared hinge


In [32]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
model.fit(x_train, y_train)

predicted = model.predict(x_test)

print(accuracy_score(y_test, predicted), "\n")
print(confusion_matrix(y_test, predicted),"\n")
print(classification_report(y_test, predicted))

0.652221018418 

[[ 123  541]
 [ 101 1081]] 

             precision    recall  f1-score   support

          0       0.55      0.19      0.28       664
          1       0.67      0.91      0.77      1182

avg / total       0.62      0.65      0.59      1846



In [35]:
# Gridsearch for Logistic Regression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
params = dict(max_iter=range(10, 100), solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
grid = GridSearchCV(LogisticRegression(), params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_estimator_)































































































LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)




## Logistic Regression

* Número de interações:
* Algoritmo de otimização:


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

model = LogisticRegression(random_state=12, max_iter=100)
model.fit(x_train, y_train)

predicted = model.predict(x_test)

print(accuracy_score(y_test, predicted), "\n")
print(confusion_matrix(y_test, predicted),"\n")
print(classification_report(y_test, predicted))

## Tabela de Comparação

| Algoritmos | Accuracy Score | Precision | Recall | F1-Score |
| :---------- | :--------------: | :---------: | :------: | :--------: |
| KNN | **0.677** | **0.66** | **0.68** | **0.65** |
| Linear SVC |0.652 | 0.64 | 0.65 | 0.64 |
| Logistic Regression |0.637 | 0.58 | 0.64 | 0.53 |