This notebook tackles the star-rating classification problem on the Yelp dataset.
The objective is to predict a business's star rating based on its:
- business review count
- total number of checkins
- state where business is located
- city where business is located

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score


## 1. Load Data

In [None]:
# Both dumps are read with lines=True, i.e. one JSON record per line.
df_business, df_checkin = (
    pd.read_json(json_path, lines=True)
    for json_path in ('../../data/business.json', '../../data/checkin.json')
)

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Column dtypes and non-null counts of the business table as loaded.
df_business.info()

In [None]:
# Preview the first rows of the business table.
df_business.head()

In [None]:
# Keep only the businesses that have no missing value in any column.
df_business = df_business.dropna(axis=0, how='any')

In [None]:
# Safeguard only: once rows with any NaN have been dropped, nothing is left to fill here.
df_business = df_business.assign(review_count=df_business['review_count'].fillna(0))

In [None]:
# Safeguard only: once rows with any NaN have been dropped, nothing is left to fill here.
df_business = df_business.assign(stars=df_business['stars'].fillna(0))

In [None]:
# Summary statistics of the numeric business columns.
df_business.describe()

In [None]:
# Mean of every numeric column per star level. numeric_only=True is required on
# pandas >= 2.0, where averaging the string columns (name, city, ...) raises
# instead of silently dropping them.
stars = df_business.groupby('stars').mean(numeric_only=True)

In [None]:
# Pearson correlation between the per-star-level column means computed above.
stars.corr(method='pearson')

In [None]:
# Re-check dtypes and non-null counts after the cleaning cells above.
df_business.info()

In [None]:
# Numeric columns used for the pairwise plots below.
numeric_columns = ['latitude', 'longitude', 'review_count', 'stars']
numeric_features = df_business[numeric_columns]

In [None]:
# Divide every column by its own maximum. Despite the variable name this is
# max-scaling, not z-score standardization.
# NOTE(review): for a column whose maximum is negative (possibly longitude) the
# sign flips -- confirm this is acceptable for a purely visual check.
column_maxima = numeric_features.max()
numeric_features_standardized = numeric_features.div(column_maxima, axis='columns')

In [None]:
# Scatter-matrix of the scaled numeric features to eyeball pairwise relations.
ax = sns.pairplot(data=numeric_features_standardized)
plt.show()

In [None]:
# One distribution plot per numeric column, each shown as its own figure.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider
# sns.histplot(..., kde=True) once the environment is upgraded.
for numeric_column in ('latitude', 'longitude', 'stars', 'review_count'):
    ax = sns.distplot(df_business[numeric_column])
    plt.show()

In [None]:
from matplotlib import pyplot as plt

# Bar chart of the number of businesses per state.
plt.figure(figsize=(10, 5))
sns.countplot(data=df_business, x='state')
plt.title('State distribution')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Drawing one bar per distinct city is presumably unreadable (and slow), since
# the city column likely has very many distinct values. Only the most frequent
# cities are plotted; raise top_city_count to see more.
top_city_count = 30
top_cities = df_business['city'].value_counts().index[:top_city_count]

plt.figure(figsize=(10,5))
# `order` both sorts the bars by frequency and restricts them to the listed cities.
sns.countplot(x='city', data=df_business, order=top_cities)
plt.xticks(rotation='vertical')
plt.title('City distribution (top {} cities)'.format(len(top_cities)))
plt.show()

In [None]:
# Class balance of the raw (half-star) ratings.
df_business["stars"].value_counts()

In [None]:
# Column dtypes and non-null counts of the check-in table.
df_checkin.info()

In [None]:
# Preview the first rows of the check-in table.
df_checkin.head()

In [None]:
#from datetime import datetime
#datetime_object = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

In [None]:
def get_checkin_count(x):
    """Return the number of check-ins encoded in a comma-separated timestamp string.

    Parameters
    ----------
    x : str or None
        Check-in timestamps joined by commas, e.g.
        ``'2016-04-26 19:49:16, 2016-08-30 18:36:57'``.

    Returns
    -------
    int
        Number of non-empty timestamps; ``0`` for missing, non-string or blank input.
    """
    # None/NaN (or any other non-string) carries no check-ins; calling .split()
    # on it would raise.
    if not isinstance(x, str):
        return 0
    # Split on the bare comma and ignore blank pieces, so that 'a,b' and 'a, b'
    # count alike and '' yields 0 (''.split(', ') is [''], i.e. length 1).
    return sum(1 for stamp in x.split(',') if stamp.strip())

# Per-business check-in total, derived from the raw 'date' string of each row.
df_checkin['checkin_count'] = df_checkin['date'].map(get_checkin_count)

In [None]:
# Verify the new checkin_count column on the first rows.
df_checkin.head()

In [None]:
# Number of distinct check-in totals (missing values, if any, count as one value).
df_checkin["checkin_count"].nunique(dropna=False)

## 3. Prepare Data

In [None]:
# Left join: every business is kept, with NaN check-in columns where no check-in row exists.
df_merged = df_business.merge(df_checkin, how='left', on='business_id')

In [None]:
# Column dtypes and non-null counts after the join.
df_merged.info()

In [None]:
# Preview the joined table.
df_merged.head()

In [None]:
# Businesses without a matching check-in row get a check-in total of 0.
df_merged['checkin_count'] = df_merged['checkin_count'].fillna(0)

In [None]:
# Preview the joined table after filling the missing check-in totals.
df_merged.head()

In [None]:
# Confirm that checkin_count has no missing values left.
df_merged.info()

In [None]:
# Columns carried into modelling: the id, the target (stars) and the candidate features.
selected_columns = ['business_id','review_count', 'stars', 'checkin_count', 'city', 'state', 'categories']
# .copy() makes this an independent frame. The following cells assign columns
# into it, which on a plain slice of df_merged triggers SettingWithCopyWarning
# and may not write where intended.
df_merged_new = df_merged[selected_columns].copy()
df_merged_new.info()

In [None]:
# Collapse half-star ratings to whole stars so the target becomes a small set of classes.
# NOTE(review): pandas/NumPy round half to even, so 2.5 -> 2 and 4.5 -> 4 while
# 1.5 -> 2 and 3.5 -> 4. Confirm this bucketing is the intended one.
df_merged_new['stars'] = df_merged_new['stars'].round(decimals=0)

In [None]:
# Class balance of the rounded star labels.
df_merged_new['stars'].value_counts()

In [None]:
# Replace any missing review_count with 0 before it is used as a feature.
df_merged_new['review_count'] = df_merged_new['review_count'].fillna(value=0)

## 4. Train Models

#### Normalize numeric features

In [None]:
from sklearn.preprocessing import StandardScaler

cols_to_norm = ['review_count', 'checkin_count']

# NOTE(review): the scalers are fitted on the full table, i.e. before the
# train/test split, so test-set statistics leak into the features. Tree
# ensembles are also insensitive to monotonic rescaling -- confirm this step
# is needed at all.
dict_scaler = {}  # column name -> fitted StandardScaler, kept for reuse on new data
for col_to_norm in cols_to_norm:
    scaler = StandardScaler()
    # scikit-learn expects a 2-D (n_samples, 1) float array for a single feature.
    column_values = df_merged_new[col_to_norm].values.reshape(-1, 1).astype(float)
    # Replace the whole column with a 1-D float result. Writing a 2-D float
    # array through .loc[:, col] into an integer column is a dtype-incompatible
    # in-place set on recent pandas.
    df_merged_new[col_to_norm] = scaler.fit_transform(column_values).ravel()
    dict_scaler[col_to_norm] = scaler

#### Encode string-typed features as integers

In [None]:
from sklearn import preprocessing

cols_str_to_int = ['state', 'city']

# Keep each fitted encoder (as is done for the scalers in dict_scaler), so the
# integer codes can be mapped back to names or applied to new data.
# NOTE(review): label encoding imposes an arbitrary order on city/state.
dict_label_encoder = {}  # column name -> fitted LabelEncoder
for col_str_to_int in cols_str_to_int:
    label_encoder = preprocessing.LabelEncoder()
    df_merged_new[col_str_to_int] = label_encoder.fit_transform(df_merged_new[col_str_to_int])
    dict_label_encoder[col_str_to_int] = label_encoder

In [None]:
# Check the scaled numeric columns and the integer-encoded state/city columns.
df_merged_new.head()

In [None]:
# Same category text with every comma removed (the words stay space-separated).
df_merged_new['categories_str'] = df_merged_new['categories'].map(
    lambda category_text: ''.join(category_text.split(','))
)

In [None]:
# Check the new categories_str column.
df_merged_new.head()

In [None]:
# Join all per-business category lists into one string, then split it back
# into individual names; the set keeps each distinct category once.
all_category_text = ', '.join(df_merged_new['categories'].tolist()).strip()
categories = set(all_category_text.split(', '))
len(categories)

In [None]:
# Show a small, deterministic sample instead of dumping the entire unordered set.
sorted(categories)[:20]

#### TODO: one-hot encoding
Due to limited computational resources, one-hot encoding of the categorical features is deferred and kept on the to-do list.

#### Text feature extraction 
https://scikit-learn.org/stable/modules/feature_extraction.html
https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

In [None]:
#https://stackoverflow.com/questions/45961747/append-tfidf-to-pandas-dataframe
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=100, min_df=2)
x = vectorizer.fit_transform(categories)

df_temp = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())
print(df_temp)

df_merged_new_tfid = pd.concat([df_merged_new, df_temp], axis=1)
print(df_merged_new_tfid)

print(vectorizer.vocabulary_)
print(vectorizer.idf_)

In [None]:
# Correlate numeric columns only: on pandas >= 2.0 DataFrame.corr() raises on
# string columns (business_id, categories, ...) instead of dropping them.
corr = df_merged_new.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(10, 10))
ax.matshow(corr)
# Vertical x labels keep the column names from overlapping.
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical');
plt.yticks(range(len(corr.columns)), corr.columns);

plt.show()

In [None]:
# Correlate numeric columns only: on pandas >= 2.0 DataFrame.corr() raises on
# string columns (business_id, categories, ...) instead of dropping them.
corr = df_merged_new_tfid.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(10, 10))
ax.matshow(corr)
# Vertical x labels keep the many TF-IDF column names from overlapping.
plt.xticks(range(len(corr.columns)), corr.columns, rotation='vertical');
plt.yticks(range(len(corr.columns)), corr.columns);

plt.show()

### Predict star ratings without using category text features

In [None]:
# Baseline design matrix (no category text) and integer star labels.
baseline_columns = ['review_count', 'checkin_count', 'city', 'state']
X = df_merged_new[baseline_columns]
Y = [int(star) for star in df_merged_new['stars']]

X.info()

In [None]:
# (rows, columns) of the baseline feature matrix.
X.shape

In [None]:
#from sklearn.preprocessing import StandardScaler

#scaler = StandardScaler()
#X = scaler.fit_transform(X)

#scaler = StandardScaler()
#X_tfid = scaler.fit_transform(X_tfid)

In [None]:
# 70/30 hold-out split; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# The TF-IDF column names are taken from df_temp instead of
# vectorizer.get_feature_names(), which was removed in scikit-learn 1.2.
tfidf_columns = list(df_temp.columns)
# .copy(): the frame is modified afterwards (fillna), which on a plain slice
# triggers SettingWithCopyWarning.
X_tfid = df_merged_new_tfid[['review_count', 'checkin_count', 'city', 'state'] + tfidf_columns].copy()

X_tfid.info()

In [None]:
# Rows without TF-IDF values get 0. Rebinding instead of inplace=True keeps the
# cell idempotent and avoids mutating a frame derived from df_merged_new_tfid.
X_tfid = X_tfid.fillna(0)

In [None]:
# Same test_size and seed as the baseline split, so both models are evaluated
# on the same rows. The labels get their own *_tfid names because the TF-IDF
# model cell refers to y_train_tfid / y_test_tfid, which were never defined.
X_train_tfid, X_test_tfid, y_train_tfid, y_test_tfid = train_test_split(
    X_tfid, Y, test_size=0.3, random_state=42
)

In [None]:
# Baseline random forest on review/check-in counts and the encoded city/state.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
random_forest.fit(X_train, y_train)

y_predict = random_forest.predict(X_test)

# F1 under several averaging schemes, then per class (average=None).
print(f1_score(y_test, y_predict, average='macro'))
print(f1_score(y_test, y_predict, average='micro'))
print(f1_score(y_test, y_predict, average='weighted'))
print(f1_score(y_test, y_predict, average=None))

# A bare expression in the middle of a cell is not displayed, so print it.
print(pd.DataFrame(confusion_matrix(y_test, y_predict)))

print(classification_report(y_test, y_predict))

In [None]:
# Random forest on the baseline features plus the TF-IDF category columns.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
# y_train / y_test come from the split cells above; the y_train_tfid and
# y_test_tfid names used here before were never defined (NameError).
# Fit and predict both take DataFrames so the feature names stay consistent.
random_forest.fit(X_train_tfid, y_train)

y_predict_tfid = random_forest.predict(X_test_tfid)

# F1 under several averaging schemes, then per class (average=None).
print(f1_score(y_test, y_predict_tfid, average='macro'))
print(f1_score(y_test, y_predict_tfid, average='micro'))
print(f1_score(y_test, y_predict_tfid, average='weighted'))
print(f1_score(y_test, y_predict_tfid, average=None))

# A bare expression in the middle of a cell is not displayed, so print it.
print(pd.DataFrame(confusion_matrix(y_test, y_predict_tfid)))

print(classification_report(y_test, y_predict_tfid))

In [69]:
!pip install xgboost
!pip install hyperopt

[33mYou are using pip version 9.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 9.0.1, however version 19.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Recent xgboost releases require class labels 0..n_classes-1. The rounded star
# labels presumably do not start at 0, so encode them for training and decode
# the predictions afterwards. Older releases accept encoded labels as well.
star_encoder = LabelEncoder()
y_train_encoded = star_encoder.fit_transform(y_train)

clf = xgb.XGBClassifier(max_depth=4, n_estimators=200, learning_rate=0.05)
clf.fit(X_train, y_train_encoded)

y_predict = star_encoder.inverse_transform(clf.predict(X_test))

# Store the score under its own name. Assigning it to `accuracy_score` replaces
# the metric function with a float and breaks every later call or re-run.
xgb_accuracy = accuracy_score(y_test, y_predict)
print(xgb_accuracy)

pd.DataFrame(
    confusion_matrix(y_test, y_predict)
)


### Predict star ratings using category text features

### Hyperparameter Tuning