In [4]:
import os, re
import pandas as pd
from xgboost import XGBClassifier

# scikit-learn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score

#nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize
# Fetch the NLTK resources that go with the imports above. Each call reports
# "already up-to-date" and does nothing further when the package is present locally.
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
nltk.download('wordnet')  # WordNet data; presumably needed by WordNetLemmatizer
nltk.download('punkt')  # Punkt tokenizer models; presumably needed by word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/claireboyd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/claireboyd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/claireboyd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# read in data
# Move up to the project root so the relative "data/..." paths resolve.
# Guarded so that re-running this cell is idempotent: once the working
# directory already contains data/, we do not climb another level.
# NOTE(review): assumes the notebook's own folder has no data/ directory — confirm.
if not os.path.isdir("data"):
    os.chdir("..")
os.path.abspath(os.curdir)

'/Users/claireboyd/courses/advanced_ml/how_the_bear_got_a_C'

In [6]:
data = pd.read_csv("data/phila/labeled_inspections_with_reviews.csv")

# Binary target: 1 when 'Overall Compliance' is "No", 0 for every other value
# (including missing values, which compare unequal to "No").
data['y'] = (data['Overall Compliance'] == "No").astype('int64')

#feature selection (come back to this if needed)
# .copy() makes data_simple an independent frame, so adding or changing columns
# on it later neither raises SettingWithCopyWarning nor touches `data`.
data_simple = data[["reviews", "ratings", "n_reviews", "avg_rating", "y"]].copy()

In [15]:
# Show the raw review text stored for the row labelled 1.
data_simple.loc[1, 'reviews']

"['Stopped in for happy hour last night, first time here.  I had a blast.  Great atmosphere, the food was outstanding (for pub fare) and service very good especially considering how busy they were.  Reasonably priced as well.  I will be back!', 'I have only two words for this installment: Bon. Appetit. I peregrinate to the grill & tavern approximately once a month, as the establishment is within walking distance from my resting quarters. I often commandeer myself the Tadano steak sandwich, hmm, just thinking about it makes my mouth salivate. The service is customarily hospitable and always treats their guests with the proper care that any Yelper and fellow aficionado are entitled to.']"

In [6]:
# # Pre Processing
# stop_words = stopwords.words('english') # creates a list of English stop words
# wnl = WordNetLemmatizer() # I used lemmatizing instead of stemming

# def preprocess(text_column):

#     new_reviews = []
#     for i, review_list in enumerate(text_column):
#     # for every sentence, we perform the necessary pre-processing

#         processed_review_list=re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", 
#                  ' ', 
#                  str(review_list).lower()).strip()
#         #print(processed_review_list)

#         new_review = word_tokenize(processed_review_list)
#         # print(new_review)
#         new_reviews.append(new_review)

#     return new_reviews

# # actually transforming the datasets
# data_simple["tokenized_reviews"] = preprocess(data_simple["reviews"])

In [7]:
# We can use BERT or GloVe embeddings instead of TF-IDF
# https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# TODO: look back at the homework notebooks for how to do this

In [7]:
## Encode text data as TF-IDF over unigrams and bigrams (ngram_range=(1, 2))
vectorizer2 = TfidfVectorizer(analyzer='word', 
                              ngram_range=(1, 2),
                              stop_words='english',
                              binary=True)

# NOTE(review): the vectorizer is fit on every row here, before the
# train/val/test split further down, so the vocabulary and IDF weights are
# learned from held-out reviews too. Consider fitting on the training rows only.
X2_train = vectorizer2.fit_transform(data_simple["reviews"])
# toarray() returns a plain ndarray; todense() returns np.matrix, which NumPy
# discourages. The result is fully dense (rows x vocabulary), so it is large.
X2_train = X2_train.toarray()
# Reuse data_simple's index so the later column-wise concat lines up row-for-row
# even if data_simple's index is ever something other than 0..n-1.
X2_train_pd = pd.DataFrame(X2_train, index=data_simple.index)

In [9]:
# Remove the raw text columns from the tabular features first, then append the
# TF-IDF columns side by side; the resulting frame is the model input table.
encoded_data = pd.concat(
    [data_simple.drop(columns=['reviews', 'ratings']), X2_train_pd],
    axis=1,
)
encoded_data

Unnamed: 0,n_reviews,avg_rating,y,0,1,2,3,4,5,6,...,367651,367652,367653,367654,367655,367656,367657,367658,367659,367660
0,11,2.454545,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,4.500000,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,3.571429,0,0.032195,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11,3.363636,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,2.200000,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2160,4,4.250000,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2161,6,4.833333,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2162,6,5.000000,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2163,6,2.666667,1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Fixed seed so the split (and every metric computed from it) is reproducible
# under Restart Kernel -> Run All.
RANDOM_STATE = 42

# Hold out 10% for test, then 1/9 of the remaining 90% (= 10% overall) for
# validation, giving an 80/10/10 split. Stratifying on y keeps the share of
# y == 1 rows the same in every split.
train_val, test = train_test_split(
    encoded_data,
    test_size=0.1,
    random_state=RANDOM_STATE,
    stratify=encoded_data['y'],
)
train, val = train_test_split(
    train_val,
    test_size=(1/9),
    random_state=RANDOM_STATE,
    stratify=train_val['y'],
)

# Separate the features from the target for each split.
x_train = train.drop(columns='y')
y_train = train['y']
x_val = val.drop(columns='y')
y_val = val['y']
x_test = test.drop(columns='y')
y_test = test['y']

In [13]:
# Smoke test only: fit on the first few training rows to check that the
# pipeline runs end to end. Metrics from a model trained on this few rows say
# nothing about real performance.
N_SMOKE_ROWS = 10
x_train_small = x_train.iloc[:N_SMOKE_ROWS]
y_train_small = y_train.iloc[:N_SMOKE_ROWS]

model = XGBClassifier(n_jobs=4,  # documented replacement for the legacy `nthread` alias
                      verbosity=1,
                      learning_rate=0.3, #default
                      random_state=42,  # fixed seed for reproducible fits
                      #n_estimators=2,
                      #max_depth=2,
                      #lambda=1, #default
                      #objective='binary:logistic',
                     ).fit(x_train_small, y_train_small)

In [15]:
# Take the first ten validation rows (features and matching labels).
x_val_small, y_val_small = x_val.head(10), y_val.head(10)

#test model fit on val data
# Hard class predictions, plus the probability assigned to class 1.
model_pred = model.predict(x_val_small)
model_probs = model.predict_proba(x_val_small)[:, 1]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# Collect validation metrics in one dict.
# ROC curve and AUC are computed from the class-1 probabilities.
# NOTE(review): roc_auc_score is undefined when y_val_small holds a single
# class; with only a handful of rows that can happen — confirm before relying on it.
model_val_results = {}
model_val_results['fpr'], model_val_results['tpr'], _ = roc_curve(y_val_small, model_probs)
model_val_results['auc'] = roc_auc_score(y_val_small, model_probs)

# F1, precision and recall are computed from the hard predictions.
# zero_division=0 returns the same 0.0 the default produces when there are no
# predicted (or no true) positives, but without the UndefinedMetricWarning.
model_val_results.update({
    metric_name: metric_fn(y_val_small, model_pred, zero_division=0)
    for metric_name, metric_fn in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
})

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# Display the validation metrics dict built in the previous cell.
model_val_results

{'fpr': array([0., 1.]),
 'tpr': array([0., 1.]),
 'auc': 0.5,
 'f1': 0.0,
 'precision': 0.0,
 'recall': 0.0}