In [1]:
import numpy as np
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score

file = pd.read_csv("reviews.csv")

reviews = file['reviews'].values.tolist()
foods = file['food'].values.tolist()
services = file['service'].values.tolist()
prices = file['price'].values.tolist()
places = file['place'].values.tolist()

tokenize_reviews = []
for review in reviews:
    word_tokens = word_tokenize(review)
    tokenize_reviews.append(word_tokens)
    
stoplist = set(stopwords.words('english'))
filtered_reviews = []
words = []

def RemovePunctAndStopWords(tokens):
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in tokens if nonPunct.match(w)]
    #Remove the stopwords from filtered text
    filtered_words = [word for word in filtered if word.lower() not in stoplist]
    frequent_words=['the','and','of','this','am','etc','also','are','were','was','is']
    filtered_words = [word for word in filtered_words if word.lower() not in frequent_words]
    filtered_words=[word.lower() for word in filtered_words]
    return filtered_words

for review in tokenize_reviews:
    review = RemovePunctAndStopWords(review)
    filtered_reviews.append(review)

def ngram_list(word_list, n):
    all_ngrams = list(ngrams(word_list, n))
    ngram_res = []
    for ngram in all_ngrams:
        ngram_res.append(ngram)
    return ngram_res

trigram_result = []
# for i in range(len(filtered_reviews)):
#     if (len(filtered_reviews[i]) > 2):
#         trigram = ngram_list(filtered_reviews[i], 3)
#         trigram_result.append(dict(Counter(trigram)))

for i in range(len(filtered_reviews)):
    unigram = ngram_list(filtered_reviews[i], 1)
    trigram_result.append(dict(Counter(unigram)))
#     bigram = {}
#     if (len(filtered_reviews[i]) > 1):
#         bigram = ngram_list(filtered_reviews[i], 2)
#     trigram_result.append({**dict(Counter(unigram)), **dict(Counter(bigram))})

trigram_set = set()
trigram_list = []
for trigram in trigram_result:
    for key in trigram:
        trigram_set.add(key)
        if key not in trigram_list:
            trigram_list.append(key)
        
pickle.dump(trigram_list, open('trigram.pkl', 'wb'))

output_feature = pd.DataFrame(0, columns = trigram_list, index = [i for i in range(len(filtered_reviews))])
for i, trigram in enumerate(trigram_result):
    for k, v in iter(trigram.items()):
        output_feature.at[i,k] += v

output_feature.to_csv('output_encode.csv', index=False, header=False)

food_df = pd.DataFrame(foods)
service_df = pd.DataFrame(services)
price_df = pd.DataFrame(prices)
place_df = pd.DataFrame(places)

1921


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def randomForest_fit(x_train, y_train, aspect, scoring = 'accuracy'):
    randomForest = RandomForestClassifier(random_state=123)

    hyperparam = {'min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75], 
                  'n_estimators': [100, 200, 300, 500, 1000]}
    
    random_randomForest = RandomizedSearchCV(randomForest, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 10, scoring = scoring, n_jobs=-1, random_state = 123)
    random_randomForest.fit(x_train, y_train)
    
    print ("Best Accuracy " + aspect, random_randomForest.best_score_)
    print ("Best Param " + aspect, random_randomForest.best_params_)
    return random_randomForest

# food
best_randForest_food = randomForest_fit(output_feature, food_df.values.ravel(), "food")

randForest_food = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_food.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_food.best_params_.get('max_features'),
                                   n_estimators = best_randForest_food.best_params_.get('n_estimators'))
randForest_food.fit(output_feature, food_df.values.ravel())

joblib.dump(randForest_food,'rf_food.pkl')

print(cross_val_score(randForest_food, output_feature, food_df.values.ravel(), cv=10))

  from numpy.core.umath_tests import inner1d


Best Accuracy food 0.678173719376392
Best Param food {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.67032967 0.56043956 0.71428571 0.72527473 0.71111111 0.68888889
 0.64444444 0.72727273 0.68181818 0.73863636]


In [3]:
# price
best_randForest_price = randomForest_fit(output_feature, price_df.values.ravel(), "price")

randForest_price = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_price.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_price.best_params_.get('max_features'),
                                   n_estimators = best_randForest_price.best_params_.get('n_estimators'))
randForest_price.fit(output_feature, price_df.values.ravel())

joblib.dump(randForest_price,'rf_price.pkl')

print(cross_val_score(randForest_price, output_feature, price_df.values.ravel(), cv=5))

Best Accuracy price 0.9621380846325167
Best Param price {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.96132597 0.9558011  0.96089385 0.96089385 0.97191011]


In [4]:
# place
best_randForest_place = randomForest_fit(output_feature, place_df.values.ravel(), "place")

randForest_place = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_place.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_place.best_params_.get('max_features'),
                                   n_estimators = best_randForest_place.best_params_.get('n_estimators'))
randForest_place.fit(output_feature, place_df.values.ravel())

joblib.dump(randForest_place,'rf_place.pkl')

print(cross_val_score(randForest_place, output_feature, place_df.values.ravel(), cv=10))

Best Accuracy place 0.8507795100222717
Best Param place {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.82417582 0.82417582 0.86813187 0.82417582 0.86666667 0.74444444
 0.87640449 0.86516854 0.86363636 0.86363636]


In [5]:
# service
best_randForest_service = randomForest_fit(output_feature, service_df.values.ravel(), "service")

randForest_service = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_service.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_service.best_params_.get('max_features'),
                                   n_estimators = best_randForest_service.best_params_.get('n_estimators'))
randForest_service.fit(output_feature, service_df.values.ravel())

joblib.dump(randForest_service,'rf_service.pkl')

print(cross_val_score(randForest_service, output_feature, service_df.values.ravel(), cv=10))

Best Accuracy service 0.8708240534521158
Best Param service {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.84782609 0.84615385 0.87777778 0.85555556 0.87777778 0.84269663
 0.88764045 0.8988764  0.88764045 0.92134831]


In [6]:
import numpy as np

randForest_food = joblib.load('rf_food.pkl')
randForest_price = joblib.load('rf_price.pkl')
randForest_place = joblib.load('rf_place.pkl')
randForest_service = joblib.load('rf_service.pkl')

def predictData(classifier, datas, aspect):
    tokenize_data = []
    filtered_data = []
    for review in datas:
        data_tokens = word_tokenize(review)
        tokenize_data.append(data_tokens)
    
    for review in tokenize_data:
        review = RemovePunctAndStopWords(review)
        filtered_data.append(review)
    
    data_result = []
    for i in range(len(filtered_data)):
        data = ngram_list(filtered_data[i], 1)
        data_result.append(data)

    feature = []
    for trigram in trigram_set:
        feature.append(data_result[0].count(trigram) if trigram in data_result[0] else 0)
                
    output_data = pd.DataFrame(feature)
    output_data.to_csv('output_data.csv', index=False, header=False)
    
    result = classifier.predict(output_data.T)
    result_proba = classifier.predict_proba(output_data.T)
    print(result_proba)
    if result > 0:
        return('positive')
    elif result < 0:
        return('negative')
    else:
        return('neutral')

text = ["Call me nitpicky, but overcooked yolks pretty much kills the dish."]

print(predictData(randForest_price, text, "price"))
print(predictData(randForest_place, text, "place"))
print(predictData(randForest_service, text, "service"))
print(predictData(randForest_food, text, "food"))

[[0.00275019 0.99724981 0.        ]]
neutral
[[0.0117383  0.96775647 0.02050523]]
neutral
[[5.48027490e-03 9.94349353e-01 1.70371624e-04]]
neutral
[[0.06906133 0.8988769  0.03206176]]
neutral


In [7]:
def predictData(datas, aspect):
    classifier = joblib.load('rf_food.pkl')
    if (aspect == "price"):
        classifier = joblib.load('rf_price.pkl')
    if (aspect == "place"):
        classifier = joblib.load('rf_place.pkl')
    if (aspect == "service"):
        classifier = joblib.load('rf_service.pkl')
    
    tokenize_data = []
    filtered_data = []
    for review in datas:
        data_tokens = word_tokenize(review)
        tokenize_data.append(data_tokens)
    
    for review in tokenize_data:
        review = RemovePunctAndStopWords(review)
        filtered_data.append(review)
    
    data_result = []
    for i in range(len(filtered_data)):
        data = ngram_list(filtered_data[i], 1)
        data_result.append(data)
        
#     print(trigram_set)

    feature = []
    for trigram in trigram_set:
        feature.append(data_result[0].count(trigram) if trigram in data_result[0] else 0)
                
    output_data = pd.DataFrame(feature)
    output_data.to_csv('output_data.csv', index=False, header=False)
    
    result = classifier.predict(output_data.T)
    result_proba = classifier.predict_proba(output_data.T)
    print(result_proba)
    if result > 0:
        return('positive')
    elif result < 0:
        return('negative')
    else:
        return('neutral')
    
    
text = ["The breakfast is great here and staff are very friendly."]

print(predictData(text, "food"))
print(predictData(text, "price"))
print(predictData(text, "place"))
print(predictData(text, "service"))

[[0.0790997  0.89466953 0.02623077]]
neutral
[[0.00238987 0.99761013 0.        ]]
neutral
[[0.04808667 0.92676086 0.02515247]]
neutral
[[5.90811632e-02 9.40748465e-01 1.70371624e-04]]
neutral
