In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score

file = pd.read_csv("reviews.csv")

reviews = file['reviews'].values.tolist()
foods = file['food'].values.tolist()
services = file['service'].values.tolist()
prices = file['price'].values.tolist()
places = file['place'].values.tolist()

tokenize_reviews = []
for review in reviews:
    word_tokens = word_tokenize(review)
    tokenize_reviews.append(word_tokens)
    
stoplist = set(stopwords.words('english'))
filtered_reviews = []
words = []

def RemovePunctAndStopWords(tokens):
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in tokens if nonPunct.match(w)]
    #Remove the stopwords from filtered text
    filtered_words = [word for word in filtered if word.lower() not in stoplist]
    frequent_words=['the','and','of','this','am','etc','also','are','were','was','is']
    filtered_words = [word for word in filtered_words if word.lower() not in frequent_words]
    filtered_words=[word.lower() for word in filtered_words]
    return filtered_words

for review in tokenize_reviews:
    review = RemovePunctAndStopWords(review)
    filtered_reviews.append(review)

def ngram_list(word_list, n):
    all_ngrams = list(ngrams(word_list, n))
    ngram_res = []
    for ngram in all_ngrams:
        ngram_res.append(ngram)
    return ngram_res

trigram_result = []
# for i in range(len(filtered_reviews)):
#     if (len(filtered_reviews[i]) > 2):
#         trigram = ngram_list(filtered_reviews[i], 3)
#         trigram_result.append(dict(Counter(trigram)))

for i in range(len(filtered_reviews)):
    unigram = ngram_list(filtered_reviews[i], 1)
    trigram_result.append(dict(Counter(unigram)))
#     bigram = {}
#     if (len(filtered_reviews[i]) > 1):
#         bigram = ngram_list(filtered_reviews[i], 2)
#     trigram_result.append({**dict(Counter(unigram)), **dict(Counter(bigram))})

trigram_set = set()
for trigram in trigram_result:
    for key in trigram:
        trigram_set.add(key)
        
display(len(trigram_set))

output_feature = pd.DataFrame(0, columns = trigram_set, index = [i for i in range(len(filtered_reviews))])
for i, trigram in enumerate(trigram_result):
    for k, v in iter(trigram.items()):
        output_feature.at[i,k] += v

output_feature.to_csv('output_encode.csv', index=False, header=False)

food_df = pd.DataFrame(foods)
service_df = pd.DataFrame(services)
price_df = pd.DataFrame(prices)
place_df = pd.DataFrame(places)

1921

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def randomForest_fit(x_train, y_train, aspect, scoring = 'accuracy'):
    randomForest = RandomForestClassifier(random_state=123)

    hyperparam = {'min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75], 
                  'n_estimators': [100, 200, 300, 500, 1000]}
    
    random_randomForest = RandomizedSearchCV(randomForest, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 10, scoring = scoring, n_jobs=-1, random_state = 123)
    random_randomForest.fit(x_train, y_train)
    
    print ("Best Accuracy " + aspect, random_randomForest.best_score_)
    print ("Best Param " + aspect, random_randomForest.best_params_)
    return random_randomForest

# food
best_randForest_food = randomForest_fit(output_feature, food_df.values.ravel(), "food")

randForest_food = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_food.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_food.best_params_.get('max_features'),
                                   n_estimators = best_randForest_food.best_params_.get('n_estimators'))
randForest_food.fit(output_feature, food_df.values.ravel())
print(cross_val_score(randForest_food, output_feature, food_df.values.ravel(), cv=10))

# price
best_randForest_price = randomForest_fit(output_feature, price_df.values.ravel(), "price")

randForest_price = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_price.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_price.best_params_.get('max_features'),
                                   n_estimators = best_randForest_price.best_params_.get('n_estimators'))
randForest_price.fit(output_feature, price_df.values.ravel())
print(cross_val_score(randForest_price, output_feature, price_df.values.ravel(), cv=10))

# place
best_randForest_place = randomForest_fit(output_feature, place_df.values.ravel(), "place")

randForest_place = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_place.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_place.best_params_.get('max_features'),
                                   n_estimators = best_randForest_place.best_params_.get('n_estimators'))
randForest_place.fit(output_feature, place_df.values.ravel())
print(cross_val_score(randForest_place, output_feature, place_df.values.ravel(), cv=10))

# service
best_randForest_service = randomForest_fit(output_feature, service_df.values.ravel(), "service")

randForest_service = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest_service.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest_service.best_params_.get('max_features'),
                                   n_estimators = best_randForest_service.best_params_.get('n_estimators'))
randForest_service.fit(output_feature, service_df.values.ravel())
print(cross_val_score(randForest_service, output_feature, service_df.values.ravel(), cv=10))

Best Accuracy food 0.6848552338530067
Best Param food {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.67032967 0.56043956 0.7032967  0.72527473 0.71111111 0.68888889
 0.64444444 0.72727273 0.68181818 0.73863636]
Best Accuracy price 0.9621380846325167
Best Param price {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}




[0.95604396 0.96703297 0.94444444 0.95555556 0.95555556 0.95555556
 0.95555556 0.96629213 0.96629213 0.98863636]
Best Accuracy place 0.8496659242761693
Best Param place {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.82417582 0.84615385 0.87912088 0.82417582 0.86666667 0.75555556
 0.87640449 0.86516854 0.86363636 0.86363636]
Best Accuracy service 0.8674832962138085
Best Param service {'n_estimators': 300, 'min_samples_leaf': 3, 'max_features': 0.25}
[0.85869565 0.84615385 0.87777778 0.87777778 0.88888889 0.83146067
 0.88764045 0.8988764  0.87640449 0.92134831]


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

def decTree_fit(x_train, y_train, scoring = 'accuracy'):
    decTree = DecisionTreeClassifier(random_state=123)

    hyperparam = {'min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75]}

    random_decTree = RandomizedSearchCV(decTree, param_distributions = hyperparam, cv = 5,
                                        n_iter = 15, scoring = scoring, n_jobs=-1, random_state = 123)
    
    random_decTree.fit(x_train, y_train)
    
    print ("Best Accuracy", random_decTree.best_score_)
    print ("Best Param", random_decTree.best_params_)
    
    return random_decTree

best_decTree = decTree_fit(output_feature, food_df.values.ravel())

decTree = DecisionTreeClassifier(min_samples_leaf = best_decTree.best_params_.get('min_samples_leaf'),
                                 max_features = best_decTree.best_params_.get('max_features'), random_state=123)
decTree.fit(output_feature, food_df.values.ravel())

Best Accuracy 0.6614699331848553
Best Param {'min_samples_leaf': 5, 'max_features': 0.5}


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV

def bagging_fit(x_train, y_train, scoring = 'accuracy'):
    decTree = DecisionTreeClassifier(random_state=123)
    
    bagging = BaggingClassifier(base_estimator = decTree, random_state=123)
    
    hyperparam = {'base_estimator__min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'n_estimators': [100, 200, 300, 500, 1000]}
    # 'base_estimator__' sebelum 'min_samples_leaf' menandakan hyperparameter yang dicari ada di dalam base estimatornya
    # dalam hal ini berarti decTree
    # (min_samples_leaf ada di dalam decTree)
    
    random_bagging = RandomizedSearchCV(bagging, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 10, scoring = scoring, n_jobs=-1, random_state = 123)
    random_bagging.fit(x_train, y_train)
    
    print ("Best Accuracy", random_bagging.best_score_)
    print ("Best Param", random_bagging.best_params_)
    return random_bagging

best_bagging = bagging_fit(output_feature, food_df.values.ravel())

decTreeBag = DecisionTreeClassifier(min_samples_leaf = best_bagging.best_params_.get('base_estimator__min_samples_leaf'),
                                    random_state=123)
bagging = BaggingClassifier(base_estimator = decTreeBag, 
                            n_estimators = best_bagging.best_params_.get('n_estimators'),
                            random_state=123, n_jobs=-1)
bagging.fit(output_feature, food_df.values.ravel())

Best Accuracy 0.678173719376392
Best Param {'n_estimators': 500, 'base_estimator__min_samples_leaf': 5}


BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=123, verbose=0, warm_start=False)

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

clf = GaussianNB()
clf.fit(output_feature, food_df.values.ravel())
print(cross_val_score(clf, output_feature, food_df.values.ravel(), cv=10))

[0.6043956  0.38461538 0.47252747 0.41758242 0.5        0.48888889
 0.53333333 0.51136364 0.47727273 0.42045455]


In [6]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV

def rbfSVC_fit(x_train, y_train):
    rbfSVC = SVC(kernel = 'rbf')

    hyperparam = {'C': [1000, 333.33, 100, 33.33, 10, 3.33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 
                        0.001, 0.00033, 0.0001]}

    random_rbfSVC = RandomizedSearchCV(rbfSVC, param_distributions = hyperparam, cv = 5,
                                    n_iter = 5, n_jobs=2, random_state = 123)
    
    random_rbfSVC.fit(x_train, y_train)
    
    print ("Best Accuracy", random_rbfSVC.score(x_train, y_train))
    print ("Best Param", random_rbfSVC.best_params_)
    
    return random_rbfSVC 

best_rbfSVC = rbfSVC_fit(output_feature, food_df.values.ravel())

RBF_SVC = SVC(kernel = 'rbf', C=best_rbfSVC.best_params_.get('C'))
RBF_SVC.fit(output_feature, food_df.values.ravel())
print(cross_val_score(RBF_SVC, output_feature, food_df.values.ravel(), cv=10))

Best Accuracy 0.5868596881959911
Best Param {'C': 10}
[0.58241758 0.58241758 0.58241758 0.58241758 0.58888889 0.58888889
 0.58888889 0.59090909 0.59090909 0.59090909]


In [7]:
from sklearn.ensemble import AdaBoostClassifier

def adaBoost_fit(x_train, y_train, scoring = 'accuracy'):
    decTree = DecisionTreeClassifier(random_state=123)
    
    adaBoost = AdaBoostClassifier(base_estimator = decTree, random_state=123)
    
    hyperparam = {'base_estimator__min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'learning_rate':[1., .1, .01, .001],
                  'n_estimators': [100, 200, 300]}
    
    random_adaBoost = RandomizedSearchCV(adaBoost, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 1, scoring = scoring, n_jobs=-1, random_state = 123)
    random_adaBoost.fit(x_train, y_train)
    
    print ("Best Accuracy", random_adaBoost.best_score_)
    print ("Best Param", random_adaBoost.best_params_)
    return random_adaBoost

best_adaBoost = adaBoost_fit(output_feature, food_df.values.ravel())

decTreeAdaBoost = DecisionTreeClassifier(min_samples_leaf = best_adaBoost.best_params_.get('base_estimator__min_samples_leaf'),
                                    random_state=123)
adaBoost = AdaBoostClassifier(base_estimator = decTreeAdaBoost, 
                            n_estimators = best_adaBoost.best_params_.get('n_estimators'),
                            learning_rate = best_adaBoost.best_params_.get('learning_rate'),
                            random_state=123)
adaBoost.fit(output_feature, food_df.values.ravel())

Best Accuracy 0.6046770601336303
Best Param {'n_estimators': 200, 'learning_rate': 1.0, 'base_estimator__min_samples_leaf': 41}


AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=41, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=123)

In [14]:
joblib.dump(decTree,'decision_tree.pkl')
joblib.dump(bagging,'bagging.pkl')
joblib.dump(randForest_food,'random_forest_food.pkl')
joblib.dump(randForest_price,'random_forest_price.pkl')
joblib.dump(randForest_place,'random_forest_place.pkl')
joblib.dump(randForest_service,'random_forest_service.pkl')
joblib.dump(adaBoost,'adaptive_boosting.pkl')
joblib.dump(clf, 'naive_bayes.pkl')
joblib.dump(RBF_SVC, 'svm.pkl')

['svm.pkl']

In [17]:
import numpy as np

classifier = joblib.load('decision_tree.pkl')
clf_load = joblib.load('naive_bayes.pkl')
randForest_food = joblib.load('random_forest_food.pkl')
randForest_price = joblib.load('random_forest_price.pkl')
randForest_place = joblib.load('random_forest_place.pkl')
randForest_service = joblib.load('random_forest_service.pkl')

def predictData(classifier, datas, aspect):
    tokenize_data = []
    filtered_data = []
    for review in datas:
        data_tokens = word_tokenize(review)
        tokenize_data.append(data_tokens)
    
    for review in tokenize_data:
        review = RemovePunctAndStopWords(review)
        filtered_data.append(review)
    
    data_result = []
    for i in range(len(filtered_data)):
        data = ngram_list(filtered_data[i], 1)
        data_result.append(data)

    feature = []
    for trigram in trigram_set:
        feature.append(data_result[0].count(trigram) if trigram in data_result[0] else 0)
                
    output_data = pd.DataFrame(feature)
    output_data.to_csv('output_data.csv', index=False, header=False)
    
    result = classifier.predict(output_data.T)
    result_proba = classifier.predict_proba(output_data.T)
    print(result_proba)
    if result > 0:
        return('positive')
    elif result < 0:
        return('negative')
    else:
        return('neutral')

text = ["Call me nitpicky, but overcooked yolks pretty much kills the dish."]

predictData(randForest_price, text, "price")
predictData(randForest_place, text, "place")
predictData(randForest_service, text, "service")
predictData(randForest_food, text, "food")

[[0.01377953 0.98383176 0.00238871]]
[[0.00160234 0.96747581 0.03092185]]
[[5.27395223e-02 9.47233811e-01 2.66666667e-05]]
[[0.570525   0.38879833 0.04067667]]


'negative'

In [21]:
def predictData(datas, aspect):
    classifier = joblib.load('random_forest_food.pkl')
    if (aspect == "price"):
        classifier = joblib.load('random_forest_price.pkl')
    if (aspect == "place"):
        classifier = joblib.load('random_forest_place.pkl')
    if (aspect == "service"):
        classifier = joblib.load('random_forest_service.pkl')
    
    tokenize_data = []
    filtered_data = []
    for review in datas:
        data_tokens = word_tokenize(review)
        tokenize_data.append(data_tokens)
    
    for review in tokenize_data:
        review = RemovePunctAndStopWords(review)
        filtered_data.append(review)
    
    data_result = []
    for i in range(len(filtered_data)):
        data = ngram_list(filtered_data[i], 1)
        data_result.append(data)
        
    print(data_result)

    feature = []
    for trigram in trigram_set:
        feature.append(data_result[0].count(trigram) if trigram in data_result[0] else 0)
                
    output_data = pd.DataFrame(feature)
    output_data.to_csv('output_data.csv', index=False, header=False)
    
    result = classifier.predict(output_data.T)
    result_proba = classifier.predict_proba(output_data.T)
    print(result_proba)
    if result > 0:
        return('positive')
    elif result < 0:
        return('negative')
    else:
        return('neutral')
    
    
text = ["Call me nitpicky, but overcooked yolks pretty much kills the dish."]

print(predictData(text, "food"))
print(predictData(text, "price"))
print(predictData(text, "place"))
print(predictData(text, "service"))

[[('call',), ('nitpicky',), ('overcooked',), ('yolks',), ('pretty',), ('much',), ('kills',), ('dish',)]]
[[0.570525   0.38879833 0.04067667]]
negative
[[('call',), ('nitpicky',), ('overcooked',), ('yolks',), ('pretty',), ('much',), ('kills',), ('dish',)]]
[[0.01377953 0.98383176 0.00238871]]
neutral
[[('call',), ('nitpicky',), ('overcooked',), ('yolks',), ('pretty',), ('much',), ('kills',), ('dish',)]]
[[0.00160234 0.96747581 0.03092185]]
neutral
[[('call',), ('nitpicky',), ('overcooked',), ('yolks',), ('pretty',), ('much',), ('kills',), ('dish',)]]
[[5.27395223e-02 9.47233811e-01 2.66666667e-05]]
neutral
