In [0]:
import pandas as pd

# read data
reviews_df = pd.read_csv("Hotel_Reviews.csv")
# append the positive and negative text reviews
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]
# create the label
reviews_df["is_bad_review"] = reviews_df["Reviewer_Score"].apply(lambda x: -1 if x <=4 else 0 if x<=7 else 1)
# select only relevant columns
reviews_df = reviews_df[["review", "is_bad_review"]]

In [0]:
reviews_df = reviews_df.sample(frac = 0.2, replace = False, random_state=42)

In [113]:
reviews_df.shape

(515738, 2)

In [0]:
reviews_df["review"] = reviews_df["review"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))

In [115]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t) for t in text]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

In [0]:
# clean text data
reviews_df["review_clean"] = reviews_df["review"].apply(lambda x: clean_text(x))

In [0]:
from sklearn.feature_extraction.text import CountVectorizer  
bow_vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
bow_result = bow_vectorizer.fit_transform(reviews_df["review_clean"]).toarray()
bow_df = pd.DataFrame(bow_result,columns = bow_vectorizer.get_feature_names())
bow_df.index = reviews_df.index
bow_df_final = pd.concat([bow_df,reviews_df["is_bad_review"]],axis=1)

In [123]:
bow_df_final.shape

(103148, 1501)

In [0]:

# feature selection
label = "is_bad_review"
ignore_cols = [label, "review", "review_clean"]
features = [c for c in bow_df_final.columns if c not in ignore_cols]

# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bow_df_final[features], bow_df_final[label], test_size = 0.20, random_state = 42)

In [125]:
bow_df_final[["is_bad_review", "review", "review_clean"]]

KeyError: ignored

In [0]:
exclusion = [c for c in pred_x.columns if c not in X_train.columns]

In [126]:
# train a random forest classifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [127]:
rf.score(X_test,y_test)

0.8466795928259816

In [0]:
y_pred = rf.predict(X_test)

In [81]:
X_test.shape

(516, 1240)

In [129]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[   27   202   232]
 [   16   726  2267]
 [   10   436 16714]]
              precision    recall  f1-score   support

          -1       0.51      0.06      0.11       461
           0       0.53      0.24      0.33      3009
           1       0.87      0.97      0.92     17160

    accuracy                           0.85     20630
   macro avg       0.64      0.42      0.45     20630
weighted avg       0.81      0.85      0.82     20630

0.8466795928259816


In [0]:

gen_df = pd.read_csv('test-data.csv')

In [132]:

gen_df.groupby('classified').count()

Unnamed: 0_level_0,review
classified,Unnamed: 1_level_1
negative,26
neutral,16
positive,31


In [0]:
def rate(classified):
    if classified == 'negative':
        return -1
    elif classified == 'neutral':
        return 0
    else:
        return 1

In [0]:
gen_df['is_bad_review'] = gen_df['classified'].apply(rate)

In [0]:
gen_df.drop('classified',axis=1,inplace=True)

In [0]:
# clean text data
gen_df["review_clean"] = gen_df["review"].apply(lambda x: clean_text(x))

In [0]:
gen_result = bow_vectorizer.transform(gen_df["review_clean"]).toarray()

In [0]:
bow_vectorizer.get_feature_names()

In [0]:
pred_df = pd.DataFrame(gen_result,columns = bow_vectorizer.get_feature_names())
pred_df.index = gen_df.index
pred_df_final = pd.concat([pred_df,gen_df["is_bad_review"]],axis=1)

In [139]:
len(pred_features)

1240

In [0]:
#feature selection
gen_label = "is_bad_review"
ignore_cols = [gen_label, "review", "review_clean"]
pred_features = [c for c in pred_df_final.columns if c not in ignore_cols]
pred_df_final.fillna(0,inplace=True)
pred_x = pred_df_final[pred_features]
pred_y = pred_df_final[gen_label]

In [141]:
pred_df_final.shape

(73, 1501)

In [142]:
rf.predict(pred_x)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  0,  1,  1,  1,  1,
        1,  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  0,  0])

In [143]:
rf.score(pred_x,pred_y)

0.4931506849315068