In [0]:
import pandas as pd

# Load the labelled training reviews.
reviews_df = pd.read_csv("train_Reviews.csv")

# Text label -> numeric class used as the model target.
dmap = {"good": 1, "bad": -1, "neutral": 0}

# Fail early on labels that dmap does not cover, instead of letting them pass
# through unchanged and break (or silently skew) model fitting later.
unknown_labels = set(reviews_df["classification"].dropna()) - set(dmap)
if unknown_labels:
    raise ValueError(
        "Unmapped classification labels: {}".format(sorted(map(str, unknown_labels)))
    )

# Series.map is a plain per-value dictionary lookup. Series.replace with a dict
# depends on implicit object -> int downcasting, which pandas has deprecated.
reviews_df["review_status"] = reviews_df["classification"].map(dmap)
# select only relevant columns
reviews_df = reviews_df[["review", "review_status"]]

In [0]:
# Keep a reproducible 50% subsample (drawn without replacement, fixed seed).
# NOTE: this rebinds reviews_df to its own sample, so re-running the cell
# halves the data again; re-run the loading cell first.
reviews_df = reviews_df.sample(
    frac=0.5,
    replace=False,
    random_state=42,
)

In [0]:
def _strip_placeholders(raw_review):
    """Return the review as a string with the placeholder phrases removed."""
    stripped = str(raw_review)
    for placeholder in ("No Negative", "No Positive"):
        stripped = stripped.replace(placeholder, "")
    return stripped


# Remove the "No Negative" / "No Positive" filler phrases from every review.
reviews_df["review"] = reviews_df["review"].map(_strip_placeholders)

In [4]:
# Preview the first rows after sampling and placeholder removal.
reviews_df.head()

Unnamed: 0,review,review_status
59961,Air condition didn t worked pretty old room s...,0
51906,Unfortunately the corridor before our room sm...,0
52156,WI FI VERY BAD SIGNAL NEVER COULD GET IT THEY...,0
62102,Stafff and the way they work a lot of things ...,0
55338,The pillows were too thick and hard The showe...,0


In [5]:
import nltk

# NLTK resources fetched for the text-cleaning step below.
nltk_download_status = [
    nltk.download(resource)
    for resource in ('stopwords', 'averaged_perceptron_tagger', 'wordnet')
]
# Show the status of the last download, as the cell did before.
nltk_download_status[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into a WordNet part-of-speech constant.

    The tag is matched on its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``R`` -> adverb. Every other tag (including ``N``)
    maps to noun.
    """
    # presumably Penn Treebank tags as produced by nltk.pos_tag — confirm with callers
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wn_pos
    # 'N' tags and anything unrecognised are treated as nouns.
    return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise a raw review into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case the text, split it on whitespace, strip
    leading/trailing punctuation from each token, drop tokens containing a
    digit, drop English stop words and empty tokens, lemmatise, and finally
    drop tokens of one character or fewer.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    # Built once per call: a set gives O(1) membership tests instead of
    # scanning the stop-word list for every token.
    stop = set(stopwords.words('english'))
    # One lemmatizer per call rather than one per token.
    lemmatizer = WordNetLemmatizer()

    # split() without an argument splits on any run of whitespace, so words
    # separated by tabs or newlines are no longer fused into a single token.
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    kept = [
        token
        for token in stripped
        if token
        and token not in stop
        and not any(c.isdigit() for c in token)
    ]

    # POS tagging is currently disabled (the pos_tag call had been commented
    # out), so lemmatize() runs with its default part of speech.
    lemmas = (lemmatizer.lemmatize(token) for token in kept)

    # remove words with only one letter, then join all
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

In [0]:
# Run every review through clean_text (coerced to str first) and keep the
# result next to the raw text.
reviews_df["review_clean"] = reviews_df["review"].map(
    lambda raw_review: clean_text(str(raw_review))
)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over unigrams and bigrams, capped at 3000 terms; terms
# in fewer than 5 documents or in more than 70% of documents are dropped.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the vocabulary is also learned from rows that end up in the test set.
bow_vectorizer = CountVectorizer(
    max_features=3000,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),
)
bow_result = bow_vectorizer.fit_transform(reviews_df["review_clean"]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
if hasattr(bow_vectorizer, "get_feature_names_out"):
    bow_feature_names = bow_vectorizer.get_feature_names_out()
else:
    bow_feature_names = bow_vectorizer.get_feature_names()

# Reuse the review index so the label column lines up row-for-row in concat.
bow_df = pd.DataFrame(bow_result, columns=bow_feature_names, index=reviews_df.index)
bow_df_final = pd.concat([bow_df, reviews_df["review_status"]], axis=1)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights; terms appearing in fewer than 10 documents are dropped.
tfidf_vectorizer = TfidfVectorizer(min_df=10)
tfidf_result = tfidf_vectorizer.fit_transform(reviews_df["review_clean"]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Reuse the review index so the label column lines up row-for-row in concat.
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf_feature_names, index=reviews_df.index)
tfidf_df_final = pd.concat([tfidf_df, reviews_df["review_status"]], axis=1)

In [0]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Target column, and the columns that must never be used as model inputs.
label = "review_status"
ignore_cols = [label, "review", "review_clean"]
# NOTE(review): "review" here would also exclude a vocabulary column named
# "review" if the vectorizer produced one; the prediction cell builds its
# feature list the same way, so the two stay consistent.
features = [column for column in bow_df_final.columns if column not in ignore_cols]

# Hold out 20% of the bag-of-words rows for evaluation (fixed seed).
bow_inputs = bow_df_final[features]
bow_target = bow_df_final[label]
X_train, X_test, y_train, y_test = train_test_split(
    bow_inputs,
    bow_target,
    test_size=0.20,
    random_state=42,
)

In [11]:
# Random forest on the bag-of-words training split: 100 trees, fixed seed.
rf_settings = {"n_estimators": 100, "random_state": 42}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
# Score of the bag-of-words forest on the held-out split.
rf.score(X=X_test, y=y_test)

0.6721726549312756

In [0]:
# Predicted classes for the held-out rows; used by the metrics cell.
y_pred = rf.predict(X=X_test)

In [0]:
# (rows, columns) of the held-out feature matrix.
# NOTE(review): the saved output for this cell (516 rows) disagrees with the
# 8294 test rows in the classification report, so it appears to come from
# an earlier run — re-run to refresh.
X_test.shape

(516, 1240)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the confusion matrix, the per-class report and the overall accuracy,
# each computed from the same (y_test, y_pred) pair.
for metric_fn in (confusion_matrix, classification_report, accuracy_score):
    print(metric_fn(y_test, y_pred))

[[1205  728  179]
 [ 360 2041  741]
 [ 122  589 2329]]
              precision    recall  f1-score   support

          -1       0.71      0.57      0.63      2112
           0       0.61      0.65      0.63      3142
           1       0.72      0.77      0.74      3040

    accuracy                           0.67      8294
   macro avg       0.68      0.66      0.67      8294
weighted avg       0.67      0.67      0.67      8294

0.6721726549312756


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Same target / excluded columns as the bag-of-words run, now on TF-IDF features.
label = "review_status"
ignore_cols = [label, "review", "review_clean"]
features = [column for column in tfidf_df_final.columns if column not in ignore_cols]

# Hold out 20% of the TF-IDF rows (fixed seed).
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# bag-of-words split created earlier.
tfidf_inputs = tfidf_df_final[features]
tfidf_target = tfidf_df_final[label]
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_inputs,
    tfidf_target,
    test_size=0.20,
    random_state=42,
)

In [16]:
# Random forest on the TF-IDF training split: 100 trees, fixed seed.
rf2_settings = {"n_estimators": 100, "random_state": 42}
rf2 = RandomForestClassifier(**rf2_settings)
rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [18]:
# Score of the TF-IDF forest on its held-out split.
rf2.score(X=X_test, y=y_test)

0.6646973715939233

In [0]:

# Separate labelled file used to check the trained model on reviews it has not seen.
gen_df = pd.read_csv(filepath_or_buffer='predict_Reviews.csv')

In [21]:

# Number of non-null values per column for each classification label.
gen_df.groupby('classification').count()

Unnamed: 0_level_0,review
classification,Unnamed: 1_level_1
bad,482
good,645
neutral,628


In [0]:
# Fail early on labels that dmap does not cover, instead of letting them pass
# through unchanged into the scoring step.
unknown_gen_labels = set(gen_df['classification'].dropna()) - set(dmap)
if unknown_gen_labels:
    raise ValueError(
        "Unmapped classification labels: {}".format(sorted(map(str, unknown_gen_labels)))
    )

# Series.map is a plain per-value dictionary lookup. Series.replace with a dict
# depends on implicit object -> int downcasting, which pandas has deprecated.
gen_df['review_status'] = gen_df['classification'].map(dmap)

In [0]:
# Drop the text label now that review_status holds its numeric form.
# errors='ignore' keeps the cell re-runnable: the previous in-place drop
# raised KeyError on a second run because the column was already gone.
gen_df = gen_df.drop(columns='classification', errors='ignore')

In [0]:
# Apply the same cleaning used for the training reviews (coerced to str first).
gen_df["review_clean"] = gen_df["review"].map(
    lambda raw_review: clean_text(str(raw_review))
)

In [0]:
# Encode with the already-fitted bag-of-words vocabulary (transform only, no
# refit), then densify for the DataFrame built in the next cell.
gen_counts = bow_vectorizer.transform(gen_df["review_clean"])
gen_result = gen_counts.toarray()

In [0]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
if hasattr(bow_vectorizer, "get_feature_names_out"):
    pred_feature_names = bow_vectorizer.get_feature_names_out()
else:
    pred_feature_names = bow_vectorizer.get_feature_names()

# Reuse gen_df's index so the label column lines up row-for-row in concat.
pred_df = pd.DataFrame(gen_result, columns=pred_feature_names, index=gen_df.index)
pred_df_final = pd.concat([pred_df, gen_df["review_status"]], axis=1)

In [0]:
# Build the prediction inputs/target with the same exclusion list as training
# so the feature columns match what rf was fitted on.
gen_label = "review_status"
ignore_cols = [gen_label, "review", "review_clean"]
pred_features = [column for column in pred_df_final.columns if column not in ignore_cols]

# NOTE(review): fillna(0) covers the whole frame, so a missing label would
# also become 0 (the "neutral" code in dmap) — confirm that is intended.
pred_df_final.fillna(0, inplace=True)

pred_x, pred_y = pred_df_final[pred_features], pred_df_final[gen_label]

In [33]:
# Predicted classes for the separate prediction file (bag-of-words forest).
rf.predict(X=pred_x)

array([-1, -1,  0, ..., -1, -1,  0])

In [34]:
# Score of the bag-of-words forest on the separate prediction file.
rf.score(X=pred_x, y=pred_y)

0.6719727117680501