In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from imblearn.over_sampling import SMOTE

In [2]:
dataset = pd.read_csv("modelData.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,Review,Ratings
0,1,book bay suite family seamless book process ho...,Positive
1,2,world class service highly recommend anyone be...,Positive
2,3,family adult kid thoroughly enjoyable staycati...,Positive
3,4,look unforgettable staycation experience pleas...,Positive
4,5,great staycation celebrate anniversary alice p...,Positive


In [3]:
dataset.dtypes

Unnamed: 0     int64
Review        object
Ratings       object
dtype: object

In [4]:
dataset.isna().sum()

Unnamed: 0    0
Review        0
Ratings       0
dtype: int64

In [5]:
dataset.drop(['Unnamed: 0'], axis=1, inplace=True)
dataset.head()

Unnamed: 0,Review,Ratings
0,book bay suite family seamless book process ho...,Positive
1,world class service highly recommend anyone be...,Positive
2,family adult kid thoroughly enjoyable staycati...,Positive
3,look unforgettable staycation experience pleas...,Positive
4,great staycation celebrate anniversary alice p...,Positive


In [6]:
dataset['Review'].values[0]

'book bay suite family seamless book process hotline worry check process fast not many sand lounge recommend pay little club lounge benefit general queue long room quality excellent view garden bay tip breakfast spago food coffee quality superior restaurant option butler service great fruit cooky milk cake deliver daily book pool slot advance win disappointed amazing experience swim sky beautiful city skyline backdrop five star high service quality among every staff room quality hope tv upgraded soon look pretty outdated time smart tv netflix youtube current trend perfect stay'

In [7]:
rate = {"Positive":2,"Negative":0,"Neutral":1}

In [8]:
dataset['Ratings'] = dataset['Ratings'].map(rate)

In [9]:
dataset.isna().sum()

Review     0
Ratings    0
dtype: int64

In [10]:
X = dataset["Review"].values
y = dataset.drop("Review", axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [11]:
X.shape

(3000,)

## vectorize the data

In [12]:
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [13]:
X_train.shape

(2100, 6512)

In [14]:
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
X_train = X_train.toarray()
X_test = X_test.toarray()

In [15]:
X_train.shape

(2100, 6512)

## use machine learning algorithms for training

# 1. Random Forest Algorithm

In [16]:
from sklearn.ensemble import RandomForestClassifier
rf_cl = RandomForestClassifier()

In [17]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
import numpy as np
## Randomized Search CV

# Number of trees in random forest
n_estimators = [int(i) for i in np.linspace(start=100, stop=1200, num=12)]

# Number of features to consider at evry split
max_features = ['auto']

# max number of levels in tree
max_depth = [int(i) for i in np.linspace(5, 30, num=6)]

# max_depth.append(None)
# min number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

#min number of sample required for each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [23]:
# Create the random_grid 

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [24]:
rf_random = RandomizedSearchCV(estimator=rf_cl, param_distributions=random_grid, scoring='neg_mean_squared_error', n_iter=10, cv=5,verbose=2, random_state=42, n_jobs=1)

In [None]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=1100, min_samples_split=15, min_samples_leaf=5, max_features=auto, max_depth=25 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1100, min_samples_split=15, min_samples_leaf=5, max_features=auto, max_depth=25, total=19.1min
[CV] n_estimators=1100, min_samples_split=15, min_samples_leaf=5, max_features=auto, max_depth=25 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 19.1min remaining:    0.0s


In [344]:

rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 25}

In [18]:
best_random_forest_model = RandomForestClassifier(n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=25)

In [19]:
best_random_forest_model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(max_depth=25, max_features='sqrt', n_estimators=1000)

In [20]:
pred = best_random_forest_model.predict(X_test)

In [21]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [22]:
accuracy_score(y_test, pred)

0.8122222222222222

In [23]:
from sklearn.metrics import classification_report, confusion_matrix

In [24]:
print(confusion_matrix(y_test, pred))

[[252  31  13]
 [ 36 209  46]
 [ 16  27 270]]


In [25]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       296
           1       0.78      0.72      0.75       291
           2       0.82      0.86      0.84       313

    accuracy                           0.81       900
   macro avg       0.81      0.81      0.81       900
weighted avg       0.81      0.81      0.81       900



In [36]:
review = ["service is not good"]

In [37]:
r = vect.transform(review)

In [38]:
r.shape

(1, 6512)

In [39]:
r

<1x6512 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [40]:
best_random_forest_model.predict(r)

array([0], dtype=int64)

In [55]:
import pickle

In [56]:
file = open('vectorize.pkl','wb')

# dump information to that file
pickle.dump(vect, file)
file.close()

In [57]:
file = open('tfidfTransformer.pkl','wb')

# dump information to that file
pickle.dump(tfidf, file)
file.close()

In [58]:
# open a file, where you want to share the data
file = open('randomForest_clssify_model.pkl','wb')

# dump information to that file
pickle.dump(best_random_forest_model, file)
file.close()