In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Read the reviews CSV into `df` and print a structural summary
# (row count, column names, non-null counts, dtypes).
REVIEWS_CSV = 'ReviewsPreprocessingStemming.csv'
df = pd.read_csv(REVIEWS_CSV)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148845 entries, 0 to 148844
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Score   148845 non-null  int64 
 1   Text    148845 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [16]:
# Token-count (bag-of-words) matrix built from the whole `Text` column of `df`.
# NOTE(review): the vectorizer is fitted on every row rather than on a
# training split, and neither `bow_tf` nor `res_tf` is referenced by any other
# cell visible here — confirm whether this cell is still needed.
bow_tf = CountVectorizer()
res_tf = bow_tf.fit_transform(df['Text'])

In [4]:
# TF-IDF weighted matrix built from the whole `Text` column of `df`.
# NOTE(review): the vectorizer is fitted on every row rather than on a
# training split, and neither `bow_tfidf` nor `res_tf_idf` is referenced by any
# other cell visible here — confirm whether this cell is still needed.
bow_tfidf = TfidfVectorizer()
res_tf_idf = bow_tfidf.fit_transform(df['Text'])

In [5]:
# 80/20 train/test split of the review frame.
# random_state is pinned so that a fresh "Restart & Run All" produces the same
# split, and therefore comparable metrics, on every run.
# NOTE(review): consider stratify=df['Score'] if the classes turn out to be
# imbalanced — not verified here.
df_train, df_test = train_test_split(df, train_size=0.8, random_state=42)

In [6]:
def classify_with(classifier, X_train, y_train, X_test, y_test):
    """Fit ``classifier`` on the training data and score it on the test data.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Features and labels passed to ``classifier.fit``.
    X_test, y_test
        Features passed to ``classifier.predict`` and the labels the
        predictions are compared against.

    Returns
    -------
    dict
        ``'accuracy_score'``, ``'recall_score'`` and ``'precision_score'``;
        recall and precision are macro-averaged over the classes.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    scores = {}
    scores['accuracy_score'] = accuracy_score(y_test, predictions)
    scores['recall_score'] = recall_score(y_test, predictions, average='macro')
    scores['precision_score'] = precision_score(y_test, predictions, average='macro')
    return scores

In [7]:
# Term-count features plus encoded labels. Both transformers are fitted on the
# training split only and then applied unchanged to the test split.
tf_vectorizer = CountVectorizer()
encoder = LabelEncoder()

# Training split: learn the vocabulary / label mapping and transform.
Xtrain = tf_vectorizer.fit_transform(df_train['Text'])
ytrain = encoder.fit_transform(df_train['Score'])

# Test split: reuse the already-fitted transformers.
Xtest = tf_vectorizer.transform(df_test['Text'])
ytest = encoder.transform(df_test['Score'])

In [8]:
# Random forest trained and scored on the current Xtrain/Xtest (built by the
# CountVectorizer cell above when the notebook is run top to bottom).
# random_state pins the forest's internal randomness so the reported metrics
# are reproducible; n_jobs=-1 uses all available cores; verbose=1 prints
# progress for fit and predict.
rf = RandomForestClassifier(verbose=1, n_jobs=-1, random_state=42)
results = classify_with(rf, Xtrain, ytrain, Xtest, ytest)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.9s finished


In [9]:
# Display the metric dict returned by classify_with for the `rf` run.
results

{'accuracy_score': 0.6274984043803957,
 'recall_score': 0.6281104685767368,
 'precision_score': 0.6322012383418855}

In [10]:
# TF-IDF features plus encoded labels. Both transformers are fitted on the
# training split only and then applied unchanged to the test split.
# Note: this rebinds Xtrain / Xtest / ytrain / ytest and `encoder`.
tfidf_vectorizer = TfidfVectorizer()
encoder = LabelEncoder()

# Training split: learn the vocabulary, IDF weights and label mapping.
Xtrain = tfidf_vectorizer.fit_transform(df_train['Text'])
ytrain = encoder.fit_transform(df_train['Score'])

# Test split: reuse the already-fitted transformers.
Xtest = tfidf_vectorizer.transform(df_test['Text'])
ytest = encoder.transform(df_test['Score'])

In [11]:
# Random forest trained and scored on the current Xtrain/Xtest (built by the
# TfidfVectorizer cell above when the notebook is run top to bottom).
# random_state pins the forest's internal randomness so the reported metrics
# are reproducible; n_jobs=-1 uses all available cores; verbose=1 prints
# progress for fit and predict.
rf_tfidf = RandomForestClassifier(verbose=1, n_jobs=-1, random_state=42)
results_tfidf = classify_with(rf_tfidf, Xtrain, ytrain, Xtest, ytest)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    7.8s finished


In [12]:
# Display the metric dict returned by classify_with for the `rf_tfidf` run.
results_tfidf

{'accuracy_score': 0.6263898686553125,
 'recall_score': 0.6270027918604322,
 'precision_score': 0.6307229779019037}