In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import string
import time

In [2]:
# 1. Data processing
# 1.1 Read CSV file, select the first 10,000 records and keep 'Text' & 'Score'
file_path = 'Reviews.csv'
df = pd.read_csv(file_path)
df_filtered = df[['Text', 'Score']].head(10000)
df_id = df['Id'].head(10000)
print(df_filtered)

# Convert Score>=4 to Score=1, and convert others to Score=0 (1: positive 0: negative)
df_filtered['Score'] = df_filtered['Score'].apply(lambda x: 1 if x >= 4 else 0)
print(df_filtered)

# Remove punctuation and extra spaces from the 'Text' column
df_filtered['Text'] = df_filtered['Text'].str.replace(f'[{string.punctuation}]', '', regex=True)
df_filtered['Text'] = df_filtered['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
print(df_filtered)

# Split the text in the 'Text' column using the delimiter
df_filtered['Text'] = df_filtered['Text'].str.split(' ')
print(df_filtered)

                                                   Text  Score
0     I have bought several of the Vitality canned d...      5
1     Product arrived labeled as Jumbo Salted Peanut...      1
2     This is a confection that has been around a fe...      4
3     If you are looking for the secret ingredient i...      2
4     Great taffy at a great price.  There was a wid...      5
...                                                 ...    ...
9995  we switched from the advance similac to the or...      1
9996  Like the bad reviews say, the organic formula ...      5
9997  I wanted to solely breastfeed but was unable t...      5
9998  i love the fact that i can get this delieved t...      5
9999  We have a 7 week old... He had gas and constip...      4

[10000 rows x 2 columns]
                                                   Text  Score
0     I have bought several of the Vitality canned d...      1
1     Product arrived labeled as Jumbo Salted Peanut...      0
2     This is a confection th

In [3]:
# 1.2 Remove stop words
vectorizer = CountVectorizer(stop_words='english')
df_filtered['Text'] = df_filtered['Text'].apply(lambda x: ' '.join(vectorizer.build_analyzer()(' '.join(x))))
print(df_filtered)

                                                   Text  Score
0     bought vitality canned dog food products good ...      1
1     product arrived labeled jumbo salted peanutsth...      0
2     confection centuries light pillowy citrus gela...      1
3     looking secret ingredient robitussin believe g...      0
4     great taffy great price wide assortment yummy ...      1
...                                                 ...    ...
9995  switched advance similac organic product think...      0
9996  like bad reviews say organic formula constipat...      1
9997  wanted solely breastfeed unable supplement for...      1
9998  love fact delieved house delievy chargeit hard...      1
9999  week old gas constipation problems weeks tried...      1

[10000 rows x 2 columns]


In [4]:
# Remove frequent words
# Use CountVectorizer to calculate word frequencies
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(df_filtered['Text'])

# Sum the word frequencies
word_freq = word_counts.sum(axis=0)
word_freq = [(word, word_freq[0, idx]) for word, idx in vectorizer.vocabulary_.items()]

# Sort the words by frequency
word_freq_sorted = sorted(word_freq, key=lambda x: x[1], reverse=True)
print(word_freq_sorted[:10])

# Define frequent words (for example, top 10 most frequent words)
frequent_words = [word for word, freq in word_freq_sorted[:10]]  # Top 10 frequent words

# Remove frequent words from the 'Text' column
df_filtered['Text'] = df_filtered['Text'].apply(lambda text: ' '.join([word for word in text.split() if word not in frequent_words]))

[('br', 4375), ('like', 4268), ('coffee', 3543), ('good', 3411), ('taste', 2929), ('just', 2878), ('great', 2797), ('flavor', 2598), ('product', 2450), ('love', 2028)]


In [5]:
# 1.3 Text mining preprocessing: Convert the text into vectors, implement TF-IDF and Word2Vec, and compare the results
# Implement TF-IDF
start_time = time.time()
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_filtered['Text'])
end_time = time.time()
print(f"TF-IDF time: {end_time - start_time} sec")
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())

# Implement Word2Vec
start_time = time.time()
word2vec_model = Word2Vec(df_filtered['Text'], vector_size=100, window=5, min_count=1, workers=4)
def average_word_vectors(words, model, vector_size):
    vector = np.zeros(vector_size)
    count = 0
    for word in words:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count != 0:
        vector /= count
    return vector
df_filtered['Word2Vec_Vector'] = df_filtered['Text'].apply(lambda x: average_word_vectors(x, word2vec_model, 100))
Word2Vec_vectors = np.array(df_filtered['Word2Vec_Vector'].tolist())
end_time = time.time()
print(f"Word2Vec time: {end_time - start_time} sec")
print("Word2Vec Matrix:\n", Word2Vec_vectors)

TF-IDF time: 0.26729893684387207 sec
TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Word2Vec time: 6.768906593322754 sec
Word2Vec Matrix:
 [[ 0.02018949  0.03739723 -0.00541242 ... -0.00196284  0.03204571
   0.04624848]
 [-0.03297562  0.03235491 -0.00132849 ...  0.04655     0.08154547
   0.0123597 ]
 [ 0.02657184  0.04558397 -0.00805743 ...  0.00161942  0.06419811
   0.01243184]
 ...
 [-0.00267181  0.04989159  0.00206788 ...  0.00506341  0.05094728
   0.02337295]
 [ 0.00566839  0.04982306 -0.00472831 ...  0.029323    0.03726533
   0.01427644]
 [ 0.00095775  0.04158614  0.00667466 ...  0.00477131  0.02261127
   0.02707624]]


In [6]:
# 2. Modeling: Use Random Forest for classification
# Split the dataset into training and testing sets
X_train_tfidf, X_test_tfidf, y_train, y_test, idx_train_tfidf, idx_test_tfidf = \
    train_test_split(tfidf_matrix, df_filtered['Score'],df_id, test_size=0.4, random_state=42)
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec, idx_train_word2vec, idx_test_word2vec = \
    train_test_split(Word2Vec_vectors, df_filtered['Score'], df_id, test_size=0.4, random_state=42)

# Initialize the Random Forest classifier
rf_classifier_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_word2vec = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
rf_classifier_tfidf.fit(X_train_tfidf, y_train)
rf_classifier_word2vec.fit(X_train_word2vec, y_train)

# Predictions
y_pred_tfidf = rf_classifier_tfidf.predict(X_test_tfidf)
y_pred_word2vec = rf_classifier_word2vec.predict(X_test_word2vec)

# Accuracy
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"TF-IDF Random Forest Accuracy: {accuracy_tfidf:.4f}")
accuracy_word2vec = accuracy_score(y_test, y_pred_word2vec)
print(f"Word2Vec Random Forest Accuracy: {accuracy_word2vec:.4f}")

TF-IDF Random Forest Accuracy: 0.8250
Word2Vec Random Forest Accuracy: 0.7875


In [None]:
# 3. Evaluate the model: Perform k-fold cross-validation with k=4 and calculate the accuracy
accuracy_scores_tfidf = cross_val_score(rf_classifier_tfidf, tfidf_matrix, df_filtered['Score'], cv=4, scoring='accuracy')
accuracy_scores_word2vec = cross_val_score(rf_classifier_word2vec, Word2Vec_vectors, df_filtered['Score'], cv=4, scoring='accuracy')
print(f"TF-IDF Average Accuracy: {accuracy_scores_tfidf.mean():.4f}")
print(f"Word2Vec Average Accuracy: {accuracy_scores_word2vec.mean():.4f}")

In [None]:
# submission
idx_test_tfidf_reset = pd.Series(range(1, len(idx_test_tfidf) + 1))
idx_test_word2vec_reset = pd.Series(range(1, len(idx_test_word2vec) + 1))

submission_tfidf = pd.DataFrame({
    'ID': range(1, len(idx_test_tfidf) + 1),
    'Score': y_pred_tfidf
})
submission_word2vec = pd.DataFrame({
    'ID': range(1, len(idx_test_word2vec) + 1),
    'Score': y_pred_word2vec
})

submission_tfidf.to_csv('submission_tfidf.csv', index=False)
submission_word2vec.to_csv('submission_word2vec.csv', index=False)