In [1]:
import googleapiclient.discovery
import googleapiclient.errors
import numpy as np
import pandas as pd
import glob
import nltk 
import os
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from langdetect import detect
import shutil
import random
from wordcloud import WordCloud
import gensim
import seaborn as sns
from textblob import TextBlob
from tabulate import tabulate
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.datasets import load_files
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Archisa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Archisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
train_df = pd.read_csv('train.csv.zip')
test_df = pd.read_csv('test.csv.zip')

In [3]:
train_df = train_df.head(10000)

In [4]:
train_df.shape

(10000, 2)

In [5]:
def remove_punctuation(text):
    # Define the pattern to match punctuation
    punctuation_pattern = r'[^\w\s]'
    # Replace punctuation with an empty string
    text_without_punctuation = re.sub(punctuation_pattern, '', text)
    # Normalize whitespace
    normalized_text = re.sub(r'\s+', ' ', text_without_punctuation)
    return normalized_text

# Text Processing
def preprocess_text(text, method='snowballstemming'):
    # Tokenize and lowercase
    tokens = word_tokenize(text.lower())

    # Define the set of stopwords
    stop_words = set(stopwords.words('english'))
    
    # Remove stopwords and punctuation
    tokens = [remove_punctuation(token) for token in tokens if token.isalnum() and token not in stop_words]
    
    if method == 'snowballstemming':
        stemmer = nltk.stem.SnowballStemmer('english')
        tokens = [stemmer.stem(token) for token in tokens]

    elif method == 'porterstemming':
        stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    elif method == 'lemmatization':
        lemmatizer = nltk.stem.WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

In [6]:
train_df['ssprocessing_text'] = train_df['Review'].apply(preprocess_text, method='snowballstemming')
train_df['psprocessing_text'] = train_df['Review'].apply(preprocess_text, method='porterstemming')
train_df['lemprocessing_text'] = train_df['Review'].apply(preprocess_text, method='lemmatization')

In [7]:
train_df

Unnamed: 0,overall,Review,ssprocessing_text,psprocessing_text,lemprocessing_text
0,5,I love these glitter pens. They sparkle deligh...,love glitter pen sparkl delight page brilliant...,love glitter pen sparkl delight page brilliant...,love glitter pen sparkle delightfully page bri...
1,5,It works well with my machine. I use mostly c...,work well machin use most cone,work well machin use mostli cone,work well machine use mostly cone
2,5,"This is a great assortment of colors, though t...",great assort color though lot pink mix still c...,great assort color though lot pink mix still c...,great assortment color though lot pink mix sti...
3,5,Just what I was looking for.,look,look,looking
4,5,I make 400 birds for the hospital each month.,make 400 bird hospit month,make 400 bird hospit month,make 400 bird hospital month
...,...,...,...,...,...
9995,5,"Lovely yarn, very fast delivery, I am so pleas...",love yarn fast deliveri pleas buy,love yarn fast deliveri pleas buy,lovely yarn fast delivery pleased buy
9996,5,These perfectly match some charms I ordered to...,perfect match charm order make photo pendant like,perfectli match charm order make photo pendant...,perfectly match charm ordered make photo penda...
9997,5,Perfect for my project.,perfect project,perfect project,perfect project
9998,5,Great product!,great product,great product,great product


In [8]:
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense
from sklearn.model_selection import train_test_split

import gensim

In [9]:
# tokenized_df = train_df.copy()

# # Iterate over each column
# for column in train_df.columns:
#     # Check if the column is not 'overall'
#     if column != 'overall':
#         # Tokenize values in the column
#         tokenized_values = train_df[column].apply(lambda x: word_tokenize(x))
#         # Add tokenized values to the new DataFrame
#         tokenized_df[column] = tokenized_values

# tokenized_df

# word2vec_model = gensim.models.Word2Vec(subset_df["tokens"].tolist(), min_count=5, window=9, vector_size=100)

In [10]:
train_df_ss = train_df[['overall', 'ssprocessing_text']].copy()
train_df_ss

Unnamed: 0,overall,ssprocessing_text
0,5,love glitter pen sparkl delight page brilliant...
1,5,work well machin use most cone
2,5,great assort color though lot pink mix still c...
3,5,look
4,5,make 400 bird hospit month
...,...,...
9995,5,love yarn fast deliveri pleas buy
9996,5,perfect match charm order make photo pendant like
9997,5,perfect project
9998,5,great product


In [11]:
train_df_ps = train_df[['overall', 'psprocessing_text']].copy()
train_df_ps

Unnamed: 0,overall,psprocessing_text
0,5,love glitter pen sparkl delight page brilliant...
1,5,work well machin use mostli cone
2,5,great assort color though lot pink mix still c...
3,5,look
4,5,make 400 bird hospit month
...,...,...
9995,5,love yarn fast deliveri pleas buy
9996,5,perfectli match charm order make photo pendant...
9997,5,perfect project
9998,5,great product


In [12]:
train_df_lem = train_df[['overall', 'lemprocessing_text']].copy()
train_df_lem

Unnamed: 0,overall,lemprocessing_text
0,5,love glitter pen sparkle delightfully page bri...
1,5,work well machine use mostly cone
2,5,great assortment color though lot pink mix sti...
3,5,looking
4,5,make 400 bird hospital month
...,...,...
9995,5,lovely yarn fast delivery pleased buy
9996,5,perfectly match charm ordered make photo penda...
9997,5,perfect project
9998,5,great product


### Withoud word2vec

In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Split the data into features and target labels
X = train_df['lemprocessing_text']
y = train_df['overall']

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Padding sequences
max_len = 100  # Define your maximum sequence length
X_padded = pad_sequences(X_sequences, maxlen=max_len)

# Convert target labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Define LSTM model
def LSTM_model(input_length, vocab_size, embedding_size):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=input_length))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define parameters
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token
embedding_size = 100  # Define your embedding size

# Create and compile the LSTM model
lstm_model = LSTM_model(max_len, vocab_size, embedding_size)

# Train the model
lstm_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = lstm_model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.7120000123977661


In [22]:
from gensim.models import Word2Vec


# Split the data into features and target labels
X = train_df['lemprocessing_text']
y = train_df['overall']

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Padding sequences
max_len = 100  # Define your maximum sequence length
X_padded = pad_sequences(X_sequences, maxlen=max_len)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X, vector_size=embedding_size, window=5, min_count=1, workers=4)

# Convert words to Word2Vec embeddings
word_index = tokenizer.word_index
embedding_matrix = np.zeros((vocab_size, embedding_size))
# Convert words to Word2Vec embeddings
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]


# Convert target labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Define LSTM model with Word2Vec embeddings
def LSTM_model_with_Word2Vec(input_length, vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=input_length, weights=[embedding_matrix], trainable=False))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define parameters
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token
embedding_size = 100  # Define your embedding size

# Create and compile the LSTM model with Word2Vec embeddings
lstm_model_with_word2vec = LSTM_model_with_Word2Vec(max_len, vocab_size, embedding_size, embedding_matrix)

# Train the model
lstm_model_with_word2vec.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = lstm_model_with_word2vec.evaluate(X_test, y_test)
print("Test Accuracy with Word2Vec embeddings:", accuracy)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy with Word2Vec embeddings: 0.7799999713897705


In [23]:
lstm_model_with_word2vec.save('lstm_model_lem.keras')

INFO:tensorflow:Assets written to: lstm_model_lem.model\assets


INFO:tensorflow:Assets written to: lstm_model_lem.model\assets


In [25]:
from gensim.models import Word2Vec

# Split the data into features and target labels
X = train_df['psprocessing_text']
y = train_df['overall']

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

# Padding sequences
max_len = 100  # Define your maximum sequence length
X_padded = pad_sequences(X_sequences, maxlen=max_len)

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=X, vector_size=embedding_size, window=5, min_count=1, workers=4)

# Convert words to Word2Vec embeddings
embedding_matrix = np.zeros((vocab_size, embedding_size))
for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_vector = word2vec_model.wv[word]
        embedding_matrix[i] = embedding_vector

# Convert target labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# Define LSTM model with Word2Vec embeddings
def LSTM_model_with_Word2Vec(input_length, vocab_size, embedding_size, embedding_matrix):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, input_length=input_length, weights=[embedding_matrix], trainable=False))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define parameters
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token

# Create and compile the LSTM model with Word2Vec embeddings
lstm_model_with_word2vec = LSTM_model_with_Word2Vec(max_len, vocab_size, embedding_size, embedding_matrix)

# Train the model
lstm_model_with_word2vec.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = lstm_model_with_word2vec.evaluate(X_test, y_test)
print("Test Accuracy with Word2Vec embeddings:", accuracy)





Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy with Word2Vec embeddings: 0.7799999713897705
