In [1]:
import pandas as pd
import numpy as np

In [2]:
#Reading the dataset.
womendataset = pd.read_csv('WomensclothReviews.csv')
#Displaying the dimension of the original dataset.
print ("Dimension of the data set is: ", womendataset.shape)
#Understanding the dataset.
womendataset.info()
print ('The first five records are: \n')
print (womendataset.head())

Dimension of the data set is:  (23486, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB
The first five records are: 

   Unnamed: 0  Clothing ID  Age                    Title  \
0           0      

In [3]:
#Information related to number of missing observations for each column. 
print ("Number of missing values in column:\n",womendataset.isnull().sum())
#Removing missing values.
newwomendata= womendataset [['Review Text', 'Recommended IND']]
newwomendata.dropna (inplace=True)
print ("Dimension of the new data set is: ", newwomendata.shape)

Number of missing values in column:
 Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64
Dimension of the new data set is:  (22641, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newwomendata.dropna (inplace=True)


In [4]:
sentiments = newwomendata['Recommended IND'].values[:18000] 
reviews =newwomendata['Review Text'].values[:18000]

In [5]:
reviews_trg=reviews[:14000]
reviews_test=reviews[14000:]
sentiments_trg=sentiments[:14000]
sentiments_test=sentiments[14000:]

In [6]:
import transformers

In [7]:
bert_tokenizer=transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
#Importing necessary libraries. 
import tensorflow as tf 
import tqdm
from keras.preprocessing import sequence
#Creating a function.
def func_tokenizer (tokenizer_name, docs):
    features = []
    for doc in tqdm.tqdm(docs, desc="Converting documents to features"): 
        tokens= tokenizer_name.tokenize(doc)
        ids= tokenizer_name.convert_tokens_to_ids(tokens)
        features.append(ids)
    return features
print ("The function is created successfully")


The function is created successfully


In [9]:
bert_trg_features=func_tokenizer(bert_tokenizer,reviews_trg)

Converting documents to features: 100%|████████████████████████████████████████| 14000/14000 [00:10<00:00, 1383.95it/s]


In [10]:
bert_test_features=func_tokenizer(bert_tokenizer,reviews_test)

Converting documents to features: 100%|██████████████████████████████████████████| 4000/4000 [00:02<00:00, 1376.92it/s]


In [11]:
bert_trg=sequence.pad_sequences(bert_trg_features,maxlen=500)
bert_test=sequence.pad_sequences(bert_trg_features,maxlen=500)

In [13]:
gpt2_tokenizer=transformers.GPT2Tokenizer.from_pretrained('gpt2')

In [14]:
gpt2_trg_features=func_tokenizer(gpt2_tokenizer,reviews_trg)
gpt2_test_features=func_tokenizer(gpt2_tokenizer,reviews_test)

Converting documents to features: 100%|████████████████████████████████████████| 14000/14000 [00:04<00:00, 3306.02it/s]
Converting documents to features: 100%|██████████████████████████████████████████| 4000/4000 [00:01<00:00, 3644.23it/s]


In [19]:
#Importing necessary libraries.
import re
import nltk
import numpy as np
#Creating a function for performing stemming.
ps= nltk.porter. PorterStemmer ()
def func_stemming(text, stemmer=ps):
    text = ' '.join([stemmer.stem (word) for word in text.split()])
    return text
#Creating a function for removing special characters.
def func_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]' 
    text = re.sub (pattern, '', text)
    return text


In [20]:
#Creating a list of English stop words from the corpus.
stop_words = nltk.corpus.stopwords.words ('english')
#Removing words like 'no' and 'not' from the list of English stop words. 
stop_words.remove('no')
stop_words.remove('not')

In [22]:
#Creating a function for removing stop words.
def func_stopwords (text, stopwords=None):
    tokens = nltk.word_tokenize (text)
    tokens = [token. strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopwords] 
    filtered_text =' '.join(filtered_tokens)
    return filtered_text


In [23]:
import re

def func_text_process(document):
    # Convert to lower case
    document = document.lower()

    # Remove newlines and tabs
    document = document.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')

    # Remove special characters and/or digits
    document = func_special_characters(document, remove_digits=True)

    # Stemming
    document = func_stemming(document)

    # Remove stop words
    document = func_stopwords(document, stopwords=stop_words)

    # Normalize spaces
    document = re.sub(r'\s+', ' ', document).strip()

    return document

In [24]:
text_process=np.vectorize(func_text_process)

In [25]:
process_train_reviews=text_process(reviews_trg)
process_test_reviews=text_process(reviews_test)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [39]:
cv1=CountVectorizer(binary=False,min_df=1,max_df=0.9,ngram_range=(1,2))

In [40]:
cv_train_features1=cv1.fit_transform(process_train_reviews)

In [41]:
cv_test_features1=cv1.transform(process_test_reviews)

In [30]:
tv = TfidfVectorizer(
    use_idf=True,
    min_df=1,
    max_df=0.9,
    ngram_range=(1, 2),
    sublinear_tf=True
)
tv_train_features = tv.fit_transform(process_train_reviews)
tv_test_features = tv.transform(process_test_reviews)

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
lrmodel=LogisticRegression(penalty='l2',max_iter=500,C=1,solver='lbfgs',random_state=42)

In [42]:
lrmodel.fit(cv_train_features1,sentiments_trg)

In [43]:
lrmodel_predictions=lrmodel.predict(cv_test_features1)

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
lr_acc1=accuracy_score(sentiments_test,lrmodel_predictions)

In [46]:
lr_acc1

0.902

In [47]:
lrmodel2=LogisticRegression(penalty='l2',max_iter=500,C=1,solver='lbfgs',random_state=42)

In [48]:
bert_trg

array([[    0,     0,     0, ...,  7916,  1998,  6625],
       [    0,     0,     0, ...,  5621, 20146,  1012],
       [    0,     0,     0, ...,  1011,  2009,  1039],
       ...,
       [    0,     0,     0, ...,  7114,  2420,  1012],
       [    0,     0,     0, ...,  6774,  1010,  2021],
       [    0,     0,     0, ...,  5686,  2009,   999]], dtype=int32)

In [50]:
lrmodel2.fit(bert_trg,sentiments_trg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
lrmodel_predictions2=lrmodel2.predict(bert_test)

In [52]:
lr_acc2=accuracy_score(sentiments_test,lrmodel_predictions2)

ValueError: Found input variables with inconsistent numbers of samples: [4000, 14000]