# Product Reviews - Topic Modeling and Interpretation

## NMF (Non-Negative Matrix Factorization (NMF)) with TF-IDF


## LDA with TF-IDF


## Topic Interpretation



## Step 1: Read in Data

In [1]:
from collections import OrderedDict
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

import re
import string

# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer

# sklearn
from sklearn import datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
product_df = pd.read_csv('product_reviews_cleaned.csv')
product_df.shape
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66224 entries, 0 to 66223
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        66224 non-null  int64 
 1   reviews.rating    66224 non-null  int64 
 2   sentiment         66224 non-null  object
 3   reviews_keywords  66224 non-null  object
 4   reviewsConcat     66224 non-null  object
 5   name              66224 non-null  object
 6   reviews.title     66224 non-null  object
 7   categories        66224 non-null  object
dtypes: int64(2), object(6)
memory usage: 4.0+ MB


In [3]:
docs = product_df['reviewsConcat']

In [4]:
product_df.shape

(66224, 8)

In [5]:
vec = CountVectorizer(binary=True)

In [6]:
doc_term = vec.fit_transform(docs)


In [7]:
# pd.unique(doc_term.toarray().reshape(-1))

In [8]:
vec = TfidfVectorizer()
doc_term_tfidf = vec.fit_transform(docs.values)
doc_term_tfidf.shape
# doc_term.toarray()

(66224, 27769)

In [9]:
vec = CountVectorizer(stop_words='english', max_df=0.8)
doc_term_cv = vec.fit_transform(docs)
doc_term_cv.shape

(66224, 27468)

## Stemming and Lemmatization

Stemming and Lemmatization are Text Normalization techniques. In Natural Language Processing, these are used to prepare text, words, and documents for further processing

In [10]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [11]:
stemmer = SnowballStemmer("english")

In [12]:
def prep(word, stemmer=None):
    
    with open('./stop_words_english.txt', 'r', encoding='utf-8') as f:
        stopwords = [s.strip() for s in f.readlines()]
        
    if word.lower() in stopwords:
        return None
    
    elif stemmer is None:
        return word.lower()
    
    else:
        return stemmer.stem(word)

In [13]:
vec = CountVectorizer(stop_words='english',
                      min_df=1,
                      max_df=0.8,
                      preprocessor=prep)

In [14]:
doc_term = vec.fit_transform(docs)
doc_term.shape

(66224, 27468)

In [15]:
docs.shape

(66224,)

In [16]:
# vec.get_feature_names()

In [17]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [18]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10, preprocessor=stemmer.stem)
dtm_tf = tf_vectorizer.fit_transform(docs)
print(dtm_tf.shape)



(66224, 6023)


In [19]:
type(docs)

pandas.core.series.Series

In [20]:
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs)
print(dtm_tfidf.shape)



(66224, 6023)


## Non-Negative Matrix Factorization (NMF) 

Find two non-negative matrices (W, H) whose product approximates the non- negative matrix X. This factorization can be used for example for dimensionality reduction, source separation or topic extraction.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

nmf = NMF(n_components=10)

nmf.fit(dtm_tfidf)



NMF(n_components=10)

In [22]:
doc_topic_matrix = nmf.transform(dtm_tfidf)
doc_topic_matrix

array([[0.00158521, 0.00051302, 0.0023962 , ..., 0.02151844, 0.        ,
        0.        ],
       [0.        , 0.02549304, 0.0063662 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.0001609 , 0.        , 0.00267613, ..., 0.00017104, 0.        ,
        0.        ],
       [0.00031072, 0.00107464, 0.00946955, ..., 0.01152583, 0.        ,
        0.00051451],
       [0.00276446, 0.        , 0.00741775, ..., 0.00138762, 0.        ,
        0.00042677]])

## Document / Topic Matrix

In [23]:
doc_topic_matrix_df  = pd.DataFrame(doc_topic_matrix).add_prefix('topic_')
doc_topic_matrix_df[['reviewsConcat','reviews_keywords']] = product_df[['reviewsConcat','reviews_keywords']]
doc_topic_matrix_df.head(10)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,reviewsConcat,reviews_keywords
0,0.001585,0.000513,0.002396,0.021649,0.000316,0.007011,0.0,0.021518,0.0,0.0,Just Awesome i love this album. it's very good...,['just awesome i love this album it s very go...
1,0.0,0.025493,0.006366,0.114941,0.0,0.0,0.008536,0.0,0.0,0.0,Good Good flavor. This was collected as part ...,['good good flavor this was collected as par...
2,0.0,0.0,0.0,0.124761,0.0,0.0,0.0,0.0,0.0,0.0,Good Good flavor.,['good good flavor ']
3,0.00071,0.005854,0.006272,0.000549,0.0,0.015413,0.000228,0.0,0.0,0.0,Disappointed I read through the s on here befo...,['disappointed i read through the s on here be...
4,0.000995,0.0,0.021589,0.000604,0.0,0.002492,0.001356,0.0,0.00015,0.0,Irritation My husband bought this gel for us. ...,['irritation my husband bought this gel for us...
5,0.00291,0.0,0.009574,0.001855,0.003528,0.007879,0.003689,0.023526,7.1e-05,0.0,Not worth it My boyfriend and I bought this to...,['not worth it my boyfriend and i bought this ...
6,0.001913,0.0,0.013087,0.002739,0.001626,0.013998,0.005397,0.001655,0.000638,0.0,Disappointing Bought this earlier today and wa...,['disappointing bought this earlier today and ...
7,0.001704,0.0,0.015364,0.0,0.001867,0.011956,0.011053,0.001256,0.003975,0.0,Not happy at all I bought this product for my ...,['not happy at all i bought this product for m...
8,0.000342,0.0,0.011788,0.002326,0.006829,0.010446,0.004096,0.001239,0.001502,0.014562,Very disappointing My husband and I bought thi...,['very disappointing my husband and i bought t...
9,0.001819,0.0,0.005902,0.003862,0.001376,0.015821,0.001282,0.002233,0.000531,0.0,Don't buy Got as a surprise for my husband the...,['don t buy got as a surprise for my husband t...


In [24]:
doc_topic_matrix_df.shape

(66224, 12)

## Word/Topic Matrix

In [25]:
vocab = tfidf_vectorizer.get_feature_names()
word_topic_matrix_df = pd.DataFrame(nmf.components_, columns=vocab).T.add_prefix('topic_')
word_topic_matrix_df.head(15)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
aaron,0.001192,0.0,1.9e-05,0.0,0.0,0.0,0.0,0.001342,0.0,0.0
abilities,0.000502,0.001721,0.001057,0.001461,0.000157,0.001216,0.0,0.0,0.0,0.0
ability,0.0,0.007604,0.005578,0.001633,0.004755,0.009726,0.00013,0.003832,0.0,0.0
able,0.015126,0.012444,0.055856,0.00405,0.007109,0.057337,0.05833,0.015249,0.001215,0.0
abrasive,0.0,0.000492,0.00128,0.0,0.00103,0.0,0.000438,0.002961,0.0,0.0
absolute,0.002162,0.004015,0.009883,0.0,0.0,0.005615,0.000746,0.009566,0.0,0.0
absolutely,0.065942,0.0,0.055482,0.0,0.0,0.034958,0.094205,0.215682,0.021512,0.0
absolutley,0.0,0.001647,0.0,0.001093,0.0,0.000386,0.000553,0.003056,0.0,0.000354
absolutly,0.0,0.000394,0.0,0.0,0.000118,0.002636,0.001291,0.005477,0.0,0.0
absorb,0.0,0.0,0.020426,0.0,0.0,0.0,0.00178,0.0,0.0,0.0


In [26]:
def make_topics(docs, preprocessor, vectorizer, topic_modeler, print_n_words=15):
    """A very simple pipeline."""
    
    # Apply preprocessor, vectorizer, and topic modeler.
    if preprocessor is not None:
        docs = docs.apply(preprocessor)
    
    # Vectorize documents into a document-word matrix.
    doc_word_vectors = vectorizer.fit_transform(docs)
    
    # Fit the topic model.
    doc_topic_vectors = topic_modeler.fit_transform(doc_word_vectors)
    
    # Print the topics.
    vocab = vectorizer.get_feature_names()
    for idx, topic in enumerate(topic_modeler.components_):
        # Select the top 15 words in vocab for this topic.
        top_words = [vocab[i].upper() for i in topic.argsort()[:-print_n_words-1:-1]]
        print(f"Topic {idx}:\n", ", ".join(top_words), "\n")
    
    return doc_topic_vectors

In [27]:
# Fifteen topics.
docs = product_df['reviewsConcat']
preprocessor = None
vectorizer = TfidfVectorizer(stop_words="english")
topic_modeler = NMF(10, random_state=20, max_iter=1000)  # NOTE: I use a random state here to make the results deterministic.

make_topics(docs, preprocessor, vectorizer, topic_modeler);



Topic 0:
 MOVIE, ENJOYED, WATCH, LOVED, AWESOME, GODZILLA, CUTE, REALLY, WATCHING, ACTION, EXCELLENT, BEST, GREAT, WATCHED, RECOMMEND 

Topic 1:
 WIPES, CLOROX, COLLECTED, PROMOTION, USE, CLEAN, EASY, CLEANING, DISINFECTING, CONVENIENT, KITCHEN, HOUSE, BATHROOM, QUICK, PRODUCT 

Topic 2:
 SKIN, PRODUCT, MOISTURIZER, OLAY, FACE, COLLECTED, PROMOTION, USING, FEEL, TOTAL, EFFECTS, LIKE, FEELS, SMOOTH, AGING 

Topic 3:
 GOOD, ORIGINAL, PRICE, PRETTY, SEQUEL, ACTION, LIKE, STORY, QUALITY, REALLY, BETTER, LIKED, NICE, BUY, JUST 

Topic 4:
 GREAT, PRODUCT, PRICE, WORKS, BUY, SMELLS, RECOMMEND, MOVIES, QUALITY, USE, VALUE, CLEANING, JOB, STORY, DEAL 

Topic 5:
 TIDE, PODS, CLOTHES, CLEAN, LAUNDRY, SMELL, DETERGENT, USE, FRESH, EASY, SCENT, POD, JUST, PRODUCT, USED 

Topic 6:
 HAIR, CONDITIONER, SHAMPOO, SOFT, RECEIVED, PRODUCT, FREE, COLLECTED, PROMOTION, INFLUENSTER, TESTING, OILY, PURPOSES, OPINIONS, MASK 

Topic 7:
 LOVE, SMELL, PRODUCT, AWESOME, ABSOLUTELY, HOUSE, AMAZING, KIDS, JUST, MAKE

## Topic Interpretations

### TOPIC_0: Thriller Movie Review
MOVIE, ENJOYED, WATCH, LOVED, AWESOME, GODZILLA, CUTE, REALLY, WATCHING, ACTION, EXCELLENT, BEST, RECOMMEND, GREAT, WATCHED 

### TOPIC_1:Cleaning Product Review
WIPES, CLOROX, COLLECTED, PROMOTION, REVIEW, USE, CLEAN, EASY, CLEANING, CONVENIENT, DISINFECTING, HOUSE, KITCHEN, BATHROOM, QUICK 
    
### TOPIC_2: Skin Product Review (Moisturizer)
SKIN, PRODUCT, MOISTURIZER, OLAY, FACE, REVIEW, COLLECTED, PROMOTION, USING, FEEL, TOTAL, EFFECTS, FEELS, SMOOTH, LIKE 

### TOPIC_3: Movie Review
GOOD, ORIGINAL, PRICE, PRETTY, SEQUEL, ACTION, LIKE, STORY, QUALITY, REALLY, BETTER, LIKED, NICE, BUY, JUST 

### TOPIC_4: Household Cleaning Supplies Review
GREAT, PRODUCT, PRICE, WORKS, BUY, SMELLS, RECOMMEND, MOVIES, QUALITY, USE, VALUE, CLEANING, STORY, JOB, DEAL 
    
### TOPIC_5: Laundry Detergent Review
TIDE, PODS, CLOTHES, CLEAN, LAUNDRY, SMELL, USE, DETERGENT, FRESH, EASY, SCENT, PRODUCT, POD, JUST, USED 

### TOPIC_6: Hair Product Review
 HAIR, CONDITIONER, SHAMPOO, SOFT, REVIEW, RECEIVED, PRODUCT, FREE, COLLECTED, PROMOTION, INFLUENSTER, TESTING, OILY, PURPOSES, OPINIONS 

### TOPIC_7: Food / Personal Care Product Review
LOVE, SMELL, PRODUCT, ABSOLUTELY, AWESOME, HOUSE, JUST, KIDS, AMAZING, MAKES, SMELLS, PRODUCTS, LIP, MOP, COLOR 

### TOPIC_8: Comedy Movie Review
KIDS, FUNNY, ADULTS, LOVED, CUTE, MOVIE, WATCH, ENTERTAINING, HILARIOUS, ADULT, FUN, PETS, HUMOR, REALLY, ENJOY 

### TOPIC_9: Kids Movie Review
 FAMILY, FUN, WATCH, ENJOYED, NIGHT, ENTIRE, MOVIE, ENJOY, FILM, FRIENDS, CUTE, AGES, ENTERTAINING, PETS, GREAT 

## Let's look at some reviews from each of these topics

In [28]:
# for review in product_df['reviewsConcat'].sample(10).values:
for review in doc_topic_matrix_df.sort_values(by='topic_0', ascending=False).head(2)['reviewsConcat'].values:
    print(review)
    print()

Must have This was a must have movie since I have the first one.

Great movie This was a great movie. It's a must own movie for sure



In [29]:
for review in doc_topic_matrix_df.sort_values(by='topic_1', ascending=False).head(2)['reviewsConcat'].values:
    print(review)
    print()

Clorox Wipes Clorox wipes makes it so easy to clean. I use them everywhere. This  was collected as part of a promotion.

Clorox wipes Clorox disinfecting wipes are easy to use and convenient This  was collected as part of a promotion.



In [30]:
for review in doc_topic_matrix_df.sort_values(by='topic_2', ascending=False).head(1)['reviewsConcat'].values:
    print(review)
    print()

A Great Skin Product! I think Olay Total Effects 7 in 1 Anti-Aging Moisturizer GotItForFree is a wonderful product. I tend to have problems with very dry skin that never seems to have a glow to it. After trying this skin product I have noticed an overall difference. My skin looks hydrated with a nice glow, the fine lines and wrinkles are less noticeable, as well as there is a more even skin tone. I'm not sure the last time my skin felt so soft. If you are looking for a skin care product that provides you with soft, radiant, smooth and even toned skin.....give this one a try. This  was collected as part of a promotion.



In [31]:
for review in doc_topic_matrix_df.sort_values(by='topic_5', ascending=False).head(2)['reviewsConcat'].values:
    print(review)
    print()

Tide pods are great to use. Tide pods are great for the clothes and easy to use.

Tide Pods With Febreeze I absolutely LOVE these tide pods! They make my clothes smell great for long time!! They clean my clothes so well! My whole laundry room smelled great while the clothes were running. I do feel like they are a bit pricey but I think it Is worth it! I used this Tide pods a couple of times now and I really liked it. The smell was pleasant and fresh. I didn't use it for clothes with hard stains yet, but I washed our regular laundry with this. It was so easy to use, just pop the pods with laundry and start the washer. It gave our clothes a clean and nice fresh scent. Overall, loved this product and I would definitely recommend anyone to try it. GOTITFORFREE TIDEPODS BZZAGENT This  was collected as part of a promotion.



In [32]:
for review in doc_topic_matrix_df.sort_values(by='topic_6', ascending=False).head(1)['reviewsConcat'].values:
    print(review)
    print()

Smooth, Happy Hair For Days! This conditioner is wonderful. Using it as part of the three-part system, my hair is left soft, smelling amazing, and feeling clean and light for days. As thick is my hair is, I am used to having conditioner way my hair down, or leave my hair too dry if not enough is used. This struck the perfect balance, leaving my hair so, so soft and beautiful. I received these products free/complimentary for testing purposes, but all opinions are my own. This  was collected as part of a promotion.



In [33]:
for review in doc_topic_matrix_df.sort_values(by='topic_8', ascending=False).head(2)['reviewsConcat'].values:
    print(review)
    print()

Funny for kids My kids and I loved this movie, funny and entertaining.

Great kids movie My kids keep watching it over and over. It seems funny enough



In [34]:
for review in doc_topic_matrix_df.sort_values(by='topic_5', ascending=False).head(2)['reviewsConcat'].values:
    print(review)
    print()

Tide pods are great to use. Tide pods are great for the clothes and easy to use.

Tide Pods With Febreeze I absolutely LOVE these tide pods! They make my clothes smell great for long time!! They clean my clothes so well! My whole laundry room smelled great while the clothes were running. I do feel like they are a bit pricey but I think it Is worth it! I used this Tide pods a couple of times now and I really liked it. The smell was pleasant and fresh. I didn't use it for clothes with hard stains yet, but I washed our regular laundry with this. It was so easy to use, just pop the pods with laundry and start the washer. It gave our clothes a clean and nice fresh scent. Overall, loved this product and I would definitely recommend anyone to try it. GOTITFORFREE TIDEPODS BZZAGENT This  was collected as part of a promotion.



## Find Similar Product Reviews for a product

### Caclulate Pairwise distances using Cosine Similarity Metric

Customer can input 'cleaning pods' and check the reviews for products in similar categories

In [35]:
input_prod = ["soft skin cream"]

In [36]:
type(docs) 

pandas.core.series.Series

In [37]:
vt = tfidf_vectorizer.transform(input_prod)

In [38]:
nmf_input_prod = nmf.transform(vt)

In [39]:
nmf_input_prod

array([[0.        , 0.        , 0.07557635, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [40]:
from sklearn.metrics import pairwise_distances
dist = pairwise_distances(nmf_input_prod,doc_topic_matrix,metric='cosine').argsort()
dist

array([[51024, 51921, 50922, ..., 23811, 23792, 33111]], dtype=int64)

In [41]:
dist.shape
new_distance_list = list(dist)

In [43]:
similar_reviews = []
for i in new_distance_list:
    similar_reviews.append(product_df['reviewsConcat'][i])

In [44]:
similar_reviews

[51024    lines....lines go away The unique vitamin copl...
 51921    2 weeks and already am seeing results! Olay To...
 50922    Fountain of youth or close to it!! I have been...
 51414    Saved my Skin Total Effects Daily Moisturizer ...
 49890    7 day Olay Anti-aging 7 in 1 Moisturizer Trial...
                                ...                        
 23808    Visually awesome in 3D Great option for 3D. Lo...
 23810    Epic movie Great quality of movie lives up to ...
 23811    Great deal Anytime getting a 3-D movie for und...
 23792    Great Great movie couldn't wait to watch it an...
 33111    Great movie! Grands love this movie - bought i...
 Name: reviewsConcat, Length: 66224, dtype: object]

In [45]:
product_df.head(2)

Unnamed: 0.1,Unnamed: 0,reviews.rating,sentiment,reviews_keywords,reviewsConcat,name,reviews.title,categories
0,0,5,positive,['just awesome i love this album it s very go...,Just Awesome i love this album. it's very good...,Pink Friday: Roman Reloaded Re-Up (w/dvd),Just Awesome,"Movies, Music & Books,Music,R&b,Movies & TV,Mo..."
1,1,5,positive,['good good flavor this was collected as par...,Good Good flavor. This was collected as part ...,Lundberg Organic Cinnamon Toast Rice Cakes,Good,"Food,Packaged Foods,Snacks,Crackers,Snacks, Co..."


In [46]:
product_df['reviewsConcat'][51893]
# dfObj = pd.DataFrame(product_df, index=new_distance_list)
# dist_new = pairwise_distances(tt,doc_topic_matrix,metric='cosine')
# np.sort(dist_new)

"Smooth, younger skin I used to think Olay was my grandmothers brand until I tried Olay total effects 7 in 1 anti-aging moisturizer. After just a few weeks my skin looks and feels better than ever! People are actually noticing my glowing skin! I'm looking younger and feeling younger! Thanks Olay! This  was collected as part of a promotion."

In [47]:
product_df['name'][51893]

'Olay Total Effects Daily Moisturizer, 7-In-1 Anti-Aging, 0.5oz'

In [48]:
recommend_similar_products = []
for i in new_distance_list:
    recommend_similar_products.append(product_df['name'][i])

In [49]:
recommend_similar_products

[51024    Olay Total Effects Daily Moisturizer, 7-In-1 A...
 51921    Olay Total Effects Daily Moisturizer, 7-In-1 A...
 50922    Olay Total Effects Daily Moisturizer, 7-In-1 A...
 51414    Olay Total Effects Daily Moisturizer, 7-In-1 A...
 49890    Olay Total Effects Daily Moisturizer, 7-In-1 A...
                                ...                        
 23808    Godzilla 3d Includes Digital Copy Ultraviolet ...
 23810    Godzilla 3d Includes Digital Copy Ultraviolet ...
 23811    Godzilla 3d Includes Digital Copy Ultraviolet ...
 23792    Godzilla 3d Includes Digital Copy Ultraviolet ...
 33111              The Jungle Book (blu-Ray/dvd + Digital)
 Name: name, Length: 66224, dtype: object]

In [148]:
recommend_similar_products

['Pink Friday: Roman Reloaded Re-Up (w/dvd)']

In [149]:
# for TF DTM
# lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
# lda_tf.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=20, random_state=0)

In [150]:
lda_tfidf.transform(dtm_tfidf)

array([[0.01039043, 0.01039043, 0.01039043, ..., 0.01039043, 0.01039043,
        0.01039043],
       [0.01761003, 0.01761003, 0.01761003, ..., 0.46065858, 0.01761003,
        0.01761003],
       [0.0207501 , 0.0207501 , 0.0207501 , ..., 0.60574812, 0.0207501 ,
        0.0207501 ],
       ...,
       [0.02099729, 0.02099729, 0.02099729, ..., 0.02099729, 0.02099729,
        0.02099729],
       [0.00816714, 0.39197723, 0.00816713, ..., 0.00816713, 0.00816713,
        0.00816713],
       [0.01419147, 0.01419147, 0.10183504, ..., 0.01419147, 0.30611201,
        0.01419147]])

In [151]:
# pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [152]:
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)

In [154]:
y = product_df['sentiment']

In [155]:
X_train, X_test, y_train, y_test=train_test_split(doc_term,y,test_size=0.2, random_state=20)

In [156]:
# model=RandomForestClassifier()
# model.fit(X_train,y_train)
# print('Accuracy: ', model.score(X_train, y_train))

In [157]:
from sklearn.linear_model import LogisticRegression
lr_model=LogisticRegression()
lr_model.fit(X_train,y_train)
print('Accuracy: ', lr_model.score(X_train, y_train))

Accuracy:  0.9878819909775571


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
