In [None]:
# Mount Google Drive at /content/drive (Colab only) so the dataset stored on
# Drive can be read through an ordinary filesystem path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install scattertext 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scattertext
  Downloading scattertext-0.1.19-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9308 sha256=093f666dc76392099132b5733564f139a68cc3a3bba16ec14a3fa95fcc11a97f
  Stored in directory: /root/.cache/pip/wheels/65/3c/c7/44672c5062c16d05760b1eaddbf611d2f6a4b715c6d6777418
Successfully built flashtext
Installing collected packages: flashtext, scattertext
Successfully installed flashtext-2.7 scattertext-0.1.19


In [None]:
import scattertext as st
import spacy
from pprint import pprint
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import pickle
import nltk 

In [None]:
# Download the NLTK data used by the preprocessing cells: the 'stopwords'
# corpus read by stopwords.words(...) and the 'punkt' tokenizer data
# (presumably what word_tokenize relies on — confirm for the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the IMDB reviews CSV from the mounted Drive; the frame printed in the
# next cell has two columns, `review` and `sentiment`.
df = pd.read_csv('/content/drive/MyDrive/NLP_Applications_1/Project/IMDB.csv')

In [None]:
# Print the raw dataset (pandas truncates the text output to the first and last rows)
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [None]:
# Tally how many reviews carry each sentiment label in the full dataset
positive_labels = df['sentiment'].eq("positive").sum()
negative_labels = df['sentiment'].eq("negative").sum()

In [None]:
# Report the class balance computed in the previous cell
print(f"Number of positive labels: {positive_labels}")
print(f"Number of negative labels: {negative_labels}")

Number of positive labels: 25000
Number of negative labels: 25000


In [None]:
# Strip HTML tags (such as "<br />") out of a review
def clean(text):
    """Return ``text`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross newlines, so each tag on a
    line is removed individually and the text between tags is kept.

    Args:
        text: the raw review string.

    Returns:
        The string with all matched tags replaced by the empty string.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

# `data` is a second name for the same DataFrame object as `df` (no copy is
# made), so the column updates applied through `data` are visible through `df`.
data = df
# Replace each review with its tag-stripped version
data.loc[:, 'review'] = data['review'].apply(clean)


In [None]:
# Removing non-alphanumeric characters from the review text
def is_special(text):
    """Return ``text`` with every non-alphanumeric character replaced by a space.

    Characters for which ``str.isalnum()`` is true are kept unchanged; all
    others (punctuation, whitespace, symbols) become a single space each, so
    the output has the same length as the input.

    Args:
        text: the string to sanitize.

    Returns:
        The sanitized string.
    """
    # One join over a generator instead of growing a string one character at a time
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)

# Replace punctuation and other non-alphanumeric characters in every review with spaces
data.review = data.review.apply(is_special)

In [None]:
# Converting review text to lowercase
def to_lower(text):
    """Return ``text`` lowercased via its ``.lower()`` method."""
    return text.lower()

# Lowercase every review on the shared DataFrame
data.review = data.review.apply(to_lower)

# Removing stopwords from the review text
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, building it only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def rem_stopwords(text):
    """Tokenize ``text`` and drop English stop words.

    The stop-word set is cached after the first call; previously it was
    rebuilt from the NLTK corpus for every review, which is pure overhead
    when the function is applied to tens of thousands of rows.

    Args:
        text: the (already cleaned and lowercased) review string.

    Returns:
        A list of tokens, in their original order, that are not in the
        English stop-word list.
    """
    stop_words = _get_stop_words()
    return [token for token in word_tokenize(text) if token not in stop_words]

# NOTE(review): this rebinds `data` to a new Series of token lists; nothing is
# written back to `df`, whose `review` column therefore still contains stop
# words (as the later display(df) output shows). Confirm whether that is intended.
data = data.review.apply(rem_stopwords)

In [None]:
# Exchange the positions of two columns in a DataFrame
def swap_columns(df, col1, col2):
    """Return ``df`` with the positions of ``col1`` and ``col2`` exchanged.

    The input frame is not modified; a new frame selected with the reordered
    column list is returned.

    Args:
        df: the source DataFrame.
        col1: label of the first column.
        col2: label of the second column.

    Returns:
        A DataFrame with the same columns as ``df``, where ``col1`` and
        ``col2`` have traded places.

    Raises:
        ValueError: if either label is not among ``df``'s columns.
    """
    ordered = list(df.columns)
    first_pos = ordered.index(col1)
    second_pos = ordered.index(col2)
    ordered[first_pos], ordered[second_pos] = ordered[second_pos], ordered[first_pos]
    return df[ordered]

# Reorder the dataset so `sentiment` comes before `review`
df = swap_columns(df, 'review', 'sentiment')

In [None]:
# Displaying the final dataset (columns are now ordered sentiment, review)
display(df)

Unnamed: 0,sentiment,review
0,positive,one of the other reviewers has mentioned that ...
1,positive,a wonderful little production the filming tec...
2,positive,i thought this was a wonderful way to spend ti...
3,negative,basically there s a family where a little boy ...
4,positive,petter mattei s love in the time of money is...
...,...,...
49995,positive,i thought this movie did a down right good job...
49996,negative,bad plot bad dialogue bad acting idiotic di...
49997,negative,i am a catholic taught in parochial elementary...
49998,negative,i m going to have to disagree with the previou...


In [None]:
# Keep only the first 2,000 reviews; every Scattertext cell below works on this subset
text = df.iloc[:2000,:]

In [None]:
# Confirm the size of the subset
len(text)

2000

In [None]:
# Put the subset's columns back in (review, sentiment) order
text = swap_columns(text, 'sentiment', 'review')

In [None]:
# Inspect the reordered subset
display(text)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is...,positive
...,...,...
1995,feeling minnesota directed by steven baigelma...,negative
1996,the cell 2000 rating 8 10the cell like ant...,positive
1997,this movie despite its list of b c and d li...,negative
1998,i loved this movie it was all i could do not ...,positive


In [None]:
# The row count is unchanged by the column swap
len(text)

2000

In [None]:
# Tally the sentiment labels again, this time for the 2,000-review subset
positive_labels = text['sentiment'].eq("positive").sum()
negative_labels = text['sentiment'].eq("negative").sum()

In [None]:
# Report the class balance of the subset
print(f"Number of positive labels: {positive_labels}")
print(f"Number of negative labels: {negative_labels}")

Number of positive labels: 1005
Number of negative labels: 995


In [None]:
import scattertext as st

# Parse each review of the subset with scattertext's whitespace tokenizer and
# store the result in a new `parse` column. Note that this rebinds `df`, which
# until now referred to the full dataset.
df = text.assign(parse=text['review'].apply(st.whitespace_nlp_with_sentences))

# Build a corpus without categories, reduce it to unigrams, then drop rare terms
# (minimum_term_count=6).
corpus = st.CorpusWithoutCategoriesFromParsedDocuments(df, parsed_col='parse').build()
corpus = corpus.get_unigram_corpus()
corpus = corpus.remove_infrequent_words(minimum_term_count=6)

# Returns ['_'] for this category-less corpus
corpus.get_categories()

['_']

In [None]:
# Compute the dispersion statistics for the corpus (one row per term, see the
# preview below) and show the first three rows.
dispersion = st.Dispersion(corpus)

dispersion_df = dispersion.get_df()
dispersion_df.head(3)

Unnamed: 0,Frequency,Range,SD,VC,Juilland's D,Rosengren's S,DP,DP norm,KL-divergence
one,2041,1101,1.317604,1.291136,0.970825,0.596722,0.415829,0.415846,0.880308
of,11327,1891,5.236532,0.924611,0.98711,0.927564,0.185104,0.185112,0.176035
the,26166,1982,11.198621,0.855967,0.990988,0.966736,0.136101,0.136107,0.090568


In [None]:
# Add the columns that are handed to the plot as `plot_df` in the next cell:
# X / Y hold the raw values (term frequency and Rosengren's S), while
# Xpos / Ypos hold their scaled versions (log-scaled for X).
dispersion_df = dispersion_df.assign(
    X=lambda df: df.Frequency,
    Xpos=lambda df: st.Scalers.log_scale(df.X),
    Y=lambda df: df["Rosengren's S"],
    Ypos=lambda df: st.Scalers.scale(df.Y),
)

In [None]:
# Render the dispersion plot (log frequency vs. Rosengren's S) as an HTML string.
# Each document's metadata label is "<sentiment> (<REVIEW TEXT IN UPPER CASE>)".
# NOTE(review): unlike the later explorer cells, `html` is not written to a
# file here and is overwritten by the next explorer cell — confirm whether
# this plot was meant to be saved.
html = st.dataframe_scattertext(
    corpus,
    plot_df=dispersion_df,
    metadata=corpus.get_df()['sentiment'] + ' (' + corpus.get_df()['review'].str.upper() + ')',
    ignore_categories=True,
    x_label='Log Frequency',
    y_label="Rosengren's S",
    y_axis_labels=['Less Dispersion', 'Medium', 'More Dispersion'],
)

  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


In [None]:
# Build a corpus categorized by `sentiment` from the subset, parsing each
# review with spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")
corpus = st.CorpusFromPandas(text, 
                            category_col='sentiment', 
                             text_col='review',
                             nlp=nlp).build()

In [None]:
# Show the first ten terms of the index returned by get_scaled_f_scores_vs_background()
print(list(corpus.get_scaled_f_scores_vs_background().index[:10]))

['fulci', 'cinematography', 'slasher', 'believable', 'likable', 'laughable', 'hadn', 'filmed', 'cheesy', 'watchable']


In [None]:
# Fetch the term-frequency table, attach the scaled F-scores for the
# 'positive' category as a new column, and print the ten highest-scoring terms.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['positive_Score'] = corpus.get_scaled_f_scores('positive')
pprint(list(term_freq_df.sort_values(by='positive_Score', ascending=False).index[:10]))

['excellent',
 'wonderful',
 'loved',
 'perfect',
 'war',
 'the best',
 'performances',
 'a great',
 'great',
 'beautiful']


In [None]:
# Same as above for the 'negative' category: add the score column to
# term_freq_df and print the ten highest-scoring terms.
term_freq_df['negative_Score'] = corpus.get_scaled_f_scores('negative')
pprint(list(term_freq_df.sort_values(by='negative_Score', ascending=False).index[:10]))

['the worst',
 'waste',
 'awful',
 'worst',
 'boring',
 'stupid',
 'terrible',
 'bad',
 't even',
 'worse']


# Visualizing term associations

In [None]:
# Build the interactive term-association explorer (negative vs. positive reviews);
# the full review text is attached as per-document metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category='negative',
    category_name='Negative',
    not_category_name='Positive',
    width_in_pixels=1000,
    metadata=text['review'],
)

# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left to garbage collection.
with open("sentiment-Visualization.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

6954594

# Scattertext 0.1.19

In [None]:
import scattertext as st

# Rebuild the corpus with the whitespace tokenizer, keep unigrams only and
# compact it with AssociationCompactor(2000).
corpus = (
    st.CorpusFromPandas(
        text,
        category_col='sentiment',
        text_col='review',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

# Positive-vs-negative explorer with dense-rank scaled axes; the sentiment
# label is used as per-document metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category='positive',
    category_name='Positive',
    not_category_name='Negative',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['sentiment'],
    transform=st.Scalers.dense_rank,
)

# Close the file deterministically and write UTF-8 explicitly, matching the
# other export cells rather than depending on the platform default encoding.
with open('./demo_compact.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html)

3528206

In [None]:
!pip install --pre html5lib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Ordering Terms by Corpus Characteristicness

In [None]:
import scattertext as st

# Unigram corpus compacted with ClassPercentageCompactor (term_count=2),
# ranking terms by once-per-document frequency.
corpus = (
    st.CorpusFromPandas(
        text,
        category_col='sentiment',
        text_col='review',
        nlp=st.whitespace_nlp_with_sentences,
    )
    .build()
    .get_unigram_corpus()
    .compact(
        st.ClassPercentageCompactor(
            term_count=2,
            term_ranker=st.OncePerDocFrequencyRanker,
        )
    )
)

# Characteristic-term explorer for negative vs. positive reviews, with the
# review text as per-document metadata.
html = st.produce_characteristic_explorer(
    corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    metadata=corpus.get_df()['review'],
)

# Context manager guarantees the handle is flushed and closed.
with open('demo_characteristic_chart.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

7704144

# Visualizing Empath topics and categories

In [None]:
!pip install scattertext empath

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting empath
  Downloading empath-0.89.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57823 sha256=62bb9d561a47a34dd6589f86adeff0a291a876a43742d925a76487b35d7bf500
  Stored in directory: /root/.cache/pip/wheels/5b/58/77/7eed8eef4c6be0cca8920ac319d916811537a37407da220bf1
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89


In [None]:
# Feature builder passed to the corpus below as `feats_from_spacy_doc`
# (judging by its name it emits only Empath categories — confirm in the scattertext docs).
feat_builder = st.FeatsFromOnlyEmpath()

In [None]:
# Build a sentiment-categorized corpus whose features come from `feat_builder`.
# Note that `parsed_col` points at the plain-text `review` column here, not at
# a column of parsed documents.
corpus = st.CorpusFromParsedDocuments(text,
                                            category_col='sentiment',
                                             feats_from_spacy_doc=feat_builder,
                                             parsed_col='review').build()

In [None]:
# Explorer over the non-text (Empath) features of the corpus; the term lists
# per topic come from the feature builder and the full review is shown as metadata.
html = st.produce_scattertext_explorer(
    corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    width_in_pixels=1000,
    metadata=text['review'],
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=feat_builder.get_top_model_term_lists(),
)

# Context manager guarantees the handle is flushed and closed.
with open("Convention-Visualization-Empath.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

7232582

# Developing and using bespoke word representations

In [None]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import spacy
from gensim.models import word2vec
from scattertext import SampleCorpora, word_similarity_explorer_gensim, Word2VecFromParsedCorpus
from scattertext.CorpusFromParsedDocuments import CorpusFromParsedDocuments

# Parse every review with spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Build the parsed frame with assign() so that `text` itself is left untouched:
# the previous `convention_df_1 = text` only aliased the frame, so adding the
# `parsed` column silently modified `text` as well.
convention_df_1 = text.assign(parsed=text.review.apply(nlp))

# Sentiment-categorized corpus built from the parsed documents.
corpus = CorpusFromParsedDocuments(convention_df_1, category_col='sentiment', parsed_col='parsed').build()

# Configure a Word2Vec model without giving it any sentences; it is handed to
# Word2VecFromParsedCorpus(corpus, model).train() in the next cell.
# seed=1 and workers=1 are fixed here (presumably for reproducible training —
# confirm against the gensim documentation).
model = word2vec.Word2Vec(vector_size=300,
                          alpha=0.025,
                          window=5,
                          min_count=5,
                          max_vocab_size=None,
                          sample=0,
                          seed=1,
                          workers=1,
                          min_alpha=0.0001,
                          sg=1,
                          hs=1,
                          negative=0,
                          cbow_mean=0,
                          epochs=1,
                          null_word=0,
                          trim_rule=None,
                          sorted_vocab=1)

In [None]:
# Word-similarity explorer around the target term 'bad'. The Word2Vec model
# configured above is trained on the corpus right here via
# Word2VecFromParsedCorpus(...).train().
html = word_similarity_explorer_gensim(
    corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    target_term='bad',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=4,
    width_in_pixels=1000,
    metadata=text['review'],
    word2vec=Word2VecFromParsedCorpus(corpus, model).train(),
    max_p_val=0.05,
    save_svg_button=True,
)

# Context manager guarantees the handle is flushed and closed.
with open('./demo_gensim_similarity.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  vec_ss = (vec_ss - vec_ss.min()) * 1. / (vec_ss.max() - vec_ss.min())


7306368

# Visualizing any kind of term score

In [None]:
from sklearn.linear_model import Lasso
from scattertext import sparse_explorer
# Explorer whose term scores are the coefficients of a Lasso regression fitted
# for the 'negative' category through the corpus helper.
html = sparse_explorer(
    corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    scores=corpus.get_regression_coefs('negative', Lasso(max_iter=10000)),
    minimum_term_frequency=5,
    pmi_threshold_coefficient=4,
    width_in_pixels=1000,
    metadata=text['review'],
)

# Context manager guarantees the handle is flushed and closed.
with open('./Convention-Visualization-Sparse.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

  return np.log(


7339843

# Custom term positions

In [None]:
def scale(ar):
    """Min-max scale ``ar`` so its smallest value maps to 0 and its largest to 1.

    Args:
        ar: an array-like supporting ``.min()``, ``.max()`` and element-wise
            arithmetic (e.g. a NumPy array).

    Returns:
        ``(ar - min) / (max - min)``, computed element-wise.
    """
    lowest = ar.min()
    span = ar.max() - lowest
    return (ar - lowest) / span

In [None]:
def zero_centered_scale(ar):
    """Scale ``ar`` into [0, 1] so that zero maps to exactly 0.5.

    Positive entries are min-max scaled into [0, 1] and negative entries into
    [-1, 0] (each side independently, using ``scale``); the result is then
    shifted and halved so the output lies in [0, 1].

    The computation runs on a float copy, so the caller's array is left
    unchanged. Previously the input was overwritten in place, which replaced
    the raw values with scaled ones and gave a different result when the cell
    was re-run. A side with no entries is skipped instead of raising on an
    empty min/max reduction.

    Args:
        ar: array of scores (e.g. regression coefficients).

    Returns:
        A new float array of the same shape with values in [0, 1].
    """
    scaled = ar.astype(float)  # astype returns a copy, protecting the input
    positive = scaled > 0
    negative = scaled < 0
    if positive.any():
        scaled[positive] = scale(scaled[positive])
    if negative.any():
        scaled[negative] = -scale(-scaled[negative])
    return (scaled + 1) / 2.

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
import numpy as np

In [None]:
# Take the term frequencies from the corpus that is actually plotted below.
# The older `term_freq_df` was built from an earlier corpus and has since
# gained the positive_Score / negative_Score columns, which would be added
# into the per-term row sums.
frequencies_scaled = scale(np.log(corpus.get_term_freq_df().sum(axis=1).values))

In [None]:
# Fit an L2-penalized logistic regression (C=10, all cores) for the 'negative'
# category through the corpus helper and keep what it returns — presumably one
# coefficient per term; confirm in the scattertext docs.
scores = corpus.get_logreg_coefs('negative',
                                 LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))

In [None]:
# Map the coefficients into [0, 1] with zero at 0.5; used as the y coordinates of the plot below
scores_scaled = zero_centered_scale(scores)

In [None]:
!pip install scattertext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from scattertext import produce_scattertext_explorer

In [None]:
# Explorer with custom term positions: x is the scaled log frequency and y is
# the zero-centered, scaled logistic-regression coefficient.
html = produce_scattertext_explorer(
    corpus,
    category='negative',
    category_name='negative',
    not_category_name='positive',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=4,
    width_in_pixels=1000,
    x_coords=frequencies_scaled,
    y_coords=scores_scaled,
    scores=scores,
    sort_by_dist=False,
    metadata=text['review'],
    x_label='Log frequency',
    y_label='L2-penalized logistic regression coef',
)

# Context manager guarantees the handle is flushed and closed.
with open('demo_custom_coordinates.html', 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

7363341