<a href="https://colab.research.google.com/github/encoras/Artificial-Intelligence-Group/blob/master/Sentiment_analysis_IMDB_movie_review_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from mpl_toolkits import mplot3d
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.cluster import hierarchy
import seaborn as sns
from sklearn import svm
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.datasets import load_iris, load_wine, fetch_20newsgroups, fetch_openml
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer
)
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
    Lasso,
    Ridge,
    ElasticNet
)
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    cross_validate
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    OrdinalEncoder,
    LabelEncoder,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.multiclass import OneVsRestClassifier

In [None]:
!pip install -q datasets hdbscan keybert

In [None]:
# extract keywords from texts
# used to assign meaningful names to clusters
from keybert import KeyBERT

## **Loading IMDB data set**

### About Dataset
The IMDB dataset contains 50K movie reviews for natural language processing and text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.
For more dataset information, please go through the following link,
http://ai.stanford.edu/~amaas/data/sentiment/

examples of data analysis:

https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
from datasets import load_dataset

# Load both IMDB splits and convert them to DataFrames.
# The label column keeps its original name, 'label': the rest of the notebook
# reads train_df['label'] / test_df['label']. (The previous
# `rename_column('label', 'labels')` calls returned a new dataset that was
# discarded, so they never had any effect and have been removed.)
dataset_train = load_dataset('imdb', split='train')
train_df = pd.DataFrame(dataset_train)

dataset_test = load_dataset('imdb', split='test')
test_df = pd.DataFrame(dataset_test)

In [None]:
# A bare expression is only displayed when it is the last line of a cell,
# so print the dataset summary explicitly and preview the DataFrame.
print(dataset_train)
train_df.head()

In [None]:
# First training review, before any cleaning
train_df.loc[0, 'text']

# **Remove HTML code tags**

In [None]:
# Replace HTML tags (e.g. "<br />") with a space.
# Applied to BOTH splits so that the test text is preprocessed exactly like
# the text the vectorizers are fitted on.
for reviews_df in (train_df, test_df):
    reviews_df['text'] = reviews_df['text'].str.replace(r'<[^<]+?>', ' ', regex=True)

In [None]:
# First training review after HTML tag removal
train_df.loc[0, 'text']

**Remove tokens that contain digits (e.g. Covid19, 20th, 1965s)**

In [None]:
# Drop every token that contains at least one digit ("Covid19", "20th", "1965s"),
# then collapse the runs of whitespace left behind and trim the ends.
# Applied to BOTH splits so train and test share the same preprocessing.
for reviews_df in (train_df, test_df):
    reviews_df['text'] = (
        reviews_df['text']
        .str.replace(r'\b\w*\d\w*\b', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

In [None]:
# First training review after digit-token removal
train_df.loc[0, 'text']

In [None]:
# Remove a fixed list of junk words (whole-word, case-insensitive) from both splits.
junk_words = ['café', 'hindus', 'shirdi', 'sai', 'baba', 'cain', 'abel', 'sodom', 'gomorrah', 'la', 'mj', 'california']

import re
# One alternation pattern -> a single pass per split instead of one pass per word.
# re.escape keeps the pattern correct if a word with regex metacharacters is added.
junk_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in junk_words) + r')\b'
train_df['text'] = train_df['text'].str.replace(junk_pattern, ' ', regex=True, case=False)
test_df['text'] = test_df['text'].str.replace(junk_pattern, ' ', regex=True, case=False)

In [None]:
# Reference: https://mpolinowski.github.io/docs/Development/Python/2023-05-20-python-sklearn-cheat-sheet/2023-05-20/
train_df.info()
# Number of reviews that consist solely of whitespace (printed explicitly:
# a bare expression in the middle of a cell is not displayed).
print("Whitespace-only reviews:", train_df['text'].str.isspace().sum())
# Is the dataset balanced?
train_df['label'].value_counts()

# CountVectorizer is used to create a Bag of Words (BoW)

In [None]:
# Plot the 30 most frequent words in negative reviews (label == 0)
train_neg_df = train_df.loc[train_df['label'] == 0]

count_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    max_df=0.8,
    min_df=100,
)
bag_of_words = count_vectorizer.fit_transform(train_neg_df['text'])
sum_words = bag_of_words.sum(axis=0)  # 1 x n_terms matrix of total counts

# Pair every vocabulary term with its total count, most frequent first.
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in count_vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

x, y = zip(*words_freq[:30])

plt.figure(figsize=(12, 5))
plt.bar(x, y)
plt.xticks(rotation=90)
plt.title('Top30 Words used in Negative Reviews')
# To save the figure: create an 'assets' directory, then
# plt.savefig('assets/Negative_word.webp', bbox_inches='tight')

In [None]:
# Plot the 30 most frequent words in positive reviews (label == 1)
train_pos_df = train_df[train_df['label'] == 1]

# Same vectorizer settings as the negative-review chart (max_df / min_df),
# so that the two bar charts are directly comparable.
count_vectorizer = CountVectorizer(analyzer='word', stop_words='english', max_df=0.8, min_df=100)
bag_of_words = count_vectorizer.fit_transform(train_pos_df['text'])
sum_words = bag_of_words.sum(axis=0)  # 1 x n_terms matrix of total counts

# Pair every vocabulary term with its total count.
words_freq = [
    (word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()
]

words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

x, y = zip(*words_freq[:30])

plt.figure(figsize=(12, 5))
plt.bar(x, y)
plt.xticks(rotation=90)
plt.title('Top30 Words used in Positive Reviews')

# To save the figure: plt.savefig('assets/Positive_words.webp', bbox_inches='tight')

In [None]:
# Raw review texts and sentiment labels for each split
X_rev_train, y_rev_train = train_df['text'], train_df['label']
X_rev_test, y_rev_test = test_df['text'], test_df['label']

In [None]:
# Bag-of-words vectorizer fitted on the training reviews only.
# (Add ngram_range=(1, 2) to the arguments to include bigrams as well.)
vect = CountVectorizer(
    binary=False,
    max_df=0.80,
    min_df=10,
    max_features=10000,
    stop_words="english",
)
vect.fit(X_rev_train)

X_train = vect.transform(X_rev_train)
X_test = vect.transform(X_rev_test)
print(f"X_train:\n{X_train!r}")


In [None]:
# Sparse bag-of-words row produced by `vect` for the first training review
X_train[0]


**Vocabulary:**


In [None]:
#print("Vocabulary: ", vect.vocabulary_)

In [None]:
print("Original review:")
print(X_rev_train[0])
print("\nFirst 20 words that appear in this review (in vocabulary order):")
# Plain loop instead of a list comprehension used only for its side effects.
first_review_vec = X_train[0]
vocab_terms = vect.get_feature_names_out()
for col, count in zip(first_review_vec.indices[:20], first_review_vec.data[:20]):
    print(f"  {vocab_terms[col]:15} → {count}")

In [None]:
# Peek at the vocabulary learned by the bag-of-words vectorizer.
feature_names = vect.get_feature_names_out()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 5010 to 5030:\n{}".format(feature_names[5010:5030]))
# The slice step is 1000, so the label must say "1000th" (it used to say "2000th").
print("Every 1000th feature:\n{}".format(feature_names[::1000]))


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Silence lbfgs convergence warnings for the rest of the session.
# NOTE(review): this hides the warning rather than fixing convergence — consider
# raising max_iter instead.
simplefilter(action="ignore", category=ConvergenceWarning)

# 3-fold cross-validated accuracy of a default logistic regression on the BoW features
scores = cross_val_score(LogisticRegression(), X_train, y_rev_train, cv=3)
print(f"Mean cross-validation accuracy: {scores.mean():.2f}")

Using CountVectorizer to extract features from text

https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/

In [None]:
# Logistic regression on the bag-of-words features, evaluated on the test split.
# (Despite its name, `nb_vec` is a LogisticRegression, not a naive Bayes model.)
nb_vec = LogisticRegression().fit(X_train, y_rev_train)
preds = nb_vec.predict(X_test)
print(classification_report(y_rev_test, preds))

In [None]:
# Show logistic-regression weights for the bag-of-words features
# https://www.nlplanet.org/course-practical-nlp/01-intro-to-nlp/04-n-grams
# Vocabulary terms ordered by column index, so they line up with nb_vec.coef_[0].
keys_values_sorted = sorted(vect.vocabulary_.items(), key=lambda t: t[1])
keys_sorted = list(zip(*keys_values_sorted))[0]

# term -> weight lookup (the bare `from_unigram_to_weight` expression that
# followed was never displayed and has been dropped)
from_unigram_to_weight = dict(zip(keys_sorted, nb_vec.coef_[0]))

# 20 terms with the largest positive weights
print('Positive')
sorted(zip(nb_vec.coef_[0].round(3), keys_sorted), reverse=True)[:20]


In [None]:
# 20 terms with the most negative weights; rounded to 3 decimals to match the
# "Positive" listing (it previously used 2).
print('Negative')
sorted(zip(nb_vec.coef_[0].round(3), keys_sorted))[:20]

# **TfidfVectorizer**

The TfidfVectorizer is a feature extraction technique in the scikit-learn library for converting a collection of raw text documents into a matrix of TF-IDF (Term Frequency-Inverse Document Frequency) features. This is a common step in Natural Language Processing (NLP) and text mining tasks to transform text data into numerical data that machine learning algorithms can work with.

How TfidfVectorizer Works
Term Frequency (TF): This measures how frequently a term (word) appears in a document. The assumption is that the more frequently a term appears in a document, the more important it is. However, this alone can be misleading, as common words (like "the", "is", "and") will appear frequently in many documents.
Inverse Document Frequency (IDF): This measures how important a term is by considering how often it appears across all documents in the dataset. The more documents a term appears in, the less important it is. The IDF value of a term decreases as the number of documents containing the term increases.
TF-IDF: The product of TF and IDF. This score gives us an indication of how important a term is within a particular document while reducing the weight of commonly occurring terms that are less informative.
Formula of TF-IDF
The TF-IDF score for a term t in a document d is calculated as:

$$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$$

Where:

- $\text{tf}(t, d)$ is the term frequency of term $t$ in document $d$.
- $\text{idf}(t)$ is the inverse document frequency of term $t$, calculated as:
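$$\text{idf}(t) = \log\left(\frac{N}{1 + \text{df}(t)}\right)$$

(Note: scikit-learn's `TfidfVectorizer` uses a smoothed variant by default, $\text{idf}(t) = \ln\frac{1 + N}{1 + \text{df}(t)} + 1$.)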

Where:

N is the total number of documents.
- $\text{df}(t)$ is the number of documents containing the term $t$.
 https://www.geeksforgeeks.org/how-to-store-a-tfidfvectorizer-for-future-use-in-scikit-learn/

 https://www.linkedin.com/pulse/count-vectorizers-vs-tfidf-natural-language-processing-sheel-saket/


In [None]:
# TF-IDF features: fit on the training reviews, reuse the same vocabulary for test
tfidf_params = dict(binary=False, max_df=0.80, min_df=10, max_features=10000)
tfidf_rev_vec = TfidfVectorizer(**tfidf_params)

X_rev_tfidf_train = tfidf_rev_vec.fit_transform(X_rev_train)
X_rev_tfidf_test = tfidf_rev_vec.transform(X_rev_test)

In [None]:
# IDF weight of every term in the TF-IDF vocabulary.
# The index must come from `tfidf_rev_vec` itself: `vect` is a different
# vectorizer (it removes English stop words), so its feature names do not line
# up with `tfidf_rev_vec.idf_`.
df_idf = pd.DataFrame(
    tfidf_rev_vec.idf_,
    index=tfidf_rev_vec.get_feature_names_out(),
    columns=["idf_weights"],
)

# sort ascending (lowest IDF first)
df_idf.sort_values(by=['idf_weights'])

In [None]:
# Logistic regression trained on the TF-IDF features
nb_rev = LogisticRegression()
nb_rev.fit(X=X_rev_tfidf_train, y=y_rev_train)

In [None]:
# Per-class precision / recall / F1 of the TF-IDF model on the test split
preds = nb_rev.predict(X_rev_tfidf_test)
tfidf_report = classification_report(y_rev_test, preds)
print(tfidf_report)

In [None]:
# Confusion matrix of the TF-IDF model's test predictions
conf_mtx = confusion_matrix(y_rev_test, preds)
conf_mtx_plot = ConfusionMatrixDisplay(confusion_matrix=conf_mtx)
conf_mtx_plot.plot(cmap='plasma')

In [None]:
# Show logistic-regression weights for the TF-IDF features
# https://www.nlplanet.org/course-practical-nlp/01-intro-to-nlp/04-n-grams
# Vocabulary terms ordered by column index, so they line up with nb_rev.coef_[0].
keys_values_sorted = sorted(tfidf_rev_vec.vocabulary_.items(), key=lambda t: t[1])
keys_sorted = list(zip(*keys_values_sorted))[0]

# term -> weight lookup (the bare `from_unigram_to_weight` expression that
# followed was never displayed and has been dropped)
from_unigram_to_weight = dict(zip(keys_sorted, nb_rev.coef_[0]))

# 20 terms with the largest positive weights
print('Positive')
sorted(zip(nb_rev.coef_[0], keys_sorted), reverse=True)[:20]

In [None]:
# 20 terms with the most negative TF-IDF model weights (ascending sort)
print('Negative')
weight_term_pairs = zip(nb_rev.coef_[0], keys_sorted)
sorted(weight_term_pairs)[:20]

# **Let's build a pipeline**

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Counts (uni- and bigrams) -> TF-IDF weighting -> logistic regression.
# Alternative final steps that can be swapped in:
#   ('clf', LogisticRegression(solver='lbfgs'))
#   ('clf', DecisionTreeClassifier(max_depth=35))
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(fit_intercept=True)),
]
text_clf = Pipeline(pipeline_steps)

In [None]:
# Train on the raw training reviews, then report the score on the test split
text_clf.fit(X_rev_train, y_rev_train).score(X_rev_test, y_rev_test)

In [None]:
# Predicted class and class probabilities for a toy review
sample_review = ['false awful we we bla bla bla great great']
print(text_clf.predict(sample_review))
print(text_clf.predict_proba(sample_review))

# **Let's add PCA/SVD to our features**

In [None]:
from sklearn.decomposition import TruncatedSVD

# Same pipeline as before with a 500-component truncated SVD in front of the
# classifier. TruncatedSVD accepts the sparse TF-IDF matrix directly, so there
# is no need to densify it first.
svd_steps = [
    ('vect', CountVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=500, random_state=42)),
    ('clf', LogisticRegression(fit_intercept=True)),
]
text_clf = Pipeline(svd_steps)

In [None]:
# Train the SVD pipeline, then report the score on the test split
text_clf.fit(X_rev_train, y_rev_train).score(X_rev_test, y_rev_test)

In [None]:
# Predicted class and class probabilities of the SVD pipeline for a toy review
sample_review = ['false awful we we bla bla bla great great']
print(text_clf.predict(sample_review))
print(text_clf.predict_proba(sample_review))

# Vocabulary Stemmer and Lemmatization

In [None]:
import re  # used by stem_tokenizer below; imported here so this cell is self-sufficient

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn import metrics
from sklearn.linear_model import SGDClassifier

# Download the required NLTK data. 'stopwords' backs stopwords.words() below.
# NOTE(review): 'wordnet' and 'punkt' are not used by the code visible in this
# cell — kept in case other cells rely on them.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stemmer and stop-word list (kept as a list: it is also passed to
# CountVectorizer(stop_words=...) further down)
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

def stem_tokenizer(text):
    """Tokenize a review into stemmed, stop-word-free tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is split on whitespace. Tokens found in
    the module-level ``stop_words`` are dropped; the rest are stemmed with the
    module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    # Set membership is O(1); testing against the stop-word list directly is a
    # linear scan for every single token.
    excluded = frozenset(stop_words)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    return [stemmer.stem(token) for token in letters_only.split() if token not in excluded]

# Stemmed uni-/bigram counts -> TF-IDF -> linear SVM trained with SGD (hinge loss)
pipelineSGD = Pipeline([
    ('vectorizer', CountVectorizer(
        tokenizer=stem_tokenizer,
        # A custom tokenizer makes token_pattern unused; setting it to None
        # tells scikit-learn so and avoids its "token_pattern will not be used"
        # warning.
        token_pattern=None,
        ngram_range=(1, 2),
        lowercase=False  # we already lowercased in tokenizer
    )),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', alpha=1e-4, max_iter=100, random_state=42))
])

pipelineSGD.fit(X_rev_train, y_rev_train)

y_predSGD = pipelineSGD.predict(X_rev_test)
print("Predicted: ", y_predSGD)

print("Accuracy: ", metrics.accuracy_score(y_rev_test, y_predSGD)*100, "%")

In [None]:
# Walk through the cleaning/stemming steps by hand on one review, then classify
# the raw text with the SGD pipeline.
text = "DONT WATCH THIS MOVIE. its a complete trash"
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
CLEAN = re.compile("<.*?>")    # to remove everything between "<>"
# Load the stop-word list once; calling stopwords.words() inside the
# comprehension re-reads it for every token.
english_stop_words = set(stopwords.words("english"))
result = CLEAN.sub(" ", text)
result = re.sub("[^a-zA-Z]", " ", result)   # keep ASCII letters only
result = result.lower()
result = result.split()   # to break sentences into words
word = [ps.stem(token) for token in result if token not in english_stop_words]
result = " ".join(word)
print(result)
print(pipelineSGD.predict([text]))

# The pipeline examples

In [None]:
from sklearn.svm import LinearSVC

# Uni-/bigram counts (unlimited vocabulary) -> TF-IDF -> linear SVM
svm_steps = [
    ('vectorizer', CountVectorizer(stop_words=stop_words, ngram_range=(1, 2), max_features=None)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=0.5, max_iter=1000, penalty='l2', tol=0.01)),
]
pipelineSVM = Pipeline(svm_steps)

pipelineSVM.fit(X_rev_train, y_rev_train)

y_predSVM = pipelineSVM.predict(X_rev_test)
print("Predicted: ", y_predSVM)

svm_accuracy_pct = metrics.accuracy_score(y_rev_test, y_predSVM)*100
print("Accuracy: ", svm_accuracy_pct, "%")

In [None]:
# Unigram pipeline: tokens of 2+ word characters, accents stripped, NLTK stop
# words removed, vocabulary capped at 10,000 terms.
unigram_vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{2,}',
    strip_accents='unicode',
    stop_words=stop_words,
    max_features=10000,
    ngram_range=(1, 1),
)
text_clf = Pipeline([
    ('vect', unigram_vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])
text_clf.fit(X_rev_train, y_rev_train)
text_clf.score(X_rev_test, y_rev_test)

In [None]:
# Terms with the largest positive logistic-regression weights, i.e. the words
# that push a prediction towards the positive class
# https://github.com/Kasra1377/IMDB-sentiment-analysis/blob/master/data-preprocessing.ipynb
features = text_clf.named_steps['vect'].get_feature_names_out()
clf_weights = text_clf.named_steps['clf'].coef_[0]
sorted(zip(clf_weights, features), reverse=True)[:20]


In [None]:
# Terms with the most negative logistic-regression weights, i.e. the words
# that push a prediction towards the negative class
clf_weights = text_clf.named_steps['clf'].coef_[0]
sorted(zip(clf_weights, features), reverse=False)[:20]

Examining the Model Performance

In [None]:

# Sample negative review used to probe the model
text = 'DONT WATCH THIS MOVIE. its a complete trash'

In [None]:
# Class probabilities of the unigram pipeline for the raw (unstemmed) sample review
sample_proba = text_clf.predict_proba([text])
print(sample_proba)

# **The best sklearn result**

In [None]:
# Reference: https://www.kaggle.com/code/adamschroeder/countvectorizer-tfidfvectorizer-predict-comments
# Import the constant directly: `from sklearn.feature_extraction import text`
# would overwrite the notebook's `text` variable (the sample review string).
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# NOTE(review): my_stop_words is not used anywhere in this cell.
my_stop_words = ENGLISH_STOP_WORDS.union(["book"])

pipe = make_pipeline(TfidfVectorizer(
                                    stop_words='english',
                                    strip_accents='unicode',
                                    token_pattern=r'\w{1,}', #accept tokens that have 1 or more characters
                                    analyzer='word',
                                    ngram_range=(1, 2),
                                    min_df=100),
                     LinearSVC(C = 0.5, max_iter = 1000, penalty = 'l2', tol = 0.01))
# The grid holds a single combination, so this is effectively a 3-fold CV of
# one configuration. Grid values take precedence over the constructor
# arguments above: stop_words becomes None and C becomes 0.1.
param_grid = {'tfidfvectorizer__max_features': [None],
              'tfidfvectorizer__sublinear_tf': [True],
              'tfidfvectorizer__smooth_idf': [True],
              'tfidfvectorizer__use_idf': [True],
              'tfidfvectorizer__stop_words': [None],
              'linearsvc__C': [ 0.1],

             }
grid = GridSearchCV(pipe, param_grid, cv=3)

# fit() returns the search object itself, so grid3 is the same object as grid
grid3 = grid.fit(X_rev_train, y_rev_train)



In [None]:
# Best parameter combination and its mean cross-validated score
print(grid3.best_params_, grid3.best_score_, sep='\n')


In [None]:
# Score of the fitted grid search on the held-out test split
grid3.score(X_rev_test,y_rev_test)

In [None]:
# For comparison, the best published results:
# https://paperswithcode.com/sota/sentiment-analysis-on-imdb

In [None]:
# Predicted class for a hand-written positive review
custom_review = ['ok recommended its greate 10 of 10 movie']
print(grid3.predict(custom_review))

In [None]:
# Test-split predictions of the fitted grid search
y_pred = grid3.predict(X_rev_test)

In [None]:
# Test accuracy of the grid-search model, as a percentage
accuracy_pct = metrics.accuracy_score(y_rev_test, y_pred)*100
print("Accuracy: ", accuracy_pct, "%")

In [None]:
# Positions of the test reviews the model got wrong.
# Reuses y_pred (computed above with grid3.predict(X_rev_test)) instead of
# running the whole prediction a second time.
y_test = np.asarray(y_rev_test)
misclassified = np.where(y_test != y_pred)
len(misclassified[0])

In [None]:
# Inspect one misclassified review (i indexes into the list of misclassified
# positions and must be smaller than len(misclassified[0])).
i = 200
idx = misclassified[0][i]
# Report the model's actual prediction rather than inferring it as 1 - true label.
print('True label=', y_test[idx], ' predicted =', y_pred[idx])
# idx is a position, so use positional indexing on the Series.
X_rev_test.iloc[idx]


In [None]:
# Confusion matrix of the grid-search model on the test split.
# Reuses y_pred (computed above with grid3.predict(X_rev_test)) instead of
# predicting again.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=grid3.classes_)
disp.plot()
plt.show()

# Let's cluster the training part of the dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
import numpy as np
import pandas as pd

# 1. TF-IDF over uni- to tri-grams; trigrams let near-identical sentences
#    (duplicates / boilerplate) end up close together.
# NOTE: this rebinds `vect`, which earlier cells use for the CountVectorizer.
vect = TfidfVectorizer(
    ngram_range=(1,3),
    max_features=20000,
    min_df=3,
    stop_words='english',
    lowercase=True
)

X = vect.fit_transform(X_rev_train)   # X_rev_train: pandas Series of raw review texts

# 2. Cluster the reviews with MiniBatchKMeans
n_clusters = 100
kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=1000)
labels = kmeans.fit_predict(X)

# 3. Rank clusters by size, largest first. minlength keeps one entry per
#    cluster id even if the highest-numbered clusters are empty.
cluster_sizes = np.bincount(labels, minlength=n_clusters)
biggest_clusters = np.argsort(cluster_sizes)[::-1]

print("Top 10 biggest clusters (these are almost always spam):")
for i in biggest_clusters[:10]:
    size = cluster_sizes[i]
    print(f"\nCluster {i} → {size} reviews ({size/len(X_rev_train)*100:.1f}%)")

    # Show 3 example reviews from this cluster
    examples_idx = np.where(labels == i)[0][:3]
    for j, idx in enumerate(examples_idx, 1):
        # idx is a row position, so index positionally; a separate name avoids
        # overwriting the notebook's `text` variable.
        review = X_rev_train.iloc[idx]
        print(f"   {j}. \"{review.replace(chr(10), ' ')[:180]}...\"")

In [None]:
def filter_keywords(X_rev_train, labels, cluster, n_keep=3, kw_model=None) -> str:
    """Describe one cluster of reviews with a short keyword string.

    All reviews whose label equals ``cluster`` are joined into one document,
    KeyBERT proposes ``2 * n_keep`` single-word keywords, and up to ``n_keep``
    of them are kept, skipping any keyword that contains, or is contained in,
    an already selected one (e.g. "films" after "film").

    Parameters
    ----------
    X_rev_train : pandas.Series
        Review texts; selected with a boolean mask via ``.loc``.
    labels : numpy.ndarray
        Cluster id of every review, aligned with ``X_rev_train``.
    cluster : int
        Id of the cluster to describe.
    n_keep : int, default 3
        Maximum number of keywords in the result.
    kw_model : optional
        Object exposing ``extract_keywords(text, keyphrase_ngram_range=...,
        top_n=...)`` that returns ``(keyword, score)`` pairs. When omitted, a
        ``KeyBERT()`` instance is created on the first call and reused
        afterwards, instead of loading a new model on every call.

    Returns
    -------
    str
        ``"A, b and c"``-style string with the first letter capitalised, or
        ``"No keywords"`` when nothing was extracted.
    """
    if kw_model is None:
        # Lazily build the model once and cache it on the function object.
        kw_model = getattr(filter_keywords, "_cached_model", None)
        if kw_model is None:
            kw_model = KeyBERT()
            filter_keywords._cached_model = kw_model

    mask = (labels == cluster)
    text = ". ".join(X_rev_train.loc[mask].astype(str))
    candidates = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1),
                                           top_n=n_keep*2)

    # Greedily select keywords that do not overlap (as substrings) with ones
    # already chosen; candidates are taken in the order KeyBERT returned them.
    selected = []
    for kw, _score in candidates:
        if any(kw in seen or seen in kw for seen in selected):
            continue
        selected.append(kw)
        if len(selected) >= n_keep:
            break

    # Format as a readable enumeration.
    if not selected:
        return "No keywords"
    if len(selected) == 1:
        return selected[0].capitalize()
    return (", ".join(selected[:-1]) + " and " + selected[-1]).capitalize()

In [None]:
def filter_keywords_in_text(text_to_analize, n_keep=3, kw_model=None) -> str:
    """Describe a collection of texts with a short keyword string.

    The texts are joined into one document, KeyBERT proposes ``2 * n_keep``
    single-word keywords, and up to ``n_keep`` of them are kept, skipping any
    keyword that contains, or is contained in, an already selected one.

    Parameters
    ----------
    text_to_analize : pandas.Series or numpy.ndarray
        Texts to summarise; must provide ``.astype(str)``.
    n_keep : int, default 3
        Maximum number of keywords in the result.
    kw_model : optional
        Object exposing ``extract_keywords(text, keyphrase_ngram_range=...,
        top_n=...)`` that returns ``(keyword, score)`` pairs. When omitted, a
        ``KeyBERT()`` instance is created on the first call and reused
        afterwards, instead of loading a new model on every call.

    Returns
    -------
    str
        ``"A, b and c"``-style string with the first letter capitalised, or
        ``"No keywords"`` when nothing was extracted.
    """
    if kw_model is None:
        # Lazily build the model once and cache it on the function object.
        kw_model = getattr(filter_keywords_in_text, "_cached_model", None)
        if kw_model is None:
            kw_model = KeyBERT()
            filter_keywords_in_text._cached_model = kw_model

    text = ". ".join(text_to_analize.astype(str))
    candidates = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1),
                                           top_n=n_keep*2)

    # Greedily select keywords that do not overlap (as substrings) with ones
    # already chosen; candidates are taken in the order KeyBERT returned them.
    selected = []
    for kw, _score in candidates:
        if any(kw in seen or seen in kw for seen in selected):
            continue
        selected.append(kw)
        if len(selected) >= n_keep:
            break

    # Format as a readable enumeration.
    if not selected:
        return "No keywords"
    if len(selected) == 1:
        return selected[0].capitalize()
    return (", ".join(selected[:-1]) + " and " + selected[-1]).capitalize()

In [None]:
# Keyword summary for a single cluster (id 27), as a quick check of filter_keywords
keywords = filter_keywords(X_rev_train, labels, 27, n_keep=3)
print(keywords)

In [None]:

# For the 40 largest clusters: size, share of the training set, keyword summary
# and a few example reviews.
print("Top 40 biggest clusters (these are almost always spam):")
for i in biggest_clusters[:40]:
    size = cluster_sizes[i]
    print(f"\nCluster {i} → {size} reviews ({size/len(X_rev_train)*100:.1f}%) Keywords: ", filter_keywords(X_rev_train, labels, cluster=i, n_keep=3))

    # Show up to 5 example reviews from this cluster
    examples_idx = np.where(labels == i)[0][:5]
    for j, idx in enumerate(examples_idx, 1):
        # idx is a row position, so index positionally; a separate name avoids
        # overwriting the notebook's `text` variable.
        review = X_rev_train.iloc[idx]
        print(f"   {j}. \"{review.replace(chr(10), ' ')[:180]}...\"")