# 2022-12-07

In [42]:
import re
from collections import Counter
from typing import List

import eli5
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

In [40]:
# Downloading the required nltk dependencies
nltk.download("stopwords") # <- stopwords
nltk.download("wordnet") # <- lemmatizer
nltk.download("omw-1.4") # <- stemmer

nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/dqmis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dqmis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/dqmis/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dqmis/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Let's write a transformer!!

In [50]:
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self) -> None:
        self._stemmer = SnowballStemmer("english")
        self._lemmatizer = WordNetLemmatizer()
        self._tokenizer = RegexpTokenizer(r"\w+")

    def fit(self, X, y=None):
        return self

    def _clean_html(self, text: str) -> str:
        # Fuction that removes html artifacts
        return re.sub(r"<.*?>", "", text).strip()

    def _transform_text(self, text: str) -> str:
        text = self._clean_html(text)
        words = self._tokenizer.tokenize(text)
        return " ".join([
            self._stemmer.stem(self._lemmatizer.lemmatize(word)) for word in words
        ])

    def transform(self, X: List[str], y=None) -> List[str]:
        return [self._transform_text(x) for x in X]

In [51]:
text_transformer = TextTransformer()

In [5]:
cv = CountVectorizer()

In [8]:
df = pd.read_csv("../data/imdb.csv")

In [9]:
cv.fit_transform(df["review"].values)

<50000x101895 sparse matrix of type '<class 'numpy.int64'>'
	with 6826529 stored elements in Compressed Sparse Row format>

In [52]:
x = text_transformer.transform(df["review"].values)

In [53]:
cv = CountVectorizer()
cv.fit_transform(x)

<50000x72846 sparse matrix of type '<class 'numpy.int64'>'
	with 6544070 stored elements in Compressed Sparse Row format>

In [None]:
100712

In [54]:
len(cv.get_feature_names_out())

72846

### Creating pipeline

In [72]:
pipeline = Pipeline(
    [
        ("text_preprocessor", TextTransformer()),
        ("cv", CountVectorizer(stop_words=["english"])),
        ("model", LogisticRegression()),
    ]
)

In [64]:
df = pd.read_csv("../data/imdb.csv")
df["label"] = df["sentiment"].apply(lambda x: int(x == "positive"))

df.groupby(by=["label"]).size()

label
0    25000
1    25000
dtype: int64

In [65]:
train_df, test_df = train_test_split(df)

In [73]:
pipeline.fit(train_df["review"].values, train_df["label"].values, )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
y_true = test_df["label"].values

predictions = pipeline.predict(test_df["review"].values)

print("Accuracy: ", accuracy_score(y_true, predictions))
print(classification_report(y_true, predictions))

Accuracy:  0.882
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      6223
           1       0.88      0.89      0.88      6277

    accuracy                           0.88     12500
   macro avg       0.88      0.88      0.88     12500
weighted avg       0.88      0.88      0.88     12500



In [60]:
y_true = test_df["label"].values

predictions = pipeline.predict(test_df["review"].values)

print("Accuracy: ", accuracy_score(y_true, predictions))
print(classification_report(y_true, predictions))

Accuracy:  0.88456
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      6210
           1       0.88      0.89      0.89      6290

    accuracy                           0.88     12500
   macro avg       0.88      0.88      0.88     12500
weighted avg       0.88      0.88      0.88     12500



In [71]:
pipeline.predict_proba(
    ["This movies was really slow and uninteresting. I fell asleep while wathching it"]
)

NotFittedError: Vocabulary not fitted or provided

In [151]:
pipeline.predict_proba(["What an experience! This is the best cinema can offer"])

array([[0.24698671, 0.75301329]])

In [159]:
pipeline.predict_proba(["Sitas filmas buvo pasibaisetinas. Uzmigau kai ji ziurejau"])

array([[0.45780823, 0.54219177]])

In [154]:
pipeline.predict_proba(
    [
        "What an experience! This is the best cinema can offer. This movies was really slow and"
        " uninteresting. I fell asleep while wathching it"
    ]
)

array([[0.52812123, 0.47187877]])

In [75]:
feature_names = pipeline.named_steps["cv"].get_feature_names_out()
feature_names

array(['00', '000', '00000000000', ..., 'þorleifsson', 'þór', 'żmijewski'],
      dtype=object)

In [76]:
eli5.explain_weights(pipeline.named_steps["model"], top=50, feature_names=feature_names)

Weight?,Feature
+2.173,refresh
+1.647,subtl
+1.633,pleasant
+1.538,unexpect
+1.482,marvel
+1.384,excel
+1.333,underr
+1.328,notch
+1.316,flawless
+1.301,gem
