In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp38-cp38-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 15.9 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp38-cp38-win_amd64.whl (274 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6


In [47]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt_tab') # word_tokenize
nltk.download('stopwords') # stopwords
nltk.download('wordnet') # WordNetLemmatizer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Sample text

In [48]:
text = "Cats were chasing the mice while the dogs were barking loudly in the yard."

### Tokenization

In [49]:
tokens = word_tokenize(text.lower())
print("Tokens:\n", tokens)

Tokens:
 ['cats', 'were', 'chasing', 'the', 'mice', 'while', 'the', 'dogs', 'were', 'barking', 'loudly', 'in', 'the', 'yard', '.']


### Removing Stopwords

In [50]:
filtered = [w for w in tokens if w not in stopwords.words('english')] # a list comprehension for accessing each word in tokens
removed = [w for w in tokens if w in stopwords.words('english')] # a list comprehension for accessing each word in tokens

print("words with less english meaning (removed words):",removed)
print("\nwords with more english meaning (kept words)   :",filtered)


words with less english meaning (removed words): ['were', 'the', 'while', 'the', 'were', 'in', 'the']

words with more english meaning (kept words)   : ['cats', 'chasing', 'mice', 'dogs', 'barking', 'loudly', 'yard', '.']


### Stemming

In [51]:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered]

print("after stemming", stemmed)

after stemming ['cat', 'chase', 'mice', 'dog', 'bark', 'loudli', 'yard', '.']


### Lemmatization

In [54]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]
print("Lemmatized:", lemmatized)

Lemmatized: ['cat', 'chasing', 'mouse', 'dog', 'barking', 'loudly', 'yard', '.']


# Feature Representation in NLP

### 1. Bag of Words

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

docs = ["I love NLP", "I love machine learning"]
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(x.toarray())

['learning' 'love' 'machine' 'nlp']
[[0 1 0 1]
 [1 1 1 0]]


### 2. TF-IDF

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["I love NLP", "I love machine learning"]
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())
print(x.toarray())

['learning' 'love' 'machine' 'nlp']
[[0.         0.57973867 0.         0.81480247]
 [0.6316672  0.44943642 0.6316672  0.        ]]


### Mini Project

In [79]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# some tweets
texts = [
    # Positive reviews (1)
    "I love this movie",
    "Amazing experience, I enjoyed it",
    "Fantastic acting and story",
    "Great film with beautiful cinematography",
    "The music and emotions were perfect",
    "Wonderful direction and amazing visuals",
    "A masterpiece of storytelling",
    "Heartwarming and inspiring movie",
    "Brilliant performance by the cast",
    "Loved every minute of it",
    "The ending was powerful and emotional",
    "Such a fun and entertaining film",
    "The characters felt real and relatable",
    "A beautiful and touching experience",
    "An absolute joy to watch",

    # Negative reviews (0)
    "This film was terrible",
    "I hate this movie",
    "Worst movie ever",
    "The plot was boring and predictable",
    "Bad acting and weak script",
    "I almost fell asleep while watching",
    "Too long and painfully slow",
    "Poor direction and horrible pacing",
    "The ending made no sense",
    "Awful sound quality and editing",
    "Disappointing and overhyped",
    "Nothing interesting happened",
    "I regret spending time on this",
    "The movie was just a waste of time"
]

labels = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 15 positives
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0       # 15 negatives
]
# --- Bag-of-Words ---
bow = CountVectorizer()
X_bow = bow.fit_transform(texts)
X_train, X_test, y_train, y_test = train_test_split(X_bow, labels, test_size=0.3, random_state=42)

model_bow = MultinomialNB() # Naive Bayes
model_bow.fit(X_train, y_train)
pred_bow = model_bow.predict(X_test)
print("BoW Accuracy:", accuracy_score(y_test, pred_bow))

# --- TF-IDF ---
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(texts)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.3, random_state=42)

model_tfidf = MultinomialNB() # Naive Bayes
model_tfidf.fit(X_train, y_train)
pred_tfidf = model_tfidf.predict(X_test)


print("TF-IDF Accuracy:", accuracy_score(y_test, pred_tfidf))


BoW Accuracy: 0.4444444444444444
TF-IDF Accuracy: 0.5555555555555556


In [80]:
print(list(enumerate(bow.get_feature_names_out())))
print(X_bow)

[(0, 'absolute'), (1, 'acting'), (2, 'almost'), (3, 'amazing'), (4, 'an'), (5, 'and'), (6, 'asleep'), (7, 'awful'), (8, 'bad'), (9, 'beautiful'), (10, 'boring'), (11, 'brilliant'), (12, 'by'), (13, 'cast'), (14, 'characters'), (15, 'cinematography'), (16, 'direction'), (17, 'disappointing'), (18, 'editing'), (19, 'emotional'), (20, 'emotions'), (21, 'ending'), (22, 'enjoyed'), (23, 'entertaining'), (24, 'ever'), (25, 'every'), (26, 'experience'), (27, 'fantastic'), (28, 'fell'), (29, 'felt'), (30, 'film'), (31, 'fun'), (32, 'great'), (33, 'happened'), (34, 'hate'), (35, 'heartwarming'), (36, 'horrible'), (37, 'inspiring'), (38, 'interesting'), (39, 'it'), (40, 'joy'), (41, 'just'), (42, 'long'), (43, 'love'), (44, 'loved'), (45, 'made'), (46, 'masterpiece'), (47, 'minute'), (48, 'movie'), (49, 'music'), (50, 'no'), (51, 'nothing'), (52, 'of'), (53, 'on'), (54, 'overhyped'), (55, 'pacing'), (56, 'painfully'), (57, 'perfect'), (58, 'performance'), (59, 'plot'), (60, 'poor'), (61, 'powerf

In [81]:
print(list(enumerate(tfidf.get_feature_names_out())))
print(X_tfidf)

[(0, 'absolute'), (1, 'acting'), (2, 'almost'), (3, 'amazing'), (4, 'an'), (5, 'and'), (6, 'asleep'), (7, 'awful'), (8, 'bad'), (9, 'beautiful'), (10, 'boring'), (11, 'brilliant'), (12, 'by'), (13, 'cast'), (14, 'characters'), (15, 'cinematography'), (16, 'direction'), (17, 'disappointing'), (18, 'editing'), (19, 'emotional'), (20, 'emotions'), (21, 'ending'), (22, 'enjoyed'), (23, 'entertaining'), (24, 'ever'), (25, 'every'), (26, 'experience'), (27, 'fantastic'), (28, 'fell'), (29, 'felt'), (30, 'film'), (31, 'fun'), (32, 'great'), (33, 'happened'), (34, 'hate'), (35, 'heartwarming'), (36, 'horrible'), (37, 'inspiring'), (38, 'interesting'), (39, 'it'), (40, 'joy'), (41, 'just'), (42, 'long'), (43, 'love'), (44, 'loved'), (45, 'made'), (46, 'masterpiece'), (47, 'minute'), (48, 'movie'), (49, 'music'), (50, 'no'), (51, 'nothing'), (52, 'of'), (53, 'on'), (54, 'overhyped'), (55, 'pacing'), (56, 'painfully'), (57, 'perfect'), (58, 'performance'), (59, 'plot'), (60, 'poor'), (61, 'powerf