In [2]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 4.1 MB/s eta 0:00:01
[?25hCollecting regex>=2021.8.3
  Downloading regex-2024.9.11-cp39-cp39-macosx_11_0_arm64.whl (284 kB)
[K     |████████████████████████████████| 284 kB 6.7 MB/s eta 0:00:01
Collecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 888 kB/s eta 0:00:01
[?25hInstalling collected packages: regex, click, nltk
Successfully installed click-8.1.7 nltk-3.9.1 regex-2024.9.11
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Import here so this cell also works on a fresh kernel (Restart & Run All);
# no earlier cell in the notebook imports nltk.
import nltk

# Tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/delinaivanova/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Stop-word lists; read later in the notebook via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/delinaivanova/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# WordNet corpus; the WordNetLemmatizer used in the preprocessing example relies on it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/delinaivanova/nltk_data...


True

#### Stemming:
Definition: Stemming is a crude heuristic process that chops off the ends of words to reduce them to a base form (the stem). The resulting stems are often not valid dictionary words, and the process does not consider the context in which a word is used.
Example:
"Running" → "Run"
"Studies" → "Studi"
"Happily" → "Happili"
Pros: Stemming is typically faster since it's just cutting off word suffixes using predefined rules.
Cons: The output may not always be linguistically correct (e.g., "happily" becomes "happili" or "studies" becomes "studi").

#### Lemmatization:
Definition: Lemmatization is a more sophisticated process that reduces words to their dictionary form (lemma), considering the context and part of speech. It ensures that the base form is an actual word.

Example:

"Running" → "Run"
"Studies" → "Study"
"Better" → "Good"
Pros: Lemmatization produces more accurate base forms of words, which are valid and meaningful.

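Cons: It is slower compared to stemming, as it needs to analyze the word and sometimes the sentence to determine the part of speech. Note that NLTK's `WordNetLemmatizer.lemmatize` treats every word as a noun unless a `pos` argument is passed, so results such as "Better" → "Good" require `pos='a'` (and "Running" → "Run" requires `pos='v'`); this is why "ate" is left unchanged in the example below.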

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Text that the whole preprocessing walk-through operates on
sentence = "I ate an apple today! It was red and delicious."

# Step 1: split the raw text into word and punctuation tokens
tokens = word_tokenize(sentence)
print("Tokens:", tokens)

# Step 2: keep only purely alphabetic tokens (drops punctuation), lowercased
tokens = list(map(str.lower, filter(str.isalpha, tokens)))
print("Tokens after removing punctuation:", tokens)

# Step 3: drop English stop words (set membership keeps the lookup cheap)
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda word: word not in stop_words, tokens))
print("Filtered Tokens (no stopwords):", filtered_tokens)

# Step 4: reduce each remaining token with both techniques for comparison
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

print("Stemmed Tokens:", stemmed_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)


Tokens: ['I', 'ate', 'an', 'apple', 'today', '!', 'It', 'was', 'red', 'and', 'delicious', '.']
Tokens after removing punctuation: ['i', 'ate', 'an', 'apple', 'today', 'it', 'was', 'red', 'and', 'delicious']
Filtered Tokens (no stopwords): ['ate', 'apple', 'today', 'red', 'delicious']
Stemmed Tokens: ['ate', 'appl', 'today', 'red', 'delici']
Lemmatized Tokens: ['ate', 'apple', 'today', 'red', 'delicious']


Bag of words: simply counts the number of times each word appears in a document, ignoring word order.
N-gram: looks at combinations of consecutive words (e.g. bigrams are pairs of adjacent words) and how often they appear.

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Four short product reviews used as the toy corpus
documents = [
    "I love the product. It is amazing and easy to use.",
    "The product did not work as expected. Very disappointing.",
    "Fantastic product! I would recommend it to everyone.",
    "Terrible product. Waste of money."
]


def _term_frame(vectorizer, texts):
    """Fit ``vectorizer`` on ``texts``.

    Returns the matrix produced by ``fit_transform`` together with a dense
    DataFrame view of it whose columns are the vectorizer's feature names.
    """
    matrix = vectorizer.fit_transform(texts)
    frame = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
    return matrix, frame


# Bag-of-Words: one column per single-word feature
vectorizer_bow = CountVectorizer()
X_bow, bow_df = _term_frame(vectorizer_bow, documents)
print("Bag-of-Words Model:\n", bow_df)

# N-Gram model restricted to bigrams: ngram_range=(2, 2)
vectorizer_ngram = CountVectorizer(ngram_range=(2, 2))
X_ngram, ngram_df = _term_frame(vectorizer_ngram, documents)
print("N-Gram Model (Bigrams):\n", ngram_df)


Bag-of-Words Model:
    amazing  and  as  did  disappointing  easy  everyone  expected  fantastic  \
0        1    1   0    0              0     1         0         0          0   
1        0    0   1    1              1     0         0         1          0   
2        0    0   0    0              0     0         1         0          1   
3        0    0   0    0              0     0         0         0          0   

   is  ...  product  recommend  terrible  the  to  use  very  waste  work  \
0   1  ...        1          0         0    1   1    1     0      0     0   
1   0  ...        1          0         0    1   0    0     1      0     1   
2   0  ...        1          1         0    0   1    0     0      0     0   
3   0  ...        1          0         1    0   0    0     0      1     0   

   would  
0      0  
1      0  
2      1  
3      0  

[4 rows x 25 columns]
N-Gram Model (Bigrams):
    amazing and  and easy  as expected  did not  easy to  expected very  \
0            1 

What the TF-IDF Output Shows:
The TF-IDF matrix is a 4x25 table where:

* Rows: Each row represents a document from the input documents. In this case, there are 4 rows, meaning you have 4 documents.
* Columns: Each column represents a unique word from the corpus (all the documents combined). In this case, there are 25 unique words across all documents.
* Values (TF-IDF Scores): The values in the matrix are the TF-IDF scores for each word in each document. The TF-IDF score tells us how important a word is in a particular document relative to the entire corpus.

Key Points to Consider:
* High TF-IDF Score: A higher score indicates that the word is important to that specific document but not very common across all documents.
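* Low TF-IDF Score: A lower score indicates that the word is either common across all documents or is not particularly important in this document. For example, "product" appears in every document, so it receives the lowest non-zero score in each row of the output below.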
* Zeros: A score of 0 means the word does not appear in that document.

* High TF-IDF Score: Indicates that the word is frequent in a specific document (high TF) but rare across the entire corpus (high IDF). Such words are considered important because they capture the document's unique content.
* Low TF-IDF Score: Indicates either that the word is common across many documents (low IDF) or that it appears rarely within the document (low TF). Such words are not considered important or distinctive.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same corpus
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(documents)

# Dense, labelled view of the sparse matrix: one row per document, one column per term
tfidf_df = pd.DataFrame(
    data=X_tfidf.toarray(),
    columns=vectorizer_tfidf.get_feature_names_out(),
)
print("TF-IDF Model:\n", tfidf_df)


TF-IDF Model:
     amazing       and        as       did  disappointing      easy  everyone  \
0  0.350562  0.350562  0.000000  0.000000       0.000000  0.350562  0.000000   
1  0.000000  0.000000  0.355921  0.355921       0.355921  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000       0.000000  0.000000  0.425802   
3  0.000000  0.000000  0.000000  0.000000       0.000000  0.000000  0.000000   

   expected  fantastic        is  ...   product  recommend  terrible  \
0  0.000000   0.000000  0.350562  ...  0.182938   0.000000  0.000000   
1  0.355921   0.000000  0.000000  ...  0.185734   0.000000  0.000000   
2  0.000000   0.425802  0.000000  ...  0.222201   0.425802  0.000000   
3  0.000000   0.000000  0.000000  ...  0.252468   0.000000  0.483803   

        the        to       use      very     waste      work     would  
0  0.276387  0.276387  0.350562  0.000000  0.000000  0.000000  0.000000  
1  0.280612  0.000000  0.000000  0.355921  0.000000  0.355921  0.000000  
2

In [14]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 946 kB/s eta 0:00:011
Collecting smart-open>=1.8.1
  Downloading smart_open-7.0.5-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 573 kB/s eta 0:00:011
Collecting wrapt
  Downloading wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl (38 kB)
Installing collected packages: wrapt, smart-open, gensim
Successfully installed gensim-4.3.3 smart-open-7.0.5 wrapt-1.16.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenize each sample review; Word2Vec is trained on lists of tokens
sentences = [
    word_tokenize(text)
    for text in (
        "I love the product. It is amazing and easy to use.",
        "The product did not work as expected. Very disappointing.",
        "Fantastic product! I would recommend it to everyone.",
        "Terrible product. Waste of money.",
    )
]

# 50-dimensional embeddings; min_count=1 keeps every token of this tiny corpus
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=50,
    window=5,
    min_count=1,
    workers=4,
)

# Learned vector for the token "product"
print("Word Embedding for 'product':\n", word2vec_model.wv['product'])

# Five nearest neighbours of "product" in the embedding space
similar_words = word2vec_model.wv.most_similar('product', topn=5)
print("Words similar to 'product':", similar_words)


Word Embedding for 'product':
 [-0.01632074  0.0089843  -0.00828079  0.00164966  0.01698902 -0.00892575
  0.00904536 -0.01356366 -0.00710585  0.01880129 -0.00315039  0.0006427
 -0.00827438 -0.01536828 -0.00301453  0.00495137 -0.00178086  0.01107326
 -0.00549231  0.00450072  0.01091657  0.0166995  -0.00290105 -0.0184196
  0.00874619  0.00114292  0.01488733 -0.00162391 -0.00528543 -0.01750779
 -0.00171116  0.00566212  0.01080635  0.0141175  -0.01139941  0.00372219
  0.01218469 -0.00959842 -0.0062074   0.01359864  0.00326303  0.00038398
  0.00693083  0.00044581  0.01924413  0.01012162 -0.0178444  -0.01409721
  0.00179413  0.01279023]
Words similar to 'product': [('work', 0.230143204331398), ('Waste', 0.2206723541021347), ('it', 0.21899765729904175), ('Very', 0.1608201116323471), ('expected', 0.14899705350399017)]


In [17]:
import pandas as pd

# Hand-written synthetic reviews paired with binary sentiment labels
data = dict(
    Review=[
        'I love the product! It works great and is very easy to use.',
        'The product was okay, but I had some issues with customer service.',
        'I am very disappointed. The product stopped working after two weeks.',
        'Fantastic product! Will definitely buy again.',
        'Not worth the money. Very poor quality.',
        'Absolutely horrible experience. The product broke after one day.',
        'I would recommend this product to everyone. Best purchase I ever made!',
        'Customer service was great, but the product was just average.',
        'The product exceeded my expectations! Worth every penny.',
        'Terrible! The product didn’t work as advertised.',
    ],
    Sentiment=[1, 0, 0, 1, 0, 0, 1, 0, 1, 0],  # 1 = positive, 0 = negative
)

# One row per review, columns 'Review' and 'Sentiment'
reviews_df = pd.DataFrame.from_dict(data)


In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import gensim
from nltk.tokenize import word_tokenize

# Review texts and their sentiment labels as plain Python lists
documents, labels = (
    reviews_df[column].tolist() for column in ('Review', 'Sentiment')
)

# Word2Vec consumes token lists, so tokenize every review
sentences = list(map(word_tokenize, documents))

# Fit 50-dimensional embeddings on the tokenized reviews
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=50,
    window=5,
    min_count=1,
    workers=4,
)

# Function to convert a document to its embedding representation (summing vectors)
def document_embedding(doc):
    """Embed a document as the element-wise sum of its tokens' word vectors.

    The document is tokenized with ``word_tokenize`` and looked up in the
    module-level ``word2vec_model``; tokens missing from the model's
    vocabulary are ignored.

    Args:
        doc: Raw document text.

    Returns:
        A 1-D array with one entry per embedding dimension. When none of the
        tokens are in the vocabulary, a zero vector of the same length.
    """
    keyed_vectors = word2vec_model.wv
    tokens = word_tokenize(doc)
    valid_words = [word for word in tokens if word in keyed_vectors]
    if valid_words:
        # Sum (not mean) of the token vectors, as in the original design
        return np.sum([keyed_vectors[word] for word in valid_words], axis=0)
    # Size the fallback from the model rather than a hard-coded 50 so that it
    # stays consistent if the model's vector_size is changed.
    return np.zeros((keyed_vectors.vector_size,))

# One embedding row per review
X = np.array(list(map(document_embedding, documents)))

# Hold out 20% of the reviews for evaluation (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with default settings; fit() returns the estimator itself
clf = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out reviews
accuracy = clf.score(X_test, y_test)
print(f"Sentiment Classifier Accuracy: {accuracy:.2%}")


Sentiment Classifier Accuracy: 50.00%
