In [2]:
def find_sentiment(sentence, pos, neg):
    """Label a sentence by comparing how many positive vs. negative words it has.

    The sentence is split on whitespace and de-duplicated, so every distinct
    token is counted at most once.

    :param sentence: text to classify (a string)
    :param pos: iterable of positive words
    :param neg: iterable of negative words
    :return: "positive", "negative" or "neutral" (returned on a tie)
    """
    # Distinct whitespace-separated tokens of the sentence
    tokens = set(sentence.split())
    pos_hits = len(tokens.intersection(pos))
    neg_hits = len(tokens.intersection(neg))
    # A tie, including no matches at all, is neutral
    if pos_hits == neg_hits:
        return "neutral"
    return "positive" if pos_hits > neg_hits else "negative"




### Word tokenize

In [5]:
# NLTK setup: fetch the 'punkt' tokenizer data, then import word_tokenize
import nltk
# The download lands in the local nltk_data directory (see the log output below);
# presumably this is the data word_tokenize relies on at call time
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/danph/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Example sentence reused by the tokenization cells
sentence = "hi, how are you?"
# Baseline: plain whitespace splitting keeps punctuation attached to the words
sentence.split()

In [6]:
# Tokenize the sentence with NLTK: punctuation marks come out as separate tokens
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

### Bag of words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: five short sentences with mixed casing and punctuation
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!!",
]

# Learn the vocabulary with the default tokenizer (fit returns the vectorizer
# itself), then encode the corpus as a sparse matrix of token counts
ctv = CountVectorizer().fit(corpus)
corpus_tfm = ctv.transform(corpus)
# print() lists the non-zero entries as "(row, column) count"
print(corpus_tfm)


  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [10]:
# Inspect the learned vocabulary: token -> column index in the count matrix
print(ctv.vocabulary_)

{'hello': 9, 'how': 11, 'are': 2, 'you': 22, 'im': 13, 'getting': 8, 'bored': 4, 'at': 3, 'home': 10, 'and': 1, 'what': 19, 'do': 7, 'think': 17, 'did': 6, 'know': 14, 'about': 0, 'counts': 5, 'let': 15, 'see': 16, 'if': 12, 'this': 18, 'works': 20, 'yes': 21}


### Bag of words with word tokenize

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

# Same toy corpus as in the plain bag-of-words example
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!!",
]

# Plug NLTK's word_tokenize into the vectorizer in place of its default
# tokenization; fit returns the vectorizer itself
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None).fit(corpus)
corpus_tfm = ctv.transform(corpus)
# The vocabulary now also holds punctuation tokens such as ',' and '?'
print(ctv.vocabulary_)

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


### TF-IDF vectorize

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

# Same toy corpus as in the bag-of-words examples
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!!",
]

# Fit a TF-IDF vectorizer that tokenizes with NLTK's word_tokenize
# (fit returns the vectorizer itself)
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None).fit(corpus)

# Encode the corpus; print() lists the non-zero entries as "(row, column) weight"
corpus_tfm = tfv.transform(corpus)
print(corpus_tfm)

  (0, 27)	0.2965698850220162
  (0, 16)	0.4428321995085722
  (0, 14)	0.4428321995085722
  (0, 7)	0.4428321995085722
  (0, 4)	0.35727423026525224
  (0, 2)	0.4428321995085722
  (1, 27)	0.35299699146792735
  (1, 24)	0.2635440111190765
  (1, 22)	0.2635440111190765
  (1, 18)	0.2635440111190765
  (1, 15)	0.2635440111190765
  (1, 13)	0.2635440111190765
  (1, 12)	0.2635440111190765
  (1, 9)	0.2635440111190765
  (1, 8)	0.2635440111190765
  (1, 6)	0.2635440111190765
  (1, 4)	0.42525129752567803
  (1, 3)	0.2635440111190765
  (2, 27)	0.31752680284846835
  (2, 19)	0.4741246485558491
  (2, 11)	0.4741246485558491
  (2, 10)	0.4741246485558491
  (2, 5)	0.4741246485558491
  (3, 25)	0.38775666010579296
  (3, 23)	0.38775666010579296
  (3, 21)	0.38775666010579296
  (3, 20)	0.38775666010579296
  (3, 17)	0.38775666010579296
  (3, 1)	0.38775666010579296
  (3, 0)	0.3128396318588854
  (4, 26)	0.2406120346077416
  (4, 0)	0.9706213725247981


In [3]:
# Token -> column index mapping learned by the TF-IDF vectorizer
tfv.vocabulary_

{'hello': 14,
 ',': 2,
 'how': 16,
 'are': 7,
 'you': 27,
 '?': 4,
 'im': 18,
 'getting': 13,
 'bored': 9,
 'at': 8,
 'home': 15,
 '.': 3,
 'and': 6,
 'what': 24,
 'do': 12,
 'think': 22,
 'did': 11,
 'know': 19,
 'about': 5,
 'counts': 10,
 'let': 20,
 "'s": 1,
 'see': 21,
 'if': 17,
 'this': 23,
 'works': 25,
 '!': 0,
 'yes': 26}

In [7]:
# Check whether the count vectorizer and the TF-IDF vectorizer learned the
# same token -> index vocabulary
ctv.vocabulary_ == tfv.vocabulary_

True

### N-grams: words in order

In [11]:
from nltk import ngrams
from nltk.tokenize import word_tokenize

# Window size: N = 2 produces pairs of consecutive tokens
N = 2
sentence = "hi, how are you?"
# Tokenize first so punctuation marks become tokens of their own
sentence_tkn = word_tokenize(sentence)
# Materialize the n-grams as a list of N-token tuples
n_grams = [*ngrams(sentence_tkn, N)]
print(n_grams)

[('hi', ','), (',', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?')]


### Stemming and lemmatization

In [16]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

lemzer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

words = ["fishing", "fishes", "fished"]

# Print each word with its stem and its lemma, followed by a blank line.
# NOTE(review): lemmatize() is called without a part-of-speech argument; the
# recorded output leaves "fishing" and "fished" unchanged, presumably because
# the words are treated as nouns by default. Confirm against the NLTK docs.
for word in words:
    print(
        f"Word={word}",
        f"Stem={stemmer.stem(word)}",
        f"Lemm={lemzer.lemmatize(word)}",
        "",
        sep="\n",
    )

Word=fishing
Stem=fish
Lemm=fishing

Word=fishes
Stem=fish
Lemm=fish

Word=fished
Stem=fish
Lemm=fished

[nltk_data] Downloading package wordnet to /home/danph/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Topic extraction

In [25]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the first 10,000 rows of the IMDB review file and clean the review text.
# NOTE: clean_text is defined in a later cell of this notebook (In [24]); on a
# fresh kernel that cell must be executed before this one.
reviews_df = pd.read_csv("../data/imdb.csv", nrows=10000)
# Keep the DataFrame and the text array under separate names instead of
# mutating the frame in place and rebinding `corpus` from DataFrame to ndarray
corpus = reviews_df["review"].apply(clean_text).values

In [26]:
# TF-IDF vectorize the reviews, tokenizing with word_tokenize
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_tfm = tfv.transform(corpus)
# Fit a truncated SVD with 10 components; random_state is pinned so that the
# decomposition is reproducible from run to run
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)
# fit() returns the fitted estimator, so corpus_svd is the SVD model itself
# (its components_ are read in the next cell), not a transformed corpus
corpus_svd = svd.fit(corpus_tfm)

In [27]:
# Map every vocabulary term to its weight in one SVD component
sample_index = 0  # row of components_ (i.e. which SVD component) to inspect
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it
get_names = getattr(tfv, "get_feature_names_out", None) or tfv.get_feature_names
feature_scores = dict(zip(get_names(), corpus_svd.components_[sample_index]))
# Sort the terms by score, highest first, and keep the top N
N = 10
print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', 'a', 'and', 'of', 'to', 'is', 'i', 'in', 'it', 'this']


In [24]:
import re, string

def clean_text(s):
    """Strip punctuation from a string and normalize its whitespace.

    :param s: input text (a string)
    :return: the text without any character from string.punctuation, with
        runs of whitespace collapsed to single spaces and no leading or
        trailing whitespace
    """
    # Remove all punctuation first: deleting a standalone mark such as " , "
    # or a trailing " !" would otherwise leave double or trailing spaces
    # behind an already-normalized string
    s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)
    # split() with no argument splits on any whitespace run and drops empties
    return " ".join(s.split())