In [2]:
def find_sentiment(sentence, pos, neg):
    """
    Classify a sentence by counting how many of its distinct tokens
    appear in a positive and in a negative word collection.

    The sentence is split on whitespace and de-duplicated; whichever
    collection shares more tokens with the sentence decides the label.

    :param sentence: sentence, a string
    :param pos: set of positive words
    :param neg: set of negative words
    :return: "positive", "negative" or "neutral" (returned on a tie)
    """
    # str.split() without arguments splits on any run of whitespace, so
    # punctuation stays attached: "a sentence!" -> ["a", "sentence!"].
    # Wrapping the result in a set means repeated words count only once.
    unique_tokens = set(sentence.split())

    # number of distinct tokens that occur in each word collection
    pos_hits = len(unique_tokens.intersection(pos))
    neg_hits = len(unique_tokens.intersection(neg))

    # a tie (including no hits on either side) is neutral
    if pos_hits == neg_hits:
        return "neutral"
    return "positive" if pos_hits > neg_hits else "negative"
    

In [1]:
from nltk.tokenize import word_tokenize
import nltk
# download NLTK's "punkt" package (the log below shows tokenizers/punkt.zip
# being unpacked); presumably this is the data word_tokenize needs — verify
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/guozhiqi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# sample text with punctuation attached to words, used to compare tokenizers
sentence = "hi, how are you?"

In [3]:
# plain whitespace split: punctuation stays glued to the words ('hi,', 'you?')
sentence.split()

['hi,', 'how', 'are', 'you?']

In [4]:
# NLTK tokenizer: punctuation marks come out as separate tokens
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# toy corpus: five short documents with mixed case and punctuation,
# one document per line
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!",
]

In [7]:
# initialize CountVectorizer with its default settings
# (no tokenizer or other options are passed here)
ctv = CountVectorizer()

In [8]:
# fit the vectorizer on corpus; the resulting token -> column mapping
# is what ctv.vocabulary_ shows later in the notebook
ctv.fit(corpus)

CountVectorizer()

In [9]:
# encode every document as a row of token counts using the fitted vectorizer
corpus_transformed = ctv.transform(corpus)

In [10]:
# show the sparse-matrix summary: one row per document,
# one column per vocabulary entry
corpus_transformed

<5x23 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [11]:
# print only the stored entries as "(row, column)  count"
print(corpus_transformed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [12]:
# mapping from each token to its column index in the count matrix
print(ctv.vocabulary_)

{'hello': 9, 'how': 11, 'are': 2, 'you': 22, 'im': 13, 'getting': 8, 'bored': 4, 'at': 3, 'home': 10, 'and': 1, 'what': 19, 'do': 7, 'think': 17, 'did': 6, 'know': 14, 'about': 0, 'counts': 5, 'let': 15, 'see': 16, 'if': 12, 'this': 18, 'works': 20, 'yes': 21}


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

In [14]:
# create a corpus of sentences
# (the same five documents as the earlier corpus cell, re-declared here
# so this section can be run without it)
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!",
]

In [15]:
# initialize CountVectorizer with word_tokenize from nltk
# as the tokenizer; token_pattern=None is passed alongside it
# (presumably to silence sklearn's "token_pattern is unused" warning — verify)
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

In [16]:
# fit the vectorizer on corpus, this time with word_tokenize
# producing the tokens
ctv.fit(corpus)

CountVectorizer(token_pattern=None,
                tokenizer=<function word_tokenize at 0x7fa8a7fcdf80>)

In [24]:
# encode the corpus with the word_tokenize-based vectorizer
corpus_transformed = ctv.transform(corpus)
# the vocabulary now also contains punctuation tokens such as ',' '?' '!' and "'s"
print(ctv.vocabulary_)

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


In [25]:
# show the sparse-matrix summary; there are more columns than in the
# default-tokenizer run because punctuation tokens are kept
corpus_transformed

<5x28 sparse matrix of type '<class 'numpy.int64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [26]:
from scipy.sparse import csr_matrix
import numpy as np

# Convert to dense matrix
# (every zero is materialized, which is fine for this tiny corpus)
dense_matrix = corpus_transformed.toarray()

# Print the dense matrix: one row per document, one column per token
print(dense_matrix)

[[0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 2 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 0 2]
 [0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0]
 [4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]


In [31]:
import nltk
# download NLTK's "wordnet" package; presumably this is the data
# WordNetLemmatizer needs — verify
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guozhiqi/nltk_data...


True

In [32]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# initialize stemmer
stemmer = SnowballStemmer("english")

# three inflected forms of "fish", used to contrast stemming with lemmatization
words = ["fishing", "fishes", "fished"]

for word in words:
    print(f"word={word}") 
    print(f"stemmed_word={stemmer.stem(word)}") 
    # lemmatize() is called without a part-of-speech argument; in the output
    # only "fishes" changes — presumably because the default treats every
    # word as a noun (verify against the NLTK docs)
    print(f"lemma={lemmatizer.lemmatize(word)}") 
    # blank line separating the report for each word
    print("")

word=fishing
stemmed_word=fish
lemma=fishing

word=fishes
stemmed_word=fish
lemma=fish

word=fished
stemmed_word=fish
lemma=fished



In [35]:
import io
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

def load_vectors(fname):
    """
    Load word vectors from a fastText-style text file into a dict.

    Adapted from: https://fasttext.cc/docs/en/english-vectors.html

    The first line of the file is a header made of two integers (in the
    fastText format: vocabulary size and vector dimension). Every
    following line is a token followed by space-separated float values.

    :param fname: path to the vector file
    :return: dict mapping each token (str) to its vector (list of floats)
    :raises ValueError: if the header is not exactly two integers or a
        vector component cannot be parsed as a float
    """
    # the context manager guarantees the file handle is closed, even when
    # parsing raises part-way through (the handle used to be left open)
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # consume and validate the header line; the two values themselves
        # are not needed afterwards
        _n_words, _dim = map(int, fin.readline().split())
        data = {}
        for line in fin:
            # strip the trailing newline, then split on single spaces:
            # first field is the token, the rest are vector components
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [36]:
# load the vectors from a path relative to the notebook; load_vectors parses
# the whole file into an in-memory dict
# NOTE(review): likely slow and memory-hungry for a file this size — confirm
embeddings = load_vectors("../input/crawl-300d-2M.vec")

In [38]:
# confirm the loader returned a plain dict
type(embeddings)

dict

In [40]:
# number of components in a single word vector
len(embeddings['happy'])

300

In [41]:
# peek at the first 20 components of the vector for 'happy'
embeddings['happy'][:20]

[0.1526,
 -0.5961,
 0.04,
 0.4482,
 -0.0285,
 0.0128,
 -0.0705,
 -0.1424,
 0.3894,
 -0.0739,
 0.0872,
 0.0228,
 -0.2588,
 0.0661,
 -0.1623,
 0.0924,
 0.0574,
 -0.0593,
 -0.1985,
 0.0283]