In [0]:
import nltk

In [0]:
# Sample sentence containing a comma, a contraction and a question mark,
# used to compare how the tokenizers below split text.
text = "This is a simple sentence, isn't it?"

In [0]:
# Start with NLTK's whitespace tokenizer.
tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [8]:
# With this tokenizer punctuation stays glued to the neighbouring word
# (e.g. 'sentence,' and 'it?' in the recorded output).
tokenizer.tokenize(text)

['This', 'is', 'a', 'simple', 'sentence,', "isn't", 'it?']

In [0]:
# Rebind `tokenizer` to the Treebank word tokenizer; the cells that follow
# use this instance.
tokenizer = nltk.tokenize.TreebankWordTokenizer()

In [10]:
# Same sentence, Treebank tokenizer: punctuation becomes separate tokens and
# "isn't" is split into 'is' + "n't" (see the recorded output).
tokenizer.tokenize(text)

['This', 'is', 'a', 'simple', 'sentence', ',', 'is', "n't", 'it', '?']

In [0]:
# New sample (rebinds `text`) with inflected forms such as "walked",
# "walking" and "wolves" for the stemming and lemmatization cells.
text = "A wolf walked away from a pack of walking wolves"

In [0]:
# Tokenize once; the stemmer and lemmatizer loops below both iterate `tokens`.
tokens = tokenizer.tokenize(text)

In [0]:
# Porter stemmer instance used to reduce each token to its stem.
stemmer = nltk.stem.PorterStemmer()

In [14]:
# Print the Porter stem of every token, one per line.
for stem in map(stemmer.stem, tokens):
    print(stem)

A
wolf
walk
away
from
a
pack
of
walk
wolv


In [0]:
# WordNet-based lemmatizer; the 'wordnet' corpus is downloaded in the next
# cell, before the lemmatizer is first used.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [16]:
# Fetch the WordNet corpus into the local nltk_data directory.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
# Print the WordNet lemma of every token, one per line. No part-of-speech is
# passed to lemmatize(); in the recorded output only 'wolves' changes.
for lemma in map(lemmatizer.lemmatize, tokens):
    print(lemma)

A
wolf
walked
away
from
a
pack
of
walking
wolf


In [0]:
import spacy

In [0]:
# Load spaCy's small English pipeline by its full package name. The bare 'en'
# shortcut link was removed in spaCy 3 (spacy.load('en') raises OSError there),
# while the explicit name works on both spaCy 2 and 3 once the model package
# is installed.
spc = spacy.load('en_core_web_sm')

In [0]:
# Run the spaCy pipeline over the current `text` (the wolf sentence).
doc = spc(text)

In [21]:
# Show each token next to the lemma spaCy assigned to it.
for tok in doc:
    print(tok, tok.lemma_)

A a
wolf wolf
walked walk
away away
from from
a a
pack pack
of of
walking walk
wolves wolf


In [0]:
# Sample sentence (rebinds `text`) for the part-of-speech and displaCy cells.
text = "Sri Lankan Airlines will lose $50 million due to Covid-19"

In [0]:
# Re-run the pipeline: `doc` now holds the analysis of the new sentence.
doc = spc(text)

In [24]:
# Show each token with its part-of-speech tag (token.pos_).
for tok in doc:
    print(tok, tok.pos_)

Sri PROPN
Lankan PROPN
Airlines PROPN
will VERB
lose VERB
$ SYM
50 NUM
million NUM
due ADP
to ADP
Covid-19 NOUN


In [0]:
from spacy import displacy

In [26]:
# Render the parsed sentence inline with displaCy's default style.
displacy.render(doc, jupyter=True)

In [27]:
# Render the same doc again, this time with the 'ent' (entity) style.
displacy.render(doc, jupyter=True, style='ent')

In [0]:
# Toy corpus of five short reviews for the TF-IDF demonstration.
texts = [
    "good movie",
    "not a good movie",
    "did not like",
    "like it",
    "good one",
]

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
tfidf = TfidfVectorizer(ngram_range=(1,2))

In [0]:
import pandas as pd

In [0]:
# Fit the vectorizer on the toy corpus and transform it in one step.
features = tfidf.fit_transform(texts)

In [33]:
# Show the TF-IDF matrix as a labelled table (one row per text, one column per
# n-gram). get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older releases that do not have it yet.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# toarray() yields a plain ndarray; todense() returns the legacy np.matrix type.
pd.DataFrame(features.toarray(), columns=feature_names)

Unnamed: 0,did,did not,good,good movie,good one,it,like,like it,movie,not,not good,not like,one
0,0.0,0.0,0.506204,0.609818,0.0,0.0,0.0,0.0,0.609818,0.0,0.0,0.0,0.0
1,0.0,0.0,0.363135,0.437464,0.0,0.0,0.0,0.0,0.437464,0.437464,0.542226,0.0,0.0
2,0.48214,0.48214,0.0,0.0,0.0,0.0,0.388988,0.0,0.0,0.388988,0.0,0.48214,0.0
3,0.0,0.0,0.0,0.0,0.0,0.614189,0.495524,0.614189,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.427993,0.0,0.63907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.63907


In [0]:
# Load the IMDB reviews CSV.
# NOTE(review): hard-coded absolute path under /content — presumably a Colab
# upload location; adjust it when running elsewhere.
df = pd.read_csv('/content/IMDB Dataset.csv')

In [35]:
# Preview the first rows: a 'review' text column and a 'sentiment' label.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [36]:
# Class balance check; the recorded output shows 25,000 rows per label.
df['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [0]:
# Inputs are the raw review strings; targets are the sentiment labels.
X, y = df['review'], df['sentiment']

In [38]:
# Quick look at the input Series (review text).
X.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [39]:
# Quick look at the target Series (sentiment labels).
y.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 30% of the reviews for evaluation. The fixed random_state makes the
# shuffle — and therefore every score computed from this split — repeatable;
# without it each run of the notebook produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [42]:
# Peek at the training inputs; the index shows the rows were shuffled.
X_train.head()

35087    Obviously made to show famous 1950s stripper M...
8499     If this movie had been directed by a man, he w...
3254     Sergeant Ryker is accused of being a traitor d...
15350    To confess having fantasies about Brad Pitt is...
481      Things to Come is an early Sci-Fi film that sh...
Name: review, dtype: object

In [43]:
# Sanity check on the split size: 70% of 50,000 reviews -> 35,000 rows.
X_train.shape

(35000,)

In [0]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [0]:
# Two-stage pipeline: unigram+bigram TF-IDF features feeding a linear SVM.
text_classifier = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
        ("clf", LinearSVC()),
    ]
)

In [46]:
# Fit both pipeline stages on the training reviews and labels.
text_classifier.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [0]:
# Predict sentiment labels for the held-out reviews.
predictions = text_classifier.predict(X_test)

In [0]:
from sklearn.metrics import accuracy_score

In [49]:
# Accuracy of the predictions against the held-out labels.
accuracy_score(y_test, predictions)

0.9198

In [50]:
# Spot check on a hand-written positive sentence.
text_classifier.predict(["It was a good movie, with a nice story"])

array(['positive'], dtype=object)

In [51]:
# Spot check on a hand-written negative sentence that contains a negation.
text_classifier.predict(["It wasn't a good movie, with really bad editing"])

array(['negative'], dtype=object)

In [52]:
# Fetch the VADER lexicon into the local nltk_data directory.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [53]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [0]:
# VADER sentiment analyzer from NLTK; used below without any training step.
analyzer = SentimentIntensityAnalyzer()

In [55]:
# Score the same positive sentence given to the classifier above; the result
# is a dict with 'neg', 'neu', 'pos' and 'compound' entries.
analyzer.polarity_scores("It was a good movie, with a nice story")

{'compound': 0.6908, 'neg': 0.0, 'neu': 0.467, 'pos': 0.533}

In [56]:
# Score the same negative sentence given to the classifier above; the
# recorded 'compound' value is negative.
analyzer.polarity_scores("It wasn't a good movie, with really bad editing")

{'compound': -0.7351, 'neg': 0.508, 'neu': 0.492, 'pos': 0.0}