In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import spacy as sp
import numpy as np
from sklearn.svm import LinearSVC


In [2]:
nlp = sp.load("en_core_web_sm")

In [3]:
doc = nlp("have had having")

In [4]:
for token in doc:
    print(token.text,token.lemma_)

have have
had have
having have


In [5]:
#removing stopwords
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)
stopwords

['call',
 'above',
 'throughout',
 'meanwhile',
 'almost',
 'thereby',
 'ca',
 'whereas',
 'please',
 'latterly',
 'go',
 'ten',
 'all',
 'each',
 'others',
 'somehow',
 'twelve',
 'after',
 'has',
 'by',
 'we',
 'out',
 'always',
 "'m",
 'both',
 'empty',
 'first',
 'are',
 'five',
 'hundred',
 'done',
 'give',
 'off',
 'he',
 'via',
 'or',
 'is',
 'hereupon',
 'whence',
 'and',
 'their',
 'yourselves',
 'without',
 'noone',
 'already',
 "'ll",
 'no',
 'well',
 'how',
 'everything',
 'as',
 'thru',
 'using',
 'could',
 'there',
 'regarding',
 'four',
 'same',
 'had',
 'although',
 'than',
 'from',
 'unless',
 'they',
 'since',
 'became',
 'two',
 'bottom',
 'such',
 'eight',
 'hence',
 'nevertheless',
 'did',
 'will',
 'thereafter',
 'top',
 'beside',
 '‘ve',
 'anywhere',
 'whereby',
 'nothing',
 '‘d',
 'was',
 'whom',
 'sometimes',
 'his',
 'would',
 'within',
 '‘m',
 'anything',
 'might',
 'towards',
 'against',
 'yours',
 'must',
 'more',
 'been',
 '‘ll',
 'this',
 'ourselves',
 's

In [6]:
doc = nlp("Anas is using python for sentiment analysis to build an advanced nlp script.")

In [7]:
clean_tokens=[]
for token in doc:
    if token.text not in stopwords:
        clean_tokens.append(token.text)
clean_tokens

['Anas',
 'python',
 'sentiment',
 'analysis',
 'build',
 'advanced',
 'nlp',
 'script',
 '.']

In [8]:
#removing punctuation

doc=nlp("critique : a mind-blowing movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn't snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly .")

In [9]:
clean_token_from_punctuation = []
for token in doc:
    if token.pos_ !="PUNCT" and token.pos_ !="SYM":
        clean_token_from_punctuation.append(token.text)
clean_token_from_punctuation

['critique',
 'a',
 'mind',
 'blowing',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'and',
 'such',
 'lost',
 'highway',
 '&',
 'memento',
 'but',
 'there',
 'are',
 'good',
 'and',
 'bad',
 'ways',
 'of',
 'making',
 'all',
 'types',
 'of',
 'films',
 'and',
 'these',
 'folks',
 'just',
 'did',
 "n't",
 'snag',
 'this',
 'one',
 'correctly',
 'they',
 'seem',
 'to',
 'have',
 'taken',
 'this',
 'pretty',
 'neat',
 'concept',
 'but',
 'executed',
 'it',
 'terribly']

In [10]:
stripped_lowercase_tokens = []
for token in doc:
    stripped_lowercase_tokens.append(token.text.lower().strip())
stripped_lowercase_tokens

['critique',
 ':',
 'a',
 'mind',
 '-',
 'blowing',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 ',',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 '.',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 ',',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 ',',
 'mess',
 'with',
 'your',
 'head',
 'and',
 'such',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 ',',
 'but',
 'there',
 'are',
 'good',
 'and',
 'bad',
 'ways',
 'of',
 'making',
 'all',
 'types',
 'of',
 'films',
 ',',
 'and',
 'these',
 'folks',
 'just',
 'did',
 "n't",
 'snag',
 'this',
 'one',
 'correctly',
 '.',
 'they',
 'seem',
 'to',
 'have',
 'taken',
 'this',
 'pretty',
 'neat',
 'concept',
 ',',
 'but',
 'executed',
 'it',
 'terribly',
 '.']

In [11]:
def preprocessing(sentence):
    stopwords = list(STOP_WORDS)
    doc = nlp(sentence)
    clean_tokens = []
    for token in doc:
        if token.text not in stopwords and token.pos_ !='PUNCT' and token.pos_ !='SYM' and token.pos_ !='SPACE':
            clean_tokens.append(token.text)
    return clean_tokens

In [12]:
data_amazon = pd.read_csv('amazon_reviews.txt', sep='\t', header=None, names=['Review','Sentiment'])
data_imdb = pd.read_csv('imdb_reviews.txt', sep='\t', header=None, names=['Review','Sentiment'])


In [13]:
data_amazon.shape, data_imdb.shape

((1000, 2), (748, 2))

In [14]:
data = pd.concat([data_amazon, data_imdb], ignore_index=True)


In [15]:
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(tokenizer=preprocessing)
corpus = [
    'Anas is coding python',
    'ML/DL is fun',
    'This project is a good nlp practice',
]
result = countvec.fit_transform(corpus)
print(countvec.get_feature_names_out())
print(result.toarray())
#rows are sentences
#columns are words

['anas' 'coding' 'dl' 'fun' 'good' 'ml' 'nlp' 'practice' 'project'
 'python']
[[1 1 0 0 0 0 0 0 0 1]
 [0 0 1 1 0 1 0 0 0 0]
 [0 0 0 0 1 0 1 1 1 0]]




In [16]:
neg_cond = data.Sentiment == 0
pos_cond = data.Sentiment == 1

neg_df   = data[neg_cond]
pos_df   = data[pos_cond]

In [17]:
neg_result = countvec.fit_transform(neg_df.Review)
neg_vocabs = countvec.get_feature_names_out()

pos_result = countvec.fit_transform(pos_df.Review)
pos_vocabs = countvec.get_feature_names_out()

In [18]:
neg_result.shape, pos_result.shape


((862, 2434), (886, 2424))

In [19]:
neg_counts = np.sum(neg_result, axis = 0)
pos_counts = np.sum(pos_result, axis = 0)

In [20]:
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)


In [21]:
df.head(10)


Unnamed: 0,0
1,99
movie,95
0,92
phone,78
bad,78
film,72
time,42
work,39
like,38
good,35


In [22]:
tfidvec = TfidfVectorizer(tokenizer=preprocessing)

neg_result = tfidvec.fit_transform(neg_df.Review)
neg_vocabs = tfidvec.get_feature_names_out()
pos_result = tfidvec.fit_transform(pos_df.Review)
pos_vocabs = tfidvec.get_feature_names_out()

neg_counts = np.sum(neg_result, axis = 0)
pos_counts = np.sum(pos_result, axis = 0)

neg_count_df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)
pos_count_df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0, ascending=False)



In [23]:
pos_count_df.head(10)


Unnamed: 0,0
great,39.607633
good,28.479732
phone,28.415073
works,20.46774
film,18.895819
movie,18.782514
product,15.956729
excellent,15.417723
love,13.146654
price,12.785043


In [41]:
classifer = LinearSVC()
tfidvec   = TfidfVectorizer()

X = data["Review"]
y = data["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=333)
print(X_test.shape)

(525,)


In [50]:
clf = Pipeline([('tfidf', tfidvec), ('clf', classifer)])


In [52]:
clf.fit(X_train, y_train)


In [58]:
yhat = clf.predict(X_test)


In [60]:
print(classification_report(yhat, y_test))


              precision    recall  f1-score   support

           0       0.82      0.79      0.80       266
           1       0.79      0.82      0.81       259

    accuracy                           0.80       525
   macro avg       0.80      0.80      0.80       525
weighted avg       0.80      0.80      0.80       525



In [62]:
confusion_matrix(yhat, y_test)


array([[209,  57],
       [ 46, 213]], dtype=int64)

In [64]:
clf.predict(['A song of ice and fire is one of my favourite books'])

array([1], dtype=int64)

In [66]:
from sklearn.feature_extraction.text import TfidfTransformer

#imagine that we already have a frequency features.  We can perform normalization
#as a follow up
#here we got n=3, and m=2
counts = [[3, 0, 1],
          [2, 1, 0],
          [3, 2, 5]]
transformer = TfidfTransformer()
transformer.fit_transform(counts).toarray()

array([[0.91892665, 0.        , 0.39442846],
       [0.84080197, 0.54134281, 0.        ],
       [0.39706158, 0.34085938, 0.85214845]])