# Sentiment Analysis using SpaCy

## 0. Text processing using SpaCy

### 0.1 Lemmatization

It turns your word to its original form. Very common thing you wanna to do, because YouTubeVideo
do not want to confuse ypur model that run and running are different.

Note: But if you use very powerful neural network like transformer, NO need Lemmatization

In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("run ran running")

for token in doc:
    print(token.text, token.lemma_)

#to NOT confuse the model, you want to convert words to thier lemma
#for very powetful neural network like Transformer (huggingface) ,NO NEED TO LEMMATIZATION, because they understand

run run
ran run
running run


### 0.2 Stop words

Common preprocessing is to remove stopwords, e.g., at, in, on, etc. Removing the helps model memorize only the keywords.

Note : In powerful network, we DON'T remove stop words

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = list(STOP_WORDS)
print(stopwords[:5])

['would', 'before', "n't", 'go', 'anyway']


In [3]:
#let's demonstrate how to remove stopword
doc = nlp("Chaky is going to eat at Thammasat with his best friend Peter")

In [4]:
clean_tokens = []

for token in doc:
    if token.text not in stopwords:
            clean_tokens.append(token.text)
            
clean_tokens

['Chaky', 'going', 'eat', 'Thammasat', 'best', 'friend', 'Peter']

In [5]:
doc = nlp("There movie should have been good.")
clean_tokens = []

for token in doc:
    if token.text not in stopwords:
            clean_tokens.append(token.text)
            
clean_tokens

['There', 'movie', 'good', '.']

### 0.3 Removing punct

In [6]:
#remove punctuation
doc = nlp("Chaky, the teacher $ / @ # at AIT,!!!????? like to eat naan.")

In [7]:
#leverage pos tag
# for token in doc:
#     print(token.text, token.pos_)

In [8]:
token_no_punct = []

for token in doc:
    if token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and token.pos_ != 'SYM':
        token_no_punct.append(token.text)

token_no_punct

['Chaky', 'the', 'teacher', '@', '#', 'at', 'AIT', 'like', 'to', 'eat', 'naan']

### 0.4 Lowercasing and unnesscary spaces

In [9]:
stripped_lowercase_tokens = []

for token in doc:
    stripped_lowercase_tokens.append(token.text.lower())
stripped_lowercase_tokens


['chaky',
 ',',
 'the',
 'teacher',
 '$',
 '/',
 '@',
 '#',
 'at',
 'ait',
 ',',
 '!',
 '!',
 '!',
 '?',
 '?',
 '?',
 '?',
 '?',
 'like',
 'to',
 'eat',
 'naan',
 '.']

### 0.5 Combine everything 

In [10]:
#nowadays, we don't preprocess anympre, especially for big model, because you lose a lot of information 
#if fthere is something you can clean, is extra spaces or like duplicate symbols......

#if you use ML, e.g., SVM, KNN, EF, you need to preprocess
def preprocessing(sentence):

    stopwords = list(STOP_WORDS)

    doc = nlp(sentence)

    cleaned_tokens = []

    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and \
            token.pos_ != 'SYM':
                cleaned_tokens.append(token.text)
    
    return cleaned_tokens

## 1. Let's do sentiment analysis with the help sklearn and spacy!!!

In [22]:
#import stuff
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### 1.1 load data

In [12]:
data_yelp   = pd.read_csv('../data/yelp_labelled.txt', sep='\t', header = None, names = ['Review', 'Sentiment'])
data_amazon = pd.read_csv('../data/amazon_labelled.txt', sep='\t', header = None, names = ['Review', 'Sentiment'])
data_imdb   = pd.read_csv('../data/imdb_labelled.txt', sep='\t', header = None, names = ['Review', 'Sentiment'])

In [13]:
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [14]:
data_yelp.shape, data_amazon.shape, data_imdb.shape

((1000, 2), (1000, 2), (748, 2))

### 1.2 EDA

Check the mean and std; check any null values

In [15]:
data = pd.concat([data_yelp,data_amazon,data_imdb],ignore_index=True)
data.shape

(2748, 2)

In [16]:
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [17]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [18]:
#count the frequency of words in postive and negative samples
#CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#preprocessing refers to the function we wrote earlier
    #the input should be a bunch of text
    #the output should return tokens
countvec = CountVectorizer(tokenizer = preprocessing)

#let's try
corpus = [
    'Chaky is coding python     ',
    'Deep learning is very deep',
    'Are you sure about this?????',
    'please hashtag #ilovepython'
]
result   = countvec.fit_transform(corpus)

#list of tokens
print(countvec.get_feature_names_out())

#count
#rows are sentences
#columns are
print(result.toarray())

['chaky' 'coding' 'deep' 'hashtag' 'ilovepython' 'learning' 'python'
 'sure']
[[1 1 0 0 0 0 1 0]
 [0 0 2 0 0 1 0 0]
 [0 0 0 0 0 0 0 1]
 [0 0 0 1 1 0 0 0]]


In [19]:
#let's look at top words catgprozed by postive and negative
import numpy as np

neg_cond = data.Sentiment == 0
pos_cond = data.Sentiment == 1

neg_df = data[neg_cond]
pos_df = data[pos_cond]

In [25]:
#count 
neg_result = countvec.fit_transform(neg_df.Review)
neg_vocabs = countvec.get_feature_names_out()

pos_result = countvec.fit_transform(pos_df.Review)
pos_vocabs = countvec.get_feature_names_out()

#sum the counts
neg_counts = np.sum(neg_result, axis = 0)
pos_counts = np.sum(pos_result, axis = 0)

In [27]:
#data frame
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0 ,ascending =False)
df.head()

Unnamed: 0,0
1,103
bad,96
movie,95
0,92
phone,78


In [28]:
df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0 ,ascending =False)
df.head()

Unnamed: 0,0
great,192
good,171
film,91
movie,87
phone,87


### TfidfTransformer
- usually, in NLP, we don't use counvectorizer
- becausde it makes very frequent words a prominent feature, which we don't want to 
- we want something like normalized(countervectorizer) ==> tfidvectorizer

In [29]:
tfidvec = TfidfVectorizer(tokenizer=preprocessing)

#count
neg_result   = tfidvec.fit_transform(neg_df.Review)
neg_vocabs   = tfidvec.get_feature_names_out()
pos_result   = tfidvec.fit_transform(pos_df.Review)
pos_vocabs   = tfidvec.get_feature_names_out()

#sum words across all documents
neg_counts = np.sum(neg_result, axis=0)
pos_counts = np.sum(pos_result, axis=0)

print(neg_counts.shape, pos_counts.shape)
print(neg_vocabs.shape, pos_vocabs.shape)

(1, 3155) (1, 3116)
(3155,) (3116,)


## 2. Modeling and training

Use sklearn

In [44]:
from sklearn.svm import LinearSVC #here i am using machine learning, NOT deep learning
#define model
classifier  = LinearSVC()
tfidvec     = TfidfVectorizer()


#define X and y

X = data.Review
y = data.Sentiment

#split data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1923,), (825,), (1923,), (825,))

In [45]:
#mkae pipeline
clf = Pipeline([('tfidvec',tfidvec),('clf',classifier)])

In [46]:
#pipiline is the same as:

# X_trian_transformed = tfidvec.fit_transform(X_train)
# X_trian_transformed.shape #(words, features)

# classifier.fit(X_trian_transformed,y_train)

LinearSVC()

In [47]:
#train
clf.fit(X_train,y_train)
#predict
yhat = clf.predict(X_test)
#metric
print(classification_report(yhat,y_test))
#confusin matrix

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       426
           1       0.83      0.81      0.82       399

    accuracy                           0.82       825
   macro avg       0.82      0.82      0.82       825
weighted avg       0.82      0.82      0.82       825



In [48]:
#confusin matrix
confusion_matrix(yhat,y_test)

array([[358,  68],
       [ 77, 322]], dtype=int64)

## 3.Real-World

In [49]:
clf.predict(['Chaky loves to eat sushi'])

array([0], dtype=int64)

In [50]:
clf.predict(['This movie is good'])

array([1], dtype=int64)

In [51]:
clf.predict(['This movie should have been good'])
#double negative is a very good test !!!!
#remember the sentiment tree bank

array([1], dtype=int64)

In [52]:
clf.predict(['This movie is crazily amazing'])

array([1], dtype=int64)

In [53]:
clf.predict(['This bad movie is good'])

array([0], dtype=int64)