# Sentiment analysis using SpaCy

## 0. Text processing using SpaCy

### 0.1 Lemmatization

Lemmatization turns a word into its base (dictionary) form.  It is a very common preprocessing step, because you
do not want to confuse your model into treating "run" and "running" as different words.

Note:  If you use a very powerful neural network such as a Transformer, there is NO NEED for lemmatization.

In [None]:
import spacy

# Load spaCy's small English pipeline; `nlp` is reused by the cells below.
nlp = spacy.load("en_core_web_sm")
doc = nlp("run ran running")

# Print each token's text next to its lemma.
for token in doc:
    print(token.text, token.lemma_)
    
# To avoid confusing the model, convert words to their lemma.
# Very powerful neural networks such as Transformers (Hugging Face) do NOT need
# lemmatization, because they understand the different word forms.

### 0.2 Stop words

A common preprocessing step is to remove stop words, e.g., at, in, on, etc.  Removing them helps the model focus only on the keywords.

Note: With powerful networks, we DON'T remove stop words.

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

# Copy the stop words into a list (sliceable) and preview the first five;
# `stopwords` is reused by the filtering cells below.
stopwords = list(STOP_WORDS)
print(stopwords[:5])

In [None]:
# Let's demonstrate how to remove stop words, using this example sentence.
doc = nlp("Chaky is going to eat at Thammasat with his best friend Peter.")

In [None]:
# Keep the text of every token that is not in the stop word list.
clean_tokens = [tok.text for tok in doc if tok.text not in stopwords]

clean_tokens

In [None]:
doc = nlp("The movie should have been good.")

# Keep the text of every token that is not in the stop word list.
clean_tokens = [tok.text for tok in doc if tok.text not in stopwords]

clean_tokens  # not good

### 0.3 Removing punctuation

In [None]:
# Removing punctuation: example sentence containing punctuation, symbols and extra spaces.
doc = nlp("Chaky, the teacher $  /   @ # at AIT,!!!???? likes to eat naan.")

In [None]:
# Leverage the part-of-speech tags (uncomment the loop below to inspect them):
# for token in doc:
#     print(token.text, token.pos_)

In [None]:
# Keep the text of tokens whose POS tag is not punctuation, whitespace or symbol.
token_no_punct = [
    tok.text for tok in doc if tok.pos_ not in ('PUNCT', 'SPACE', 'SYM')
]

In [None]:
# Display the tokens that survived the POS-based filter.
token_no_punct

### 0.4 Lowercasing and unnecessary spaces

In [None]:
# Lowercase each token's text and trim surrounding whitespace.
stripped_lowercase_tokens = [tok.text.lower().strip() for tok in doc]

stripped_lowercase_tokens

### 0.5 Combine everything

In [None]:
# Nowadays we rarely preprocess text, especially for big models, because preprocessing loses a lot of information.
# If there is anything worth cleaning, it is extra spaces or duplicated symbols.

# If you use classical ML models (e.g., SVM, KNN, RF), you do need to preprocess.
def preprocessing(sentence):
    """Tokenize *sentence* with spaCy and drop stop words and non-word tokens.

    Args:
        sentence: Raw text to run through the module-level ``nlp`` pipeline.

    Returns:
        A list with the original text of every token that is not a stop word
        and whose part-of-speech tag is not ``PUNCT``, ``SPACE`` or ``SYM``.
    """
    # POS tags of tokens that are dropped regardless of their text.
    skipped_pos = ('PUNCT', 'SPACE', 'SYM')

    # Membership is tested against STOP_WORDS directly instead of copying it
    # into a list on every call and scanning that list once per token.
    # The token text is lowercased for the test so capitalised stop words
    # (e.g. a sentence-initial "The") are removed as well; spaCy's English
    # stop words are stored in lowercase. The returned text keeps its casing.
    return [
        token.text
        for token in nlp(sentence)
        if token.text.lower() not in STOP_WORDS and token.pos_ not in skipped_pos
    ]

## 1. Let's do sentiment analysis with the help of sklearn and spaCy!

In [None]:
#import stuff
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### 1.1 Load data

In [None]:
import csv

# Options shared by the three tab-separated, header-less review files.
# quoting=csv.QUOTE_NONE makes pandas treat double quotes as ordinary
# characters. With the default quoting, a review that contains an unmatched '"'
# opens a quoted field that can swallow the following lines, silently merging
# rows. NOTE(review): compare the resulting shapes with the files' line counts.
read_options = dict(
    sep='\t',
    header=None,
    names=['Review', 'Sentiment'],
    quoting=csv.QUOTE_NONE,
)

data_yelp   = pd.read_csv('../data/yelp_labelled.txt', **read_options)
data_amazon = pd.read_csv('../data/amazon_labelled.txt', **read_options)
data_imdb   = pd.read_csv('../data/imdb_labelled.txt', **read_options)

In [None]:
# Preview the first rows of the Yelp reviews.
data_yelp.head()

In [None]:
# (rows, columns) of each source data frame, in yelp / amazon / imdb order.
tuple(frame.shape for frame in (data_yelp, data_amazon, data_imdb))

### 1.2 EDA

Check the mean and std; check any null values

In [None]:
# Stack the three sources into one frame; ignore_index=True renumbers the rows
# so the index does not repeat across sources.
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)
data.shape

In [None]:
# Number of reviews per sentiment label (class balance).
data['Sentiment'].value_counts()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Count the frequency of words in positive and negative samples
# using CountVectorizer.

from sklearn.feature_extraction.text import CountVectorizer

# Tokenize with the custom preprocessing() function defined earlier.
countvec = CountVectorizer(tokenizer = preprocessing)

# Let's try it on a small toy corpus first.
corpus = [
    'Chaky is coding python     ',
    'Deep learning is very deep',
    'Are you sure about this?????',
    'please hashtag #ilovepython'
]
result   = countvec.fit_transform(corpus)

# List of tokens (the fitted feature names).
print(countvec.get_feature_names_out())

# Count matrix:
# rows are sentences,
# columns are the tokens printed above.
print(result.toarray())

In [None]:
# Let's look at the top words, categorized by positive and negative sentiment.
import numpy as np

# Boolean masks over the Sentiment column: 0 is used for the negative subset,
# 1 for the positive subset.
neg_cond = data.Sentiment == 0
pos_cond = data.Sentiment == 1

# Reviews of each class.
neg_df   = data[neg_cond]
pos_df   = data[pos_cond]

In [None]:
# Count tokens per class.
# The same vectorizer is refitted for each class, so the feature names are
# read right after each fit, before the next fit_transform replaces them.
neg_result = countvec.fit_transform(neg_df.Review)
neg_vocabs = countvec.get_feature_names_out()

pos_result = countvec.fit_transform(pos_df.Review)
pos_vocabs = countvec.get_feature_names_out()

In [None]:
# (number of negative reviews, number of distinct tokens).
neg_result.shape

In [None]:
# Total occurrences of each token: sum down the rows (documents) of each matrix.
neg_counts = neg_result.sum(axis=0)
pos_counts = pos_result.sum(axis=0)

In [None]:
# Data frame of negative-class totals: one column per token, transposed so
# each token becomes a row, then sorted by its total (column 0), highest first.
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)

In [None]:
# Ten most frequent tokens in the negative reviews.
df.head(10)

In [None]:
# Ten most frequent tokens in the positive reviews (reuses the name `df`).
df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0, ascending=False)
df.head(10)

In [None]:
# Usually, in NLP, we don't use CountVectorizer,
# because it makes very frequent words a prominent feature, which we don't want.
# We want something like a normalized CountVectorizer ==> TfidfVectorizer.

In [None]:
# TF-IDF vectorizer using the custom preprocessing() tokenizer.
tfidvec = TfidfVectorizer(tokenizer=preprocessing)

# TF-IDF weights per class (not raw counts).
# The same vectorizer is refitted for each class, so the feature names are
# read right after each fit.
neg_result   = tfidvec.fit_transform(neg_df.Review)
neg_vocabs   = tfidvec.get_feature_names_out()
pos_result   = tfidvec.fit_transform(pos_df.Review)
pos_vocabs   = tfidvec.get_feature_names_out()

# Sum each term's weight across all documents of the class.
neg_counts = np.sum(neg_result, axis=0)
pos_counts = np.sum(pos_result, axis=0)

# Sanity check: the summed weights and the feature names of each class are
# printed side by side so their sizes can be compared.
print(neg_counts.shape, pos_counts.shape)
print(neg_vocabs.shape, pos_vocabs.shape)

In [None]:
# Top ten negative terms, ranked by summed TF-IDF weight.
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)
df.head(10)

In [None]:
# Top ten positive terms, ranked by summed TF-IDF weight.
df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0, ascending=False)
df.head(10)