In [1]:
import numpy as np
import pandas as pd


# getting the data

We will use UCI label dataset

Dataset is available here: https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences

In [2]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

# https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip'
response = urlopen(url)
zipfile = ZipFile(BytesIO(response.read()))
file_names = zipfile.namelist()
# file_names

# files to read
files_to_read = []
for s in file_names:
    if '.txt' in s: 
        if '__MACOSX' not in s: 
            if 'readme.txt' not in s:
                files_to_read.append(s)
files_to_read
    

source_lists = ['amazon', 'imdb', 'yelp']

for i in range(len(files_to_read)):
    exec(f'df_{i} = pd.read_csv(zipfile.open(files_to_read[{i}]), delimiter="\t", names=["Text","Sentiment"])')
    for j in range(len(source_lists)):
        if source_lists[j] in files_to_read[i]:
            exec(f'df_{i}["Source"] = source_lists[{i}]')  


# merging all the datasets from amazon, imdb, yelp
df_all = pd.concat([df_0, df_1, df_2])
# reseting the index numbers
df_all.reset_index(drop=True, inplace=True)

df_all

Unnamed: 0,Text,Sentiment,Source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon
...,...,...,...
2743,I think food should have flavor and texture an...,0,yelp
2744,Appetite instantly gone.,0,yelp
2745,Overall I was not impressed and would not go b...,0,yelp
2746,"The whole experience was underwhelming, and I ...",0,yelp


# nltk and spacy for nlp

We will use nltk and spacy libraries in this notebook

If you don't have nltk and spacy already installed please install them first.


For spacy

pip install -U spacy

python -m spacy download en

https://spacy.io/usage/linguistic-features

In [3]:
# nltk
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re

import spacy


In [4]:
# downloading the packages
# nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
# Spacy
# First you need to load the library
nlp_spacy = spacy.load('en_core_web_sm')



In [6]:
# some attributes for spacy library
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp_spacy(text)

for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

print("\n\nNamed Entities \n")

for token in doc.ents:
        print(token.text, token.start_char, token.end_char, token.label_)



Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


Named Entities 

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


Stemming a document
You can write your own function that can stem documents. Here is one way to stem a document using Python filing:

Take a document as the input.
1. Read the document line by line
2. Tokenize the line
3. Stem the words
4. Output the stemmed words (print on screen or write to a file)
5. Repeat step 2 to step 5 until it is to the end of the document.

source: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

# Stopwords

In [None]:
# This is nltk stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Stemming

Please see further about Porter Stemming here (https://tartarus.org/martin/PorterStemmer/)

for NLTK
Stemmers example (https://www.nltk.org/howto/stem.html)

Stemmer is not available on Spacy



In [7]:
# nltk
nltk_porter_stem = PorterStemmer()

In [8]:
nltk_stop_words = stopwords.words('english')

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

# creating a function for stemming
def nltk_stemming(text):
    # First tokenize the word. 
    # token_words = text.split()
    
    # simple method is to tonkenize the text
    # token_words = word_tokenize(text)

    # another method is to tokenize and remove punctuation
    token_words = tokenizer.tokenize(text)

    # Creating an empty list for text after porter stemmer
    stem_sentence = []

    # Iterating each word using Porter stemmer
    for word in token_words:
        # removing stopwords
        if word in nltk_stop_words:
            continue
        else:
            # Removing punctuation (, and -)
            # word = re.sub(r"[,-]",'',word)
            stem_sentence.append(nltk_porter_stem.stem(word))
    # Join the word
    return " ".join(stem_sentence)


In [9]:
df_all['Text_nltk_stemming'] = df_all['Text'].apply(nltk_stemming)

df_all

Unnamed: 0,Text,Sentiment,Source,Text_nltk_stemming
0,So there is no way for me to plug it in here i...,0,amazon,so way plug us unless i go convert
1,"Good case, Excellent value.",1,amazon,good case excel valu
2,Great for the jawbone.,1,amazon,great jawbon
3,Tied to charger for conversations lasting more...,0,amazon,tie charger convers last 45 minut major problem
4,The mic is great.,1,amazon,the mic great
...,...,...,...,...
2743,I think food should have flavor and texture an...,0,yelp,i think food flavor textur lack
2744,Appetite instantly gone.,0,yelp,appetit instantli gone
2745,Overall I was not impressed and would not go b...,0,yelp,overal i impress would go back
2746,"The whole experience was underwhelming, and I ...",0,yelp,the whole experi underwhelm i think go ninja s...


# Lemmatization

In [10]:
# Using Spacy
nlp_spacy = spacy.load('en_core_web_sm')

def spacy_lemma(text):
    doc = nlp_spacy(text)
    tokens = []
    for word in doc:
        if word.is_stop or word.is_punct or word.is_digit:
            continue
        else:
            tokens.append(word.lemma_.lower())
    return ' '.join(tokens)

df_all['Text_spacy_lemma'] = df_all['Text'].apply(spacy_lemma)
df_all


Unnamed: 0,Text,Sentiment,Source,Text_nltk_stemming,Text_spacy_lemma
0,So there is no way for me to plug it in here i...,0,amazon,so way plug us unless i go convert,way plug converter
1,"Good case, Excellent value.",1,amazon,good case excel valu,good case excellent value
2,Great for the jawbone.,1,amazon,great jawbon,great jawbone
3,Tied to charger for conversations lasting more...,0,amazon,tie charger convers last 45 minut major problem,tie charger conversation last minute major pro...
4,The mic is great.,1,amazon,the mic great,mic great
...,...,...,...,...,...
2743,I think food should have flavor and texture an...,0,yelp,i think food flavor textur lack,think food flavor texture lack
2744,Appetite instantly gone.,0,yelp,appetit instantli gone,appetite instantly go
2745,Overall I was not impressed and would not go b...,0,yelp,overal i impress would go back,overall impressed
2746,"The whole experience was underwhelming, and I ...",0,yelp,the whole experi underwhelm i think go ninja s...,experience underwhelming think ninja sushi time


# CountVectorizer

In [312]:
# Example
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
X = X.toarray()
X_CountVectorizer = pd.DataFrame(X, columns=feature_names)
X_CountVectorizer

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_stemming = vectorizer.fit_transform(df_all['Text_nltk_stemming'])
print(vectorizer.get_feature_names()[-20:])
X_stemming = X_stemming.toarray()

# for two words
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2_stemming = vectorizer2.fit_transform(df_all['Text_nltk_stemming'])
print(vectorizer2.get_feature_names()[-20:])
X2_stemming = X2_stemming.toarray()




['yellowtail', 'yelp', 'yelper', 'yet', 'you', 'young', 'younger', 'your', 'youth', 'youtub', 'yucki', 'yukon', 'yum', 'yummi', 'yun', 'z500a', 'zero', 'zillion', 'zombi', 'zombiez']
['young play', 'younger set', 'your brain', 'your server', 'your staff', 'youth energi', 'yukon gold', 'yum sauc', 'yum yum', 'yummi christma', 'yummi tri', 'yummi tummi', 'yun fat', 'z500a pretti', 'zero star', 'zero tast', 'zillion time', 'zombi movi', 'zombi student', 'zombiez part']


In [12]:
pd.DataFrame(X_stemming)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4009,4010,4011,4012,4013,4014,4015,4016,4017,4018
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2743,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Tfidf Transformer


https://betterprogramming.pub/a-friendly-guide-to-nlp-tf-idf-with-python-example-5fcb26286a33



In [333]:
# Example
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?',
]


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
X = X.toarray()
X_TfidfVectorizer = pd.DataFrame(X, columns=feature_names)
X_TfidfVectorizer

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [334]:
X_CountVectorizer

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_stemming_tfidf = vectorizer.fit_transform(df_all['Text_nltk_stemming'])
print(vectorizer.get_feature_names()[-20:])
X_stemming_tfidf = X_stemming_tfidf.toarray()
X_stemming_tfidf

# for two words
vectorizer2 = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
X2_stemming_tfidf = vectorizer2.fit_transform(df_all['Text_nltk_stemming'])
print(vectorizer2.get_feature_names()[-20:])
X2_stemming_tfidf = X2_stemming_tfidf.toarray()




['yellowtail', 'yelp', 'yelper', 'yet', 'you', 'young', 'younger', 'your', 'youth', 'youtub', 'yucki', 'yukon', 'yum', 'yummi', 'yun', 'z500a', 'zero', 'zillion', 'zombi', 'zombiez']
['young play', 'younger set', 'your brain', 'your server', 'your staff', 'youth energi', 'yukon gold', 'yum sauc', 'yum yum', 'yummi christma', 'yummi tri', 'yummi tummi', 'yun fat', 'z500a pretti', 'zero star', 'zero tast', 'zillion time', 'zombi movi', 'zombi student', 'zombiez part']


In [14]:
# now we have a transformed data to train the model
X_stemming
X2_stemming

X_stemming_tfidf
X2_stemming_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Modelling

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
y = df_all['Sentiment']

In [17]:
clf = LogisticRegression()
clf.fit(X_stemming,y)
clf.score(X_stemming,y)

0.9596069868995634