In [1]:
import os

import sys

import tarfile

import time

import pandas as pd

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfTransformer

import re

from nltk.stem.porter import PorterStemmer

import nltk

from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import HashingVectorizer

In [2]:
# Toy corpus of three short sentences used to illustrate the bag-of-words model.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary from the corpus and encode each sentence as a row of
# raw term counts.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [3]:
# Show the learned vocabulary: a dict mapping each token to an integer index.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [4]:
# Print the count matrix in its sparse form: (row, column) followed by the count.
print(bag)

  (0, 6)	1
  (0, 4)	1
  (0, 1)	1
  (0, 3)	1
  (1, 6)	1
  (1, 1)	1
  (1, 8)	1
  (1, 5)	1
  (2, 6)	2
  (2, 4)	1
  (2, 1)	3
  (2, 3)	1
  (2, 8)	1
  (2, 5)	1
  (2, 0)	2
  (2, 2)	2
  (2, 7)	1


In [5]:
# Convert to a dense array to view the full document-by-term count matrix.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [6]:
# Load the review dataset from a CSV file in the working directory and preview
# the first three rows.
# NOTE(review): the preview shows an extra "Unnamed: 0" column, which looks like
# a saved index that was read back as data — confirm, and consider
# `index_col=0` here or `index=False` where the file is written.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [7]:
# Peek at the last 50 characters of the review stored under index label 0.
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [8]:
def preprocessor(text):
    """Clean a raw review string for bag-of-words tokenization.

    Steps:
      1. Strip HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P``.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the ``-`` "nose" removed) to the end,
         separated from the text and from each other by a single space.

    Parameters
    ----------
    text : str
        Raw text, possibly containing markup and punctuation.

    Returns
    -------
    str
        The cleaned text followed by any emoticons that were found.
    """
    # Raw string literals keep the regex escapes (\), \(, \W) from being
    # treated as (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'\W+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Without an explicit separator the first emoticon would be glued onto the
    # last word whenever the cleaned text ends in a word character
    # (e.g. ":) great movie" -> " great movie:)").
    if not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + ' '.join(emoticons).replace('-', '')

In [9]:
# Sanity-check the preprocessor on the tail of the review stored under label 0.
preprocessor(df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [10]:
# Check a string that mixes markup, punctuation and emoticons.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [11]:
# Clean every review. NOTE: this overwrites the 'review' column, so the raw
# text can only be recovered by reloading the CSV.
df['review'] = df['review'].apply(preprocessor)

In [12]:
# Module-level Porter stemmer instance; tokenizer_porter reads it as a global.
porter = PorterStemmer()



def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [13]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Relies on the module-level ``porter`` stemmer instance.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [14]:
# Plain whitespace tokenization keeps every word form as-is.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [15]:
# Same sentence, but each token is reduced to its Porter stem.
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [16]:
# Fetch the NLTK 'stopwords' package that stopwords.words() reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# English stop-word list from NLTK.
stop = stopwords.words('english')

# Stem a sample sentence, keep (at most) its last ten tokens, and drop every
# token that appears in the stop-word list.
[token
 for token in tokenizer_porter('a runner likes running and runs a lot')[-10:]
 if token not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [18]:
# Display NumPy floats with two digits of precision in the outputs below.
np.set_printoptions(precision=2)

In [19]:
# tf-idf transformer with smoothed idf weights and L2-normalized rows.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

In [20]:
# Re-count the toy corpus, convert the counts to tf-idf weights, and print the
# result as a dense array (one row per document).
print(tfidf.fit_transform(count.fit_transform(docs))

      .toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [21]:
# Reproduce by hand the unnormalized tf-idf of the term "is" in the third
# document, using the smoothed idf: ln((1 + n_docs) / (1 + doc_freq)).
tf_is = 3        # "is" occurs three times in the third document
n_docs = 3       # number of documents in the toy corpus
doc_freq_is = 3  # number of documents that contain "is" (all of them)

idf_is = np.log((n_docs + 1) / (doc_freq_is + 1))

# The extra +1 keeps a term that occurs in every document (idf == 0) from
# being zeroed out entirely.
tfidf_is = tf_is * (idf_is + 1)

In [22]:
# Report the hand-computed value, formatted to two decimal places.
print('tf-idf of term "is" = %.2f' % tfidf_is)

tf-idf of term "is" = 3.00


In [23]:
# Same transformer settings but with normalization switched off (norm=None).
# NOTE: this rebinds `tfidf`, replacing the L2-normalizing instance.
tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)

In [24]:

# Unnormalized tf-idf vector of the last document in the toy corpus.
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]

In [25]:
# Display the raw vector; the entry at index 1 is the column of the term "is".
raw_tfidf

array([3.39, 3.  , 3.39, 1.29, 1.29, 1.29, 2.  , 1.69, 1.29])

In [26]:


# L2-normalize by hand: divide the vector by its Euclidean length.
l2_tfidf = raw_tfidf / np.sqrt((raw_tfidf ** 2).sum())

In [27]:
# Display the manually normalized vector; it matches the third row printed
# earlier by the transformer configured with norm='l2'.
l2_tfidf

array([0.5 , 0.45, 0.5 , 0.19, 0.19, 0.19, 0.3 , 0.25, 0.19])