In [23]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [2]:
train_set={"the sky sky is blue",'the sun is bright'}
# it is finding the top 3 feature
count_vec=CountVectorizer(max_features=3)
a=count_vec.fit_transform(train_set)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [3]:
# top 3 features
count_vec.get_feature_names()

['is', 'sky', 'the']

In [4]:
from nltk.corpus import movie_reviews
movie_reviews.categories()
# list of tuple for each doc (words, category)
doc=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        doc.append((movie_reviews.words(fileid),category))
# shuffling the document such that neg pos gets mixed and neg pos split becomes easy for us
import random
random.shuffle(doc)
doc[0:5]

[(['i', 'saw', 'this', 'film', 'on', 'christmas', 'day', ...], 'neg'),
 (['after', 'a', 'stylistic', 'detour', 'with', 'mrs', ...], 'pos'),
 (['if', 'the', 'current', 'trends', 'of', 'hollywood', ...], 'pos'),
 (['salaries', 'of', 'hollywood', 'top', 'actors', 'are', ...], 'neg'),
 (['the', 'cartoon', 'is', 'way', 'better', '.', 'that', ...], 'neg')]

# Data Cleaning

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
lemmatizer=WordNetLemmatizer()

In [6]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [7]:
from nltk.corpus import stopwords
import string
stops=set(stopwords.words('english'))
punctuations=list(string.punctuation)
stops.update(punctuations)
stops,string.punctuation

({'!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'about',
  'above',
  'after',
  'again',
  'against',
  'ain',
  'all',
  'am',
  'an',
  'and',
  'any',
  'are',
  'aren',
  "aren't",
  'as',
  'at',
  'be',
  'because',
  'been',
  'before',
  'being',
  'below',
  'between',
  'both',
  'but',
  'by',
  'can',
  'couldn',
  "couldn't",
  'd',
  'did',
  'didn',
  "didn't",
  'do',
  'does',
  'doesn',
  "doesn't",
  'doing',
  'don',
  "don't",
  'down',
  'during',
  'each',
  'few',
  'for',
  'from',
  'further',
  'had',
  'hadn',
  "hadn't",
  'has',
  'hasn',
  "hasn't",
  'have',
  'haven',
  "haven't",
  'having',
  'he',
  'her',
  'here',
  'hers',
  'herself',
  'him',
  'himself',
  'his',
  'how',
  'i',
  'if',
  'in',
  'into',
  'is',
  'isn',
  "isn't",
  'it',
  "it's",
  'its',
  'itself',
  'just',
  'll',


In [8]:
def clean_review(words):
    out=[]
    for w in words:
        if w.lower() not in stops:
            #passing without lowering it because lowering can change pos of the word
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            out.append(clean_word.lower())
    return out
            
            

In [9]:
doc=[(clean_review(document),category) for document,category in doc]

In [10]:
doc[0]

(['saw',
  'film',
  'christmas',
  'day',
  'expect',
  'upbeat',
  'comedy',
  'boy',
  'christmas',
  'dissapointment',
  'hour',
  'movie',
  'ready',
  'change',
  'room',
  'another',
  'theater',
  'read',
  'see',
  'say',
  'four',
  'room',
  'star',
  'tim',
  'roth',
  'jennifer',
  'beals',
  'antonio',
  'banderas',
  'quentin',
  'tarantino',
  'valeria',
  'golino',
  'madonna',
  'bruce',
  'willis',
  'marisa',
  'tomei',
  'alicia',
  'witt',
  'lili',
  'taylor',
  'ione',
  'skye',
  'possible',
  'star',
  'four',
  'room',
  'suppose',
  'one',
  'big',
  'hit',
  'year',
  'key',
  'word',
  'suppose',
  'four',
  'big',
  'director',
  'hollywood',
  'quentin',
  'tarantino',
  'robert',
  'rodriguez',
  'alexander',
  'rockwell',
  'alison',
  'anders',
  'direct',
  'one',
  'big',
  'film',
  'big',
  'popular',
  'cast',
  'guess',
  'much',
  'turn',
  'big',
  'flop',
  'year',
  'could',
  'great',
  'plot',
  'new',
  'year',
  'eve',
  'bellboy',
  'fi

In [11]:
categories=[category for document,category in doc ]
categories

['neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',


In [12]:
a=['he','is']
' '.join(a)

'he is'

In [13]:
text_doc=[" ".join(document) for document,cat in doc]

In [14]:
text_doc[0]

'saw film christmas day expect upbeat comedy boy christmas dissapointment hour movie ready change room another theater read see say four room star tim roth jennifer beals antonio banderas quentin tarantino valeria golino madonna bruce willis marisa tomei alicia witt lili taylor ione skye possible star four room suppose one big hit year key word suppose four big director hollywood quentin tarantino robert rodriguez alexander rockwell alison anders direct one big film big popular cast guess much turn big flop year could great plot new year eve bellboy first day job encounter many mysterious kinky hotel guest try handle problem tarantino told director plot write script turn write dark comedy anders write direct tale coven witch madonna valeria golino alicia witt ione skye bad one second room jennifer beals well lack plot room man accuses every man sleep wife third room antonio banderas best roomm two rambunctous kid trash hotel suite final one tarantino willis movie star want bellboy chop

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train,x_test,y_train,y_test=train_test_split(text_doc,categories)

In [25]:
count_vec=TfidfVectorizer(max_features=2000,ngram_range=(2,3),max_df=0.8,min_df=0.1)
x_train_features=count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0., 0., 1.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

In [26]:
# top 3 features
count_vec.get_feature_names()

['look like', 'special effect', 'year old']

In [27]:
x_test_features=count_vec.transform(x_test)
x_test_features.todense

<bound method spmatrix.todense of <500x3 sparse matrix of type '<class 'numpy.float64'>'
	with 175 stored elements in Compressed Sparse Row format>>

In [28]:
from sklearn.svm import SVC

In [29]:
svc=SVC()
svc.fit(x_train_features,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [30]:
svc.score(x_test_features,y_test)

0.522