In [1]:
from mytools import *

import pandas as pd
import numpy as np
import re
import json 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression

# Sanity marker: this line is only reached when every import above succeeded.
print("All modules imported.")

All modules imported.


In [2]:
# Load the "train" dataset through load_csv (not defined in this notebook;
# presumably provided by the `from mytools import *` star import), then
# preview the first rows to check the columns.
traindf = load_csv("train")
traindf.head()

Unnamed: 0,movieid,reviewerName,isFrequentReviewer,reviewText,sentiment
0,marvelous_pirate,Benjamin Henry,False,Henry Selick’s first movie since 2009’s Corali...,POSITIVE
1,tony_montana_frodo_baggins_v_rocky_balboa,Felicia Lopez,False,With a cast that reads like the Vogue Oscar pa...,NEGATIVE
2,darth_vader_katniss_everdeen_sorcerer_donnie_d...,Mr. Charles Burgess,True,Creed II does not give us anything but another...,POSITIVE
3,lara_croft_glimmer,Ryan Barrett,False,"I know what you're thinking, but this is no Li...",POSITIVE
4,jason_bourne_surreal_the_terminator_indiana_jones,Alexander Glover,False,Director Fernando Meirelles tells the story wi...,POSITIVE


In [3]:
# Model input is the raw review text; the prediction target is the sentiment label.
train_features, train_labels = traindf["reviewText"], traindf["sentiment"]

# Both Series should report the same number of rows.
train_features.shape, train_labels.shape

((162758,), (162758,))

In [4]:
# Replace missing reviews with a single space so the text vectorizers never
# see NaN. The fill is written back to `traindf` explicitly (later cells pass
# `traindf` itself to make_vocab_file) instead of relying on `inplace=True`
# reaching the parent frame through the column Series: that propagation
# depends on the pandas version and no longer happens under Copy-on-Write.
# Re-running this cell is harmless.
traindf["reviewText"] = traindf["reviewText"].fillna(" ")
train_features = traindf["reviewText"]

In [5]:
def make_vocab_file(df, stopwords=None, max_features=100, min_df=1, max_df=1.0):
    """Build a TF-IDF vocabulary and save it as a JSON file under ``vocab/``.

    The vocabulary is produced by ``get_tfidf_vocab`` (not defined in this
    notebook), which receives ``stopwords`` as its ``stop_words`` argument
    together with ``max_features``, ``min_df`` and ``max_df``.

    The output file name encodes the configuration so that different runs do
    not overwrite each other:

    * custom stop-word collection (list/tuple/set) -> ``vocab/vocab_sw_all_<max_features>_<min_df>_<max_df>.json``
    * anything else (e.g. ``"english"`` or ``None``) -> ``vocab/vocab_<stopwords>_<max_features>_<min_df>_<max_df>.json``

    Parameters
    ----------
    df
        Data handed unchanged to ``get_tfidf_vocab``.
    stopwords : str, list or None
        Stop-word setting; also decides the file-name label (see above).
    max_features, min_df, max_df
        Forwarded to ``get_tfidf_vocab`` and embedded in the file name.

    Returns
    -------
    None
        The result is the file written to disk; ``"Job done!"`` is printed
        once the file has been written and closed.

    Raises
    ------
    FileNotFoundError
        If the ``vocab/`` directory does not exist (it is not created here).
    """
    vocab = get_tfidf_vocab(df, stop_words=stopwords, max_features=max_features, min_df=min_df, max_df=max_df)

    # The previous check, `type(stopwords) == "list"`, compared a type object
    # with a string and was never true, so every run was saved under the
    # "sw_all" name and runs with equal numeric settings overwrote each other.
    # A custom collection is labelled "sw_all" (its contents must not end up
    # in the file name); any other setting is labelled by its own value.
    if isinstance(stopwords, (list, tuple, set, frozenset)):
        label = "sw_all"
    else:
        label = stopwords
    filename = f"vocab/vocab_{label}_{max_features}_{min_df}_{max_df}.json"

    with open(filename, "w") as f:
        json.dump(vocab, f)
    # Report success only after the file has been flushed and closed.
    print("Job done!")

## Stop words  

In [6]:
# Load the custom stop-word list. The context manager closes the file handle
# as soon as the JSON is parsed; a bare open() inside json.load() left that
# to the garbage collector.
with open("stop_words.json", "r") as stop_words_file:
    stop_words_all = json.load(stop_words_file)
type(stop_words_all), len(stop_words_all)

(list, 1160)

## Create and check various vocab files  

In [7]:
# Run 1: stopwords="english", max_features=100, default min_df=1 and max_df=1.0.
make_vocab_file(traindf, stopwords="english", max_features=100)

Job done!


In [8]:
# Run 2: no stop-word setting (stopwords=None), max_features=100, default min_df=1 and max_df=1.0.
make_vocab_file(traindf, stopwords=None, max_features=100)

Job done!


In [9]:
# Run 3: stopwords="english" with max_df=0.3 instead of the default 1.0.
make_vocab_file(traindf, stopwords="english", max_features=100, max_df=0.3)

Job done!


In [10]:
# Run 4: the custom list loaded from stop_words.json (stop_words_all), with max_df=0.3.
make_vocab_file(traindf, stopwords=stop_words_all, max_features=100, max_df=0.3)



Job done!


## Checking corpus-specific stop words (`stop_words_`)  

In [11]:
# Peek at the review text that the vectorizer in the following cells is fitted on.
train_features.head()

0    Henry Selick’s first movie since 2009’s Corali...
1    With a cast that reads like the Vogue Oscar pa...
2    Creed II does not give us anything but another...
3    I know what you're thinking, but this is no Li...
4    Director Fernando Meirelles tells the story wi...
Name: reviewText, dtype: object

In [61]:
# Unigrams and bigrams; max_df=0.01 (a float, i.e. a proportion) ignores every
# term that occurs in more than 1% of the documents. After fitting, the
# ignored terms are available as tvec.stop_words_.
tvec = TfidfVectorizer(ngram_range=(1,2), max_df=0.01)
# Unused alternative: keep only tokens that contain at least one letter.
# The pattern has to be a raw string, otherwise "\b" is read as a backspace
# character instead of a word boundary:
# tvec_alpha = TfidfVectorizer(token_pattern=r'(?u)\b\w*[a-zA-Z]\w*\b')

In [62]:
# Learn the vocabulary and IDF weights from the review text; this also
# populates tvec.stop_words_ with the terms removed by the max_df cut-off.
tvec.fit(train_features)

TfidfVectorizer(max_df=0.01, ngram_range=(1, 2))

In [63]:
# Number of features kept by the fitted vectorizer. `vocabulary_` maps each
# term to its feature index, so its size equals the feature count. Unlike
# get_feature_names() (deprecated in scikit-learn 1.0, removed in 1.2) it is
# available on every version and does not build a list of all feature names.
len(tvec.vocabulary_)

1096298

In [65]:
# Terms the fitted vectorizer discarded. With min_df and max_features left at
# their defaults, these are the n-grams that exceeded max_df=0.01.
# NOTE(review): this displays the whole set; sorted(tvec.stop_words_)[:20]
# would give a shorter preview.
tvec.stop_words_

{'44',
 '46',
 '8217',
 'about',
 'about the',
 'action',
 'after',
 'all',
 'all the',
 'almost',
 'also',
 'an',
 'and',
 'and it',
 'and the',
 'another',
 'any',
 'are',
 'as',
 'as it',
 'as the',
 'at',
 'at the',
 'audience',
 'back',
 'bad',
 'be',
 'because',
 'been',
 'before',
 'being',
 'best',
 'better',
 'between',
 'big',
 'bit',
 'both',
 'but',
 'but it',
 'but the',
 'by',
 'by the',
 'can',
 'cast',
 'character',
 'characters',
 'come',
 'comedy',
 'comes',
 'could',
 'despite',
 'director',
 'do',
 'documentary',
 'does',
 'doesn',
 'don',
 'down',
 'drama',
 'emotional',
 'end',
 'enough',
 'entertaining',
 'even',
 'ever',
 'every',
 'family',
 'far',
 'feel',
 'feels',
 'few',
 'film',
 'film is',
 'film that',
 'films',
 'find',
 'first',
 'for',
 'for the',
 'from',
 'from the',
 'full',
 'full review',
 'fun',
 'funny',
 'genre',
 'get',
 'gets',
 'go',
 'going',
 'good',
 'great',
 'had',
 'hard',
 'has',
 'have',
 'have been',
 'he',
 'heart',
 'her',
 'here

In [None]:
# How many terms were discarded by the max_df cut-off.
len(tvec.stop_words_)

280