### Imports

In [1]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import keras
import sklearn
from bs4 import BeautifulSoup
import re #for regular expressions
import string,unicodedata


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Data Pre-Processing

#### reading the data

In [2]:
imdb = pd.read_csv('IMDB_Dataset.csv')
imdb.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


#### train-test split

In [3]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(imdb.review, imdb.sentiment, test_size=0.33, random_state=42)

In [5]:
print 'train data size',X_train.shape
print 'test data size', X_test.shape

train data size (33500,)
test data size (16500,)


In [6]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train[y_train == 'positive'] = 1
y_train[y_train == 'negative'] = 0
y_test[y_test == 'positive'] = 1
y_test[y_test == 'negative'] = 0
y_train = np.array(y_train)
y_test = np.array(y_test)

#### removing html tags and braces from the text

In [7]:
text_train = X_train
text_test = X_test

def removeNoise(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    return re.sub('\[[^]]*\]', '', text)

text = [removeNoise(x) for x in text_train]

In [8]:
text_train = text
text_test = [removeNoise(x) for x in text_test]



#### Removing special characters from the text

In [9]:
def removeSpecialCharacters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

text_train=[removeSpecialCharacters(x) for x in text_train]
text_test = [removeSpecialCharacters(x) for x in text_test]

#### stemming

In [10]:
def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


text_train=[stemmer(x) for x in text_train]
text_test = [stemmer(x) for x in text_test]

#### remove stop words

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camelliadebnath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
stopwords =nltk.corpus.stopwords.words('english')
print stopwords

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself', u'it', u"it's", u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u"that'll", u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'eac

In [13]:
tokenizer=nltk.tokenize.toktok.ToktokTokenizer()

def removeStopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

text_train=[removeStopwords(x) for x in text_train]
text_test = [removeStopwords(x) for x in text_test]

### Logistic Regression


#### Bag of Words Model

In [14]:
countVectorizer=sklearn.feature_extraction.text.CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))

#fit BOW on training dataset
cv_train_reviews=countVectorizer.fit_transform(text_train)

#apply model on test dataset
cv_test_reviews=countVectorizer.transform(text_test)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

('BOW_cv_train:', (33500, 5305683))
('BOW_cv_test:', (16500, 5305683))


#### TF-IDF Vectorization

In [15]:
tfIdfVectorizer=sklearn.feature_extraction.text.TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

tv_train_reviews=tfIdfVectorizer.fit_transform(text_train)
tv_test_reviews=tfIdfVectorizer.transform(text_test)

print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

('Tfidf_train:', (33500, 5305683))
('Tfidf_test:', (16500, 5305683))


#### Logistic Regression Modeling

In [16]:
text_train = np.array(text_train)
text_test = np.array(text_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [17]:
print tv_train_reviews.size
print y_train.size

5305683
33500


In [18]:
help(sklearn.linear_model.LogisticRegression.fit)

Help on method fit in module sklearn.linear_model.logistic:

fit(self, X, y, sample_weight=None) unbound sklearn.linear_model.logistic.LogisticRegression method
    Fit the model according to the given training data.
    
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    
    y : array-like, shape (n_samples,)
        Target vector relative to X.
    
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    
        .. versionadded:: 0.17
           *sample_weight* support to LogisticRegression.
    
    Returns
    -------
    self : object



In [19]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

#### Fitting model for both BOW and TFIDF vectors 

In [20]:
lr=sklearn.linear_model.LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

lr_bow=lr.fit(cv_train_reviews,y_train)
print(lr_bow)

lr_tfidf=lr.fit(tv_train_reviews,y_train)
print(lr_tfidf)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [21]:
print y_test.shape
print y_train.shape


(16500,)
(33500,)


In [22]:
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)

lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

lr_bow_score=sklearn.metrics.accuracy_score(y_test,lr_bow_predict)
print("BOW score :",lr_bow_score)

lr_tfidf_score=sklearn.metrics.accuracy_score(y_test,lr_tfidf_predict)
print("TFIDF score :",lr_tfidf_score)

[0 1 0 ... 1 1 1]
[0 1 0 ... 1 1 1]
('BOW score :', 0.7401818181818182)
('TFIDF score :', 0.7396363636363636)


### Support Vector Machines

In [23]:
bowTrain = cv_train_reviews
bowTest = cv_test_reviews

tfidfTrain = tv_train_reviews
tfidfTest = tv_test_reviews



In [24]:
svm = sklearn.svm.SVC(gamma='scale',max_iter=100)
svm.fit(bowTrain, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=100, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
svm_bow_preds = svm.predict(bowTest)
accuracy = sum(svm_bow_preds == y_test)
print accuracy

8292


In [26]:
svm = sklearn.svm.SVC(gamma='scale',max_iter=100)
svm.fit(tfidfTrain, y_train)
svm_tfidf_preds = svm.predict(bowTest)
accuracy = sum(svm_tfidf_preds == y_test)



8167


In [30]:
y_test.shape[0]

16500

In [32]:
print (1.0*accuracy)/y_test.shape[0]

0.49496969697


### Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

#bowTrain,bowTest

#tfidfTrain,tfidfTest

In [36]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
clf.fit(bowTrain, y_train) 
randomFOrest_bow_pred = clf.predict(bowTest)
accurate = sum(randomFOrest_bow_pred == y_test)
accuracy = (1.0*accurate/y_test.shape[0])
print accuracy

0.497515151515


In [37]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
clf.fit(tfidfTrain, y_train) 
randomForest_tfidf_pred = clf.predict(tfidfTest)
accurate = sum(randomForest_tfidf_pred == y_test)
accuracy = (1.0*accurate/y_test.shape[0])
print accuracy

0.497515151515
