In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [7]:
df=pd.read_csv('IMDB Dataset.csv')
print(df.describe)
print(df.sentiment.value_counts())

<bound method NDFrame.describe of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>
positive    25000
negative    25000
Name: sentiment, dtype: int64


In [11]:
df.review[1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [8]:
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review'],df['sentiment'], test_size=0.2, random_state=1)


In [9]:
# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words="english")
# Transform the training data: 'review' column
count_train = count_vectorizer.fit_transform(X_train.values)
# Transform the test data: 'review' column
count_test = count_vectorizer.transform(X_test.values)

# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

['00', '000', '00000000000', '0000000000001', '00001', '00015', '000dm', '001', '003830', '0069']




In [10]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test.values)
# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])
# Print the first 5 vectors of the tfidf training data using slicing on the .A (or array) attribute of tfidf_train
print(tfidf_train.A[:5])


['00', '000', '00000000000', '0000000000001', '00001', '00015', '000dm', '001', '003830', '0069']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
difference = set(tfidf_df.columns) - set(count_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))



   00  000  00000000000  0000000000001  00001  00015  000dm  001  003830  \
0   0    0            0              0      0      0      0    0       0   
1   0    0            0              0      0      0      0    0       0   
2   0    0            0              0      0      0      0    0       0   
3   0    0            0              0      0      0      0    0       0   
4   0    0            0              0      0      0      0    0       0   

   0069  ...  überwoman  ünel  ünfaithful  üvegtigris  üzümcü  ýs  \
0     0  ...          0     0           0           0       0   0   
1     0  ...          0     0           0           0       0   0   
2     0  ...          0     0           0           0       0   0   
3     0  ...          0     0           0           0       0   0   
4     0  ...          0     0           0           0       0   0   

   þorleifsson  żmijewski  יגאל  כרמון  
0            0          0     0      0  
1            0          0     0      0  
2    

In [18]:
#using CountVectorizer data for the model

# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test,pred, labels=['positive','negative'])
print(cm)

0.8586
[[4138  818]
 [ 596 4448]]


4138 correctly labelled positive
4448 correctly labelled negatives

In [20]:

# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()
# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm_td = metrics.confusion_matrix(y_test,pred, labels=['positive','negative'])
print(cm_td)

0.871
[[4257  699]
 [ 591 4453]]


Let's try using different alpha levels for the Tfidf vectors, to see if can improve score. 

In [27]:
# Create the list of alphas: alphas
alphas = np.arange(0,1.1,0.1)
print(alphas)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train,y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test,pred)
    return score

# Iterate over the alphas and print the corresponding score
score_list=[]
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    score_list.append(alpha)

print('Best Alpha: ', max(score_list))


[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
Alpha:  0.0
Score:  0.7992
Alpha:  0.1




Score:  0.8645
Alpha:  0.2
Score:  0.8673
Alpha:  0.30000000000000004
Score:  0.8671
Alpha:  0.4
Score:  0.8681
Alpha:  0.5
Score:  0.8686
Alpha:  0.6000000000000001
Score:  0.8687
Alpha:  0.7000000000000001
Score:  0.8698
Alpha:  0.8
Score:  0.8699
Alpha:  0.9
Score:  0.8702
Alpha:  1.0
Score:  0.871
Best Alpha:  1.0


Having the default alpha of 1 for MultinomialNB will return the best score of 0.871