<a href="https://colab.research.google.com/github/gshreya5/colab/blob/main/sentiment_analysis_movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis of 🍅 Rotten Tomatoes Movie Reviews

## Import Libraries

In [16]:
import pandas as pd

# Load Dataset

In [11]:
!kaggle competitions download -c sentiment-analysis-on-movie-reviews

Downloading sentiment-analysis-on-movie-reviews.zip to /content
  0% 0.00/1.90M [00:00<?, ?B/s]
100% 1.90M/1.90M [00:00<00:00, 148MB/s]


In [None]:
!unzip /content/sentiment-analysis-on-movie-reviews.zip -d /content/

# Explore dataset

In [51]:
train = pd.read_csv('/content/train.tsv.zip',sep='\t')
test = pd.read_csv('/content/test.tsv.zip',sep='\t')

In [50]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [32]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [33]:
train.shape, test.shape

((156060, 4), (66292, 3))

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [35]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [36]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [37]:
train.Sentiment.value_counts(normalize=True).sort_index()

0    0.045316
1    0.174760
2    0.509945
3    0.210989
4    0.058990
Name: Sentiment, dtype: float64

# Implement TF-IDF Technique


## Learn Vocabulary using TfidfVectorizer

In [38]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [43]:
stemmer  = SnowballStemmer(language='english')
nltk.download('stopwords')
eng_stopword = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Create custom tokenizer with stemming

In [40]:
def tokenize(text):
    return [stemmer.stem(token) for token in word_tokenize(text)]

## Stop words
we'll remove 'not' words from stop words list because they are indicative of sentiment 

In [75]:
selected_stopwords = eng_stopword[:eng_stopword.index('no')]

### Configure and create TfidfVectorizer

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [76]:
#creating vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             stop_words = selected_stopwords,
                             ngram_range=(1,2),
                             max_features=2000)

### Learn vocubulary from training set

In [77]:
vectorizer.fit(train.Phrase)



### Transform Training & Test Data


In [78]:
train_inputs = vectorizer.transform(train.Phrase)
test_inputs = vectorizer.transform(test.Phrase)

# Train Model

###  Split training and validation sets

not going to choose a random split for validation set because

in our df sentence is broken into many phrases, which could lead to 

inconsistencies

In [63]:
TRAIN_SIZE = 110_000

In [79]:
#creating inputs and targets for train data
X_inputs = train_inputs[:TRAIN_SIZE]
X_targets = train.Sentiment[:TRAIN_SIZE]

In [80]:
#creating inputs and targets for validation data
Y_inputs = train_inputs[TRAIN_SIZE:]
Y_targets = train.Sentiment[TRAIN_SIZE:]

# Logistic Regression Model

In [66]:
from sklearn.linear_model import LogisticRegression


In [90]:
model = LogisticRegression(max_iter=1000)

In [91]:
model.fit(X_inputs,X_targets)

In [92]:
Y_preds = model.predict(Y_inputs)

In [72]:
from sklearn.metrics import accuracy_score

In [93]:
accuracy_score(Y_targets,Y_preds)

0.5798089448545376

## Submission

In [94]:
test_preds = model.predict(test_inputs)

In [109]:
sub = pd.DataFrame(test.PhraseId)

In [110]:
sub['Sentiment'] = test_preds

In [113]:
sub.to_csv('sub.csv', index=None)

In [114]:
!head sub.csv

PhraseId,Sentiment
156061,2
156062,2
156063,2
156064,2
156065,2
156066,3
156067,3
156068,2
156069,3


# MultinomialNB

In [115]:
from sklearn.naive_bayes import MultinomialNB

In [122]:
model2 = MultinomialNB()

In [123]:
model2.fit(X_inputs,X_targets)

In [124]:
Y_preds = model2.predict(Y_inputs)

In [125]:
accuracy_score(Y_targets,Y_preds)

0.5431393834129397