In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rotten-tomatoes/data_no_sentiment.csv
/kaggle/input/rotten-tomatoes/imdb-model.pt


In [2]:
import torch
print(torch.__version__)
import random
import numpy as np

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

1.13.0


In [3]:
def predict_sentiment(model, tokenizer, sentence):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [4]:
from transformers import BertTokenizer, BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.bert(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [6]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [8]:
# Load model weight here!
model.load_state_dict(torch.load("/kaggle/input/rotten-tomatoes/imdb-model.pt"))

<All keys matched successfully>

In [9]:
import pandas as pd

# Use Parallelism to Load Data in Batches
tp = pd.read_csv("/kaggle/input/rotten-tomatoes/data_no_sentiment.csv", iterator=True, chunksize=5_000_000) 
df = pd.concat(tp, ignore_index=True)
df

Unnamed: 0.1,Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_date,review_content,movie_title,movie_info,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...",Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
2,2,m/0814255,,False,FILMINK (Australia),Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
3,3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
4,4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129882,1129882,m/zulu_dawn,Chuck O'Leary,False,Fantastica Daily,Rotten,2005-11-02,,Zulu Dawn,Sir Henry Bartle Frere's (John Mills) vastly o...,...,Tango Entertainment,Rotten,50.0,8.0,Upright,62.0,4469.0,0,4,4
1129883,1129883,m/zulu_dawn,Ken Hanke,False,"Mountain Xpress (Asheville, NC)",Fresh,2007-03-07,"Seen today, it's not only a startling indictme...",Zulu Dawn,Sir Henry Bartle Frere's (John Mills) vastly o...,...,Tango Entertainment,Rotten,50.0,8.0,Upright,62.0,4469.0,0,4,4
1129884,1129884,m/zulu_dawn,Dennis Schwartz,False,Dennis Schwartz Movie Reviews,Fresh,2010-09-16,A rousing visual spectacle that's a prequel of...,Zulu Dawn,Sir Henry Bartle Frere's (John Mills) vastly o...,...,Tango Entertainment,Rotten,50.0,8.0,Upright,62.0,4469.0,0,4,4
1129885,1129885,m/zulu_dawn,Christopher Lloyd,False,Sarasota Herald-Tribune,Rotten,2011-02-28,"A simple two-act story: Prelude to war, and th...",Zulu Dawn,Sir Henry Bartle Frere's (John Mills) vastly o...,...,Tango Entertainment,Rotten,50.0,8.0,Upright,62.0,4469.0,0,4,4


In [10]:
df.dropna(subset=['review_content'], inplace=True)

In [11]:
df2 = df.loc[0:2000]
df2

Unnamed: 0.1,Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_date,review_content,movie_title,movie_info,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...",Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
2,2,m/0814255,,False,FILMINK (Australia),Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
3,3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
4,4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,1996,m/10005499-oliver_twist,Rich Cline,False,Shadows on the Wall,Fresh,2005-08-26,"Involving, captivating and nearly note perfect...",Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Sony Pictures,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56
1997,1997,m/10005499-oliver_twist,Kirk Honeycutt,True,Hollywood Reporter,Rotten,2005-09-09,The spark that an original point of view might...,Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Sony Pictures,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56
1998,1998,m/10005499-oliver_twist,Todd McCarthy,True,Variety,Rotten,2005-09-12,"Conventional, straightforward and very much wi...",Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Sony Pictures,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56
1999,1999,m/10005499-oliver_twist,Emanuel Levy,False,EmanuelLevy.Com,Fresh,2005-09-15,Though Polanski doesn't bring a new angle to D...,Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Sony Pictures,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56


In [12]:
prediction = []

for index, row in df2.iterrows():
    # Get the model's prediction
    predicted_sentiment = predict_sentiment(model, tokenizer, row['review_content'])

    # The output is a float, so we need to convert it into -1, 0, or 1
    # Play around with the thresholds
    if predicted_sentiment > 0.8:
        predicted_sentiment = 1
    elif predicted_sentiment < 0.2:
        predicted_sentiment = -1
    else:
        predicted_sentiment = 0
        
    prediction.append(predicted_sentiment)
    
df2['sentiment'] = prediction
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0.1,Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_date,review_content,movie_title,movie_info,...,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,sentiment
0,0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,2010-02-06,A fantasy adventure that fuses Greek mythology...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,1
1,1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff...",Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,1
2,2,m/0814255,,False,FILMINK (Australia),Fresh,2010-02-09,With a top-notch cast and dazzling special eff...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,1
3,3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,1
4,4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,2010-02-10,What's really lacking in The Lightning Thief i...,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",...,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1996,1996,m/10005499-oliver_twist,Rich Cline,False,Shadows on the Wall,Fresh,2005-08-26,"Involving, captivating and nearly note perfect...",Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56,1
1997,1997,m/10005499-oliver_twist,Kirk Honeycutt,True,Hollywood Reporter,Rotten,2005-09-09,The spark that an original point of view might...,Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56,0
1998,1998,m/10005499-oliver_twist,Todd McCarthy,True,Variety,Rotten,2005-09-12,"Conventional, straightforward and very much wi...",Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56,1
1999,1999,m/10005499-oliver_twist,Emanuel Levy,False,EmanuelLevy.Com,Fresh,2005-09-15,Though Polanski doesn't bring a new angle to D...,Oliver Twist,An orphan (Barney Clark) in 19th-century Londo...,...,Fresh,61.0,143.0,Upright,63.0,23294.0,39,87,56,1


In [13]:
df2.to_csv('data_with_sentiment.csv')