In [None]:
# One-time setup: uncomment the next line to fetch the realworldnlp repository.
#!git clone https://github.com/mhagiwara/realworldnlp.git
# Walk into the cloned repository before importing from it -- presumably so the
# `realworldnlp` package resolves against the current working directory.
# NOTE(review): these directory names are specific to the author's machine.
%cd nikit
%cd Allen NLP
%cd realworldnlp
from realworldnlp.predictors import SentenceClassifierPredictor
# Step back up one level; later cells use relative paths (data/..., Sentences\...,
# Predictions\...) that are resolved from the working directory left here.
%cd ..

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
from allennlp.data import DataLoader, TextFieldTensors
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from allennlp.training.trainer import GradientDescentTrainer as Trainer
from allennlp_models.classification.dataset_readers.stanford_sentiment_tree_bank import \
    StanfordSentimentTreeBankDatasetReader
from typing import Dict

from allennlp.predictors.predictor import Predictor
import allennlp_models.classification

# Model hyperparameters shared by the cells below:
#   EMBEDDING_DIM -- width of each token embedding vector (also the LSTM input size)
#   HIDDEN_DIM    -- width of the LSTM hidden state
EMBEDDING_DIM: int = 128
HIDDEN_DIM: int = 128

In [None]:
# Read the train and dev splits with the same reader instance; the reader is
# reused later when building the predictor.
reader = StanfordSentimentTreeBankDatasetReader()

train_dataset, dev_dataset = (
    reader.read('data/stanfordSentimentTreebank/trees/train.txt'),
    reader.read('data/stanfordSentimentTreebank/trees/dev.txt'),
)

In [None]:
# Build the vocabulary from both splits; entries in the 'tokens' namespace are
# subject to a minimum count of 3.
all_instances = train_dataset + dev_dataset
vocab = Vocabulary.from_instances(all_instances, min_count={'tokens': 3})

In [None]:
# One embedding row per entry of the 'tokens' vocabulary namespace, wrapped in a
# text-field embedder keyed by the same "tokens" indexer name.
token_vocab_size = vocab.get_vocab_size('tokens')
token_embedding = Embedding(
    num_embeddings=token_vocab_size,
    embedding_dim=EMBEDDING_DIM,
)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [None]:
class LstmClassifier(Model):
    """Sentence classifier: embed tokens, encode them into one vector, project to labels.

    Parameters
    ----------
    word_embeddings : TextFieldEmbedder
        Maps the indexed ``tokens`` field to a sequence of embedding vectors.
    encoder : Seq2VecEncoder
        Collapses the embedded sequence (plus mask) into a single vector.
    vocab : Vocabulary
        Supplies the size of the ``labels`` namespace and the label <-> index mapping.
    positive_label : str
        Label (in the ``labels`` namespace) that precision/recall/F1 are reported for.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary,
                 positive_label: str = '4') -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        # Linear projection from the encoder output to one logit per label.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.loss_function = torch.nn.CrossEntropyLoss()
        positive_index = vocab.get_token_index(positive_label, namespace='labels')
        self.f1_measure = F1Measure(positive_index)

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch.

        Returns a dict that always contains ``"logits"``; when ``label`` is given,
        the accuracy/F1 metrics are updated and ``"loss"`` is added as well.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            self.f1_measure(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return accuracy plus precision/recall/F1 for the positive label.

        The returned keys are ``accuracy``, ``precision``, ``recall`` and
        ``f1_measure``.
        """
        f1_stats = self.f1_measure.get_metric(reset)
        if isinstance(f1_stats, dict):
            # Newer AllenNLP releases return a dict keyed 'precision'/'recall'/'f1'.
            # Tuple-unpacking that dict would silently yield the key *strings*
            # instead of the metric values.
            precision = f1_stats['precision']
            recall = f1_stats['recall']
            f1_measure = f1_stats['f1']
        else:
            # Older releases return a (precision, recall, f1) tuple.
            precision, recall, f1_measure = f1_stats
        return {'accuracy': self.accuracy.get_metric(reset),
                'precision': precision,
                'recall': recall,
                'f1_measure': f1_measure}

In [None]:
# Wrap a plain PyTorch LSTM so it can be used as the classifier's Seq2Vec
# encoder, then assemble the classifier.
recurrent_layer = torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)
lstm = PytorchSeq2VecWrapper(recurrent_layer)

model = LstmClassifier(word_embeddings, lstm, vocab)

In [None]:
##TRAINING
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# Both splits must be indexed with the vocabulary before they are batched.
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)

# Batches of 32 instances, bucketed using the "tokens" field as the sorting key.
train_sampler = BucketBatchSampler(train_dataset,
                                   batch_size=32,
                                   sorting_keys=["tokens"])
train_data_loader = DataLoader(train_dataset, batch_sampler=train_sampler)

dev_sampler = BucketBatchSampler(dev_dataset,
                                 batch_size=32,
                                 sorting_keys=["tokens"])
dev_data_loader = DataLoader(dev_dataset, batch_sampler=dev_sampler)

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=20,
)

# Last expression of the cell: the value returned by train() is displayed.
trainer.train()

In [None]:
##TESTING
# Sanity check: classify one hand-written sentence and print its predicted label.
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

prediction = predictor.predict('This is a happy movie!')
logits = prediction['logits']
# Index of the highest-scoring logit, mapped back to its label string.
label_id = np.argmax(logits)

print(model.vocab.get_token_from_index(label_id, 'labels'))

In [None]:
##INPUT TEXT
# Load the sentence file without treating the first row as a header.
# NOTE(review): the next cell transposes this frame, so the file presumably holds
# the sentences as comma-separated fields of a single row -- confirm.
filename = r"Sentences\daily_nation_oped_lifestyle_news_business_counties_sports_1998_2000_sentences_female1.txt"
Df_pd = pd.read_csv(
    filename,
    header=None,
    encoding='utf-8',
)

In [None]:
# Flip rows/columns so each original column becomes a row, and add a 'Score'
# column pre-filled with the placeholder value 100.
text = Df_pd.T.assign(Score=100)
text

In [None]:
##PREDICT
# Classify every row of `text` (column 0 holds the input string) and store the
# predicted label string in the 'Score' column.
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)

predicted_labels = []
for ind in text.index:
    # .loc avoids chained indexing (text[0][ind]).
    inputText = text.loc[ind, 0]
    logits = predictor.predict(inputText)['logits']
    label_id = np.argmax(logits)
    predicted_labels.append(model.vocab.get_token_from_index(label_id, 'labels'))
    print(ind)  # progress indicator

# Assign the whole column at once. Writing string labels one by one with .at
# into the integer placeholder column is an incompatible-dtype set, which newer
# pandas versions deprecate.
text['Score'] = predicted_labels

In [None]:
# Persist the scored sentences, then tabulate how often each label was predicted.
text.to_csv(r'Predictions\98female1.csv')
my_tab = text['Score'].value_counts()
my_tab

In [None]:
##CALCULATING % SENTIMENTS FOR ALL YEARS - AFTER ALL YEARS' CSV SAVED
# Load each saved prediction file and tag its rows with the year it is labelled
# with in this notebook.
female98 = pd.read_csv(r"Predictions\98female1.csv", encoding='utf-8').assign(Year=1998)
female01 = pd.read_csv(r"Predictions\01female1.csv", encoding='utf-8').assign(Year=2001)
female04 = pd.read_csv(r"Predictions\04female1.csv", encoding='utf-8').assign(Year=2004)
female07 = pd.read_csv(r"Predictions\07female1.csv", encoding='utf-8').assign(Year=2007)
female10 = pd.read_csv(r"Predictions\10female1.csv", encoding='utf-8').assign(Year=2010)
female13 = pd.read_csv(r"Predictions\13female1.csv", encoding='utf-8').assign(Year=2013)
female16 = pd.read_csv(r"Predictions\16female1.csv", encoding='utf-8').assign(Year=2016)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (original row indices are kept).
female = pd.concat([female98, female01, female04, female07,
                    female10, female13, female16])

# Year x Score contingency table. These are raw counts: crosstab is called
# without `normalize`.
my_tab = pd.crosstab(index=female["Year"], columns=female['Score'])
my_tab

In [None]:
# Load each saved prediction file for the male corpus and tag its rows with the
# year it is labelled with in this notebook.
male98 = pd.read_csv(r"Predictions\98male1.csv", encoding='utf-8').assign(Year=1998)
male01 = pd.read_csv(r"Predictions\01male1.csv", encoding='utf-8').assign(Year=2001)
male04 = pd.read_csv(r"Predictions\04male1.csv", encoding='utf-8').assign(Year=2004)
male07 = pd.read_csv(r"Predictions\07male1.csv", encoding='utf-8').assign(Year=2007)
male10 = pd.read_csv(r"Predictions\10male1.csv", encoding='utf-8').assign(Year=2010)
male13 = pd.read_csv(r"Predictions\13male1.csv", encoding='utf-8').assign(Year=2013)
male16 = pd.read_csv(r"Predictions\16male1.csv", encoding='utf-8').assign(Year=2016)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (original row indices are kept).
male = pd.concat([male98, male01, male04, male07,
                  male10, male13, male16])

# Year x Score contingency table. These are raw counts: crosstab is called
# without `normalize`.
my_tab = pd.crosstab(index=male["Year"], columns=male['Score'])
my_tab