In [None]:
# Mount Google Drive so the saved models and data under /content/drive/MyDrive
# (read by later cells) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loading All Saved Models

## Text Classification Model

In [None]:
import urllib.request
from bs4 import BeautifulSoup as soup
import requests
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
# stopwords.words('english')

In [None]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Our data is already tokenized, so this identity function is handed to
# CountVectorizer as both tokenizer and preprocessor to bypass sklearn's
# built-in text handling.
def dummy(doc):
    """Return ``doc`` unchanged (identity pass-through)."""
    return doc

# Build the feature pipeline: raw token counts followed by TF-IDF weighting.
# `dummy` replaces both the tokenizer and the preprocessor, and lowercasing is
# disabled, so documents pass through CountVectorizer's text handling untouched.
count_step = CountVectorizer(lowercase=False, preprocessor=dummy, tokenizer=dummy)
tfidf_step = TfidfTransformer()
tf_idf_pipeline = Pipeline([('vect', count_step), ('tfidf', tfidf_step)])

# Locations of the pickled dataset and the trained classifier on Drive.
# Pickle/joblib files execute code when loaded -- only load trusted files.
PIPELINE_DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP/Project/Text classificaiton/data_for_pipeline.pkl"
TEXT_CLF_MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/NLP/Project/Text classificaiton/text_classification_model.pkl"

steam = pd.read_pickle(PIPELINE_DATA_PATH)
final_model = joblib.load(TEXT_CLF_MODEL_PATH)

# Only the features are needed here: the pipeline is (re)fitted on the
# 'clean_text' column so it can transform new text for `final_model`.
X_train = steam['clean_text']

tf_idf_pipeline.fit(X_train)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


Pipeline(steps=[('vect',
                 CountVectorizer(lowercase=False,
                                 preprocessor=<function dummy at 0x7f621e7b9440>,
                                 tokenizer=<function dummy at 0x7f621e7b9440>)),
                ('tfidf', TfidfTransformer())])

## Text Summarization Model

In [None]:
# Install the pinned versions of transformers and pytorch-lightning used by
# the summarization cells below (--quiet suppresses most installer output).
!pip install --quiet transformers==4.5.0
!pip install --quiet pytorch-lightning==1.2.7

[K     |████████████████████████████████| 2.1 MB 29.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 70.7 MB/s 
[K     |████████████████████████████████| 895 kB 34.3 MB/s 
[K     |████████████████████████████████| 830 kB 37.1 MB/s 
[K     |████████████████████████████████| 136 kB 76.1 MB/s 
[K     |████████████████████████████████| 829 kB 58.0 MB/s 
[K     |████████████████████████████████| 408 kB 57.6 MB/s 
[K     |████████████████████████████████| 596 kB 69.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 48.3 MB/s 
[K     |████████████████████████████████| 271 kB 73.7 MB/s 
[K     |████████████████████████████████| 94 kB 4.4 MB/s 
[K     |████████████████████████████████| 144 kB 77.0 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone


In [None]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from termcolor import colored
import textwrap

from transformers import (AdamW,
                          T5ForConditionalGeneration,
                          T5TokenizerFast as T5Tokenizer)
from tqdm.auto import tqdm

In [None]:
# Name of the pretrained T5 checkpoint; also read by WikiHowModel.__init__.
MODEL_NAME = "t5-base"

# Tokenizer for MODEL_NAME (T5TokenizerFast, imported under the name T5Tokenizer).
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
class WikiHowModel(pl.LightningModule):
    """LightningModule wrapping a pretrained T5 conditional-generation model.

    The wrapped Hugging Face model lives in ``self.model``. Batches passed to
    the step methods are mappings with the keys ``text_input_ids``,
    ``text_attention_mask``, ``labels`` and ``labels_attention_mask``.
    """

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped T5 model and return ``(loss, logits)``."""
        result = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask,
        )
        return result.loss, result.logits

    def _batch_loss(self, batch):
        """Unpack ``batch`` and return the loss from a forward pass."""
        loss, _logits = self(
            input_ids=batch['text_input_ids'],
            attention_mask=batch['text_attention_mask'],
            labels=batch['labels'],
            decoder_attention_mask=batch['labels_attention_mask'],
        )
        return loss

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log it as ``train_loss``."""
        loss = self._batch_loss(batch)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch and log it as ``val_loss``."""
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute the loss for one test batch and log it as ``test_loss``."""
        loss = self._batch_loss(batch)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at a fixed learning rate of 1e-4."""
        return AdamW(self.parameters(), lr=0.0001)

In [None]:
# Instantiate the LightningModule; its __init__ loads the pretrained
# MODEL_NAME weights via T5ForConditionalGeneration.from_pretrained.
trained_model = WikiHowModel()

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Restore the fine-tuned summarization weights saved on Drive.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/NLP/Project/Text Summarization/best-checkpoint.ckpt"

# load_from_checkpoint is a classmethod that builds and RETURNS a new model;
# it does not modify the object it is called on. The result must be assigned,
# otherwise `trained_model` keeps the base pretrained weights and the
# checkpoint is silently ignored.
trained_model = WikiHowModel.load_from_checkpoint(checkpoint_path)

# Inference only: disable gradients and switch to eval mode.
trained_model.freeze()

## Text-to-Speech Model

* set-up and install libraries
* load model
* convert data to wav 

In [None]:
# Clone the Transformer TTS and MelGAN repos
!git clone https://github.com/as-ideas/TransformerTTS.git
!git clone https://github.com/seungwonpark/melgan.git
# Install requirements
!apt-get install -y espeak
!pip install -r TransformerTTS/requirements.txt
# Pin TransformerTTS to a fixed commit.
# NOTE(review): the requirements above are installed BEFORE this checkout, i.e.
# from the default branch's requirements.txt rather than the pinned commit's --
# confirm this ordering is intended.
!cd TransformerTTS/; git checkout c3405c53e435a06c809533aa4453923469081147

# Set up the paths
from pathlib import Path
# MelGAN_path is not referenced again in the visible cells.
MelGAN_path = 'melgan/'
TTS_path = 'TransformerTTS/'

# Put the cloned repo on sys.path; presumably this is what makes the
# `model.factory` / `data.audio` imports below resolve.
import sys
sys.path.append(TTS_path)

# Extra packages installed without version pins.
!pip install ruamel.yaml
!pip install phonemizer
!pip install webrtcvad
!pip install pyworld

# Load pretrained model
from model.factory import tts_ljspeech
from data.audio import Audio

# `model` is the TTS model; `config` and `audio` are read as globals by
# analyze_review when it reconstructs and plays the waveform.
model, config = tts_ljspeech()
audio = Audio(config)

Cloning into 'TransformerTTS'...
remote: Enumerating objects: 4183, done.[K
remote: Counting objects: 100% (722/722), done.[K
remote: Compressing objects: 100% (255/255), done.[K
remote: Total 4183 (delta 500), reused 666 (delta 466), pack-reused 3461[K
Receiving objects: 100% (4183/4183), 26.03 MiB | 11.72 MiB/s, done.
Resolving deltas: 100% (2870/2870), done.
Cloning into 'melgan'...
remote: Enumerating objects: 396, done.[K
remote: Total 396 (delta 0), reused 0 (delta 0), pack-reused 396[K
Receiving objects: 100% (396/396), 18.04 MiB | 18.79 MiB/s, done.
Resolving deltas: 100% (185/185), done.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  espeak-data libespeak1 libportaudio2 libsonic0
The following NEW packages will be installed:
  espeak espeak-data libespeak1 libportaudio2 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 39 not upgraded.
Need to get 1,219 kB of ar

Note: checking out 'c3405c53e435a06c809533aa4453923469081147'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at c3405c5 Fix path.
Downloading data from https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/TransformerTTS/api_weights/ljspeech_tts_config_v1.yaml
Downloading data from https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/TransformerTTS/api_weights/ljspeech_tts_weights_v1.hdf5


In [None]:
import IPython.display as ipd
import numpy as np

# trick to wrap text to the viewing window for this notebook
# Ref: https://stackoverflow.com/questions/58890109/line-wrapping-in-collaboratory-google-results
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that sets ``white-space: pre-wrap`` on <pre> tags."""
  wrap_style = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_style)


# Re-apply the style before every cell execution so long output lines wrap.
get_ipython().events.register('pre_run_cell', set_css)

# VideoGame Review Implementation

## Functions for Review analysis

In [None]:
# The Function getReview was taken from https://github.com/SeyiAgboola/Web-Scrape-IGN-Reviews-Sentiment/blob/master/IGN-Review-Extractor-Sentiment.py
def getReview(url):
    """Download a page and return the text of all its <p> elements.

    Args:
        url: Address of the review page to scrape.

    Returns:
        The paragraph texts joined by two spaces with newlines removed, or
        ``None`` (after printing a message) when the request fails or the
        server does not answer with HTTP 200.
    """
    failure_msg = 'We were unable to extract main review text from ' + str(url)

    try:
        # A timeout keeps the notebook from hanging forever on a stalled server.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    except requests.RequestException:
        # Network-level failures are reported the same way as a bad status code.
        print(failure_msg)
        return None

    if response.status_code != 200:
        print(failure_msg)
        return None

    page = soup(response.text, 'html.parser')
    paragraphs = (p.get_text() for p in page.find_all('p'))
    # Two-space separator: the same result the original got by joining on
    # ' [New Paragraph] ' and then deleting the '[New Paragraph]' marker.
    return '  '.join(paragraphs).replace('\n', '')

def summarize(text, model):
  """Generate a summary of ``text`` with the wrapped T5 model.

  Args:
      text: The text to summarize.
      model: Object exposing the Hugging Face model as ``model.model``
          (``generate`` is called on it).

  Returns:
      The decoded generated sequences concatenated into one string.

  Uses the module-level ``tokenizer`` for both encoding and decoding.
  """
  # Encode the text, padded/truncated to a fixed length of 1000 tokens.
  text_encoding = tokenizer(
      text,
      max_length=1000,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors="pt"
  )

  # Beam-search generation of the summary token ids.
  generated_ids = model.model.generate(
      input_ids=text_encoding["input_ids"],
      attention_mask=text_encoding["attention_mask"],
      max_length=500,
      num_beams=2,
      repetition_penalty=2.5,
      length_penalty=0.05,
      early_stopping=True
  )

  # The keyword is `clean_up_tokenization_spaces`; the previous spelling
  # (`clear_up_...`) was not a recognized argument and was silently ignored.
  preds = [
      tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
      for gen_id in generated_ids
  ]

  return "".join(preds)


def recommended_or_not(text, pipeline, text_clf_model):
    """Classify a review text as recommended or not.

    Args:
        text: The review (or summary) as a string, or an already tokenized
            sequence of tokens.
        pipeline: Fitted feature pipeline exposing ``transform``.
        text_clf_model: Fitted classifier exposing ``predict``.

    Returns:
        ``"RECOMMENDED"`` or ``"NOT RECOMMENDED"``.
    """
    # The feature pipeline is built with an identity tokenizer, i.e. it expects
    # each document to already be a sequence of tokens. Handing it a raw string
    # makes CountVectorizer iterate over single characters, so split first.
    # NOTE(review): whitespace splitting is a minimal tokenizer -- it should
    # mirror whatever produced the training 'clean_text' column; confirm.
    tokens = text.split() if isinstance(text, str) else list(text)

    text_tfidf = pipeline.transform([tokens])
    pred = text_clf_model.predict(text_tfidf)

    # Mapping used by this code: 0 -> not recommended, 1 -> recommended.
    # NOTE(review): an earlier comment here stated the opposite encoding;
    # verify against the labels the classifier was trained on.
    labels = {0: "NOT RECOMMENDED", 1: "RECOMMENDED"}

    return labels[pred[0]]

def analyze_review(url, pipeline, text_clf_model, summary_model, tts_model, display_audio=True):
    """Scrape a review, summarize it, classify the summary and optionally speak it.

    Args:
        url: Address of the review page.
        pipeline: Fitted feature pipeline passed to ``recommended_or_not``.
        text_clf_model: Fitted classifier passed to ``recommended_or_not``.
        summary_model: Summarization model passed to ``summarize``.
        tts_model: Text-to-speech model exposing ``predict``.
        display_audio: When true, synthesize the summary and show an audio player.

    Returns:
        None. Results are printed / displayed.

    The audio branch reads the module-level ``audio`` and ``config`` objects.
    """
    # Web scrape the review text.
    review = getReview(url)

    # getReview returns None when the page could not be fetched; stop here
    # instead of failing inside the tokenizer with an unrelated error.
    if not review:
        print("No review text available for " + str(url) + "; skipping analysis.")
        return

    # Summarize review
    summary = summarize(review, summary_model)

    # Classify review (the classifier sees the summary, not the full text)
    label = recommended_or_not(summary, pipeline, text_clf_model)

    divider = "==" * 80
    print(divider,
          "HERE IS THE REVIEW SUMMARY:",
          divider,
          summary,
          divider,
          f"THIS GAME IS {label}",
          divider,
          sep='\n')

    if display_audio:
        out_normal = tts_model.predict(summary)
        wav = audio.reconstruct_waveform(out_normal['mel'].numpy().T)
        ipd.display(ipd.Audio(wav, rate=config['sampling_rate']))

## Demo IGN Game Review Implementation

In [None]:
# IGN review to analyze
site = "https://www.ign.com/articles/2003/10/14/kirby-air-ride"

# Argument order: feature pipeline, classifier, summarizer, then `model`,
# which is the TTS model returned by tts_ljspeech().
analyze_review(site, tf_idf_pipeline, final_model, trained_model, model)

  next_indices = next_tokens // vocab_size


HERE IS THE REVIEW SUMMARY:
the controls are sloppily designed, but they're disappointing. Kirby is designed to be as simple as possible -- it's not an intuitive racer. there are some nice touches that help separate the game from competing software.
THIS GAME IS RECOMMENDED
