In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
from sqlalchemy import MetaData, Table, select, desc, func
import pandas as pd

## Sentiment prediction for a single string

In [96]:
class Model:
    """Sentiment classifier for a single piece of text.

    Wraps the ``cardiffnlp/twitter-roberta-base-sentiment-latest`` checkpoint:
    the text is normalised, tokenised, passed through the model, and the
    resulting logits are converted to probabilities with a softmax.
    """

    # Upper bound on the tokenised input length. Longer inputs are truncated
    # rather than crashing the forward pass. 512 is the usual limit for
    # RoBERTa-base checkpoints -- TODO confirm against the model config.
    MAX_TOKENS = 512

    def __init__(self):
        """Load the pretrained model, tokenizer and config by checkpoint name."""
        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)

    @staticmethod
    def preprocess(text):
        """Replace user mentions with ``@user`` and links with ``http``.

        The text is split on single spaces; every token that starts with
        ``@`` (and is longer than one character) becomes ``@user`` and every
        token that starts with ``http`` becomes ``http``.

        Args:
            text: Raw input string.

        Returns:
            The normalised string, tokens re-joined with single spaces.
        """
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return " ".join(new_text)

    def display_result(self, scores):
        """Print every label with its score, highest score first.

        Args:
            scores: 1-D numpy array of scores, indexed like
                ``self.config.id2label``.
        """
        ranking = np.argsort(scores)[::-1]

        for position, label_id in enumerate(ranking, start=1):
            # int() so the lookup uses a plain Python key, not a numpy scalar.
            label = self.config.id2label[int(label_id)]
            score = scores[label_id]
            print(f"{position}) {label} {np.round(float(score), 4)}")

    def insert_result_into_dataframe(self, scores, path='results.csv'):
        """Append the scores as one ``;``-separated row to a CSV file.

        Despite its name, this method writes to a file, not to a DataFrame.
        The row is written with a leading newline and the fixed prefix
        ``1;1;`` (presumably placeholder id columns -- verify with the
        consumer of the file).

        Args:
            scores: Iterable of scores to serialise.
            path: Target file, opened in append mode. Defaults to
                ``results.csv`` in the working directory.
        """
        score2text = '\n1;1;' + ";".join(str(s) for s in scores)

        with open(path, 'a') as f:
            f.write(score2text)

    def predict(self, text):
        """Return the softmax probabilities the model assigns to ``text``.

        Args:
            text: Raw input string; it is normalised with ``preprocess``.

        Returns:
            1-D numpy array with one probability per model label.
        """
        text = self.preprocess(text)
        # Truncate so texts longer than the model's input window do not
        # raise inside the forward pass.
        encoded_input = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_TOKENS,
        )
        output = self.model(**encoded_input)
        # output[0] holds the logits; [0] selects the single sequence.
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        return scores

In [97]:
# Instantiate the single-string classifier; this loads the pretrained model,
# tokenizer and config inside Model.__init__.
m = Model()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
# Score one example sentence; the result is the softmax probability per label.
a = m.predict("Covid cases are increasing fast!")
# The scores can optionally be appended to results.csv with
# m.insert_result_into_dataframe(a).
a

array([0.72357625, 0.2286794 , 0.04774441], dtype=float32)

## Sentiment prediction for a DataFrame of tweets loaded from the database

In [23]:
class Model:
    """Batch sentiment scorer backed by a database.

    On construction the most recent tweets are read from the ``tweets``
    table into ``self.dataframe``. ``predict`` scores every tweet body and
    appends one row per tweet to the ``sentiments`` table.

    NOTE: this class reuses the name ``Model`` and therefore shadows the
    single-string ``Model`` defined earlier in this notebook.
    """

    # Upper bound on the tokenised input length. Longer inputs are truncated
    # rather than crashing the forward pass. 512 is the usual limit for
    # RoBERTa-base checkpoints -- TODO confirm against the model config.
    MAX_TOKENS = 512

    def __init__(self, engine, limit=10):
        """Load the model, reflect the tables and fetch the tweets to score.

        Args:
            engine: SQLAlchemy engine connected to the database that holds
                the ``tweets`` and ``sentiments`` tables.
            limit: Number of most recent tweets (highest ``id``) to load.
        """
        model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)

        self.engine = engine
        self.limit = limit
        # ``autoload_with`` alone triggers reflection; the separate
        # ``autoload=True`` flag is deprecated.
        self.tweets = Table('tweets', MetaData(), autoload_with=engine)
        self.sentiments = Table('sentiments', MetaData(), autoload_with=engine)

        self.dataframe = self.read_data()

    def read_data(self):
        """Fetch ``id`` and ``body`` of the newest tweets.

        Returns:
            DataFrame with columns ``id`` and ``body`` (newest first). The
            columns are present even when the query returns no rows.
        """
        stmt = (
            select(self.tweets.columns.id, self.tweets.columns.body)
            .order_by(desc(self.tweets.columns.id))
            .limit(self.limit)
        )

        with self.engine.connect() as connection:
            result = connection.execute(stmt)
            column_names = list(result.keys())
            rows = [tuple(row) for row in result.fetchall()]

        # Explicit column names keep the frame's schema stable for 0 rows.
        return pd.DataFrame(rows, columns=column_names)

    @staticmethod
    def preprocess(text):
        """Replace user mentions with ``@user`` and links with ``http``.

        Args:
            text: Raw tweet body.

        Returns:
            The normalised string, tokens re-joined with single spaces.
        """
        new_text = []
        for t in text.split(" "):
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)
        return " ".join(new_text)

    def get_model_output(self, text):
        """Return the softmax probabilities for one (preprocessed) text.

        Args:
            text: Text to score.

        Returns:
            List with one probability per model label.
        """
        # Truncate so texts longer than the model's input window do not
        # raise inside the forward pass.
        encoded_input = self.tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.MAX_TOKENS,
        )
        output = self.model(**encoded_input)
        # output[0] holds the logits; [0] selects the single sequence.
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        return list(scores)

    def split_model_output(self):
        """Expand the ``scores`` column into ``positive``/``negative``/``neutral``.

        Score index 0 is stored as ``negative``, 1 as ``neutral`` and 2 as
        ``positive``; the ``scores`` column is removed afterwards.
        """
        # reshape keeps the array two-dimensional when there are no rows.
        model_output = np.array(list(self.dataframe['scores'].values)).reshape(-1, 3)

        self.dataframe['positive'] = model_output[:, 2]
        self.dataframe['negative'] = model_output[:, 0]
        self.dataframe['neutral'] = model_output[:, 1]

        self.dataframe = self.dataframe.drop(columns=['scores'])

    def update_id(self):
        """Re-number the frame's index to continue after ``max(sentiments.id)``.

        The index becomes ``max_id + 1, max_id + 2, ...`` (or ``0, 1, ...``
        when the table is empty). Setting the index absolutely makes repeated
        calls harmless. The ids are computed client-side, so concurrent
        writers could still pick the same values.
        """
        stmt = select(func.max(self.sentiments.columns.id))
        with self.engine.connect() as connection:
            current_max = connection.execute(stmt).scalar()

        first_id = 0 if current_max is None else current_max + 1
        self.dataframe.index = pd.RangeIndex(first_id, first_id + len(self.dataframe))

    def postprocessing(self):
        """Drop the tweet text and rename ``id`` to ``tweet_id``."""
        self.dataframe = (
            self.dataframe
            .drop(columns=['body'])
            .rename(columns={'id': 'tweet_id'})
        )

    def insert_result_into_database(self):
        """Append the frame to ``sentiments``, using its index as column ``id``."""
        self.update_id()

        self.dataframe.to_sql('sentiments', self.engine, if_exists='append', index=True, index_label='id')

    def predict(self):
        """Score every loaded tweet and append the results to ``sentiments``.

        The method consumes ``self.dataframe`` (the ``body`` column is dropped
        along the way), so it is meant to be called once per instance.
        Nothing is written when no tweets were loaded.
        """
        if self.dataframe.empty:
            return

        # self.preprocess instead of Model.preprocess: the global name
        # ``Model`` may be rebound by another class definition.
        self.dataframe['body'] = self.dataframe['body'].apply(self.preprocess)
        self.dataframe['scores'] = self.dataframe['body'].apply(self.get_model_output)

        self.split_model_output()

        self.postprocessing()

        self.insert_result_into_database()

In [3]:
import os

from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base

# The connection URL (including the password) is read from the environment so
# that no credentials are stored in the notebook. Any password that was ever
# committed in this cell should be rotated.
db_string = os.environ.get("DATABASE_URL")
if not db_string:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable, "
        "e.g. postgresql://<user>:<password>@<host>:5432/sandbox"
    )

engine = create_engine(db_string)
Base = declarative_base()

In [27]:
# Preview the stored sentiment rows. pd.read_sql opens and closes its own
# connection and keeps the column names; head() bounds the displayed output.
pd.read_sql("SELECT * FROM sentiments", engine).head(10)

Unnamed: 0,id,tweet_id,positive,negative,neutral
0,0,3649,0.009769,0.764061,0.22617
1,1,3648,0.334867,0.259942,0.405191
2,2,3647,0.789894,0.011655,0.198451
3,3,3646,0.887917,0.014168,0.097915
4,4,3645,0.010045,0.888397,0.101558
5,5,3644,0.135234,0.609589,0.255178
6,6,3643,0.008837,0.476656,0.514507
7,7,3642,0.036169,0.785779,0.178052
8,8,3641,0.01317,0.848672,0.138157
9,9,3640,0.163695,0.251535,0.584771


In [26]:
# Run the whole pipeline: load the newest tweets, score them and append the
# results to the `sentiments` table. This writes to the database and is not
# idempotent: every execution appends rows again for the tweets it loads.
Model(engine).predict()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
