In [None]:
# Set up the Colab work environment: mount Drive, enter the project folder, install packages.
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')
# Work from the project folder so the relative paths used later (model checkpoint, CSV files)
# resolve against it.
os.chdir("/content/drive/My Drive/CS5224")
# Install the extra packages (unpinned, so the newest release is pulled on each run).
# NOTE(review): boto3 is not imported by any cell visible here — confirm it is still needed.
!pip install boto3
!pip install transformers

Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.25.2-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 5.1 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.29.0,>=1.28.2
  Downloading botocore-1.28.2-py3-none-any.whl (9.3 MB)
[K     |████████████████████████████████| 9.3 MB 44.2 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 6.8 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 37.8 MB/s 
Installing collected packages: urllib3, jmespath, botocore, s3transfer, boto3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
import torch

import pandas as pd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import numpy as np

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Hugging Face checkpoint name used for both the tokenizer and the BertModel backbone.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Class labels; only their count is used in the visible code (size of the classifier head).
# Presumably index 0 = negative and 1 = positive — confirm against the training notebook.
class_names = ['negative', 'positive']
# Maximum number of tokens per text handed to the tokenizer (longer inputs are truncated).
MAX_LEN=32

class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The submodule attribute names (``bert``, ``drop``, ``out``) are part of the
    state-dict keys of saved checkpoints and must not be renamed.
    """

    def __init__(self, n_classes):
        """Build the network with ``n_classes`` output logits."""
        super().__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits for a batch of tokenised inputs."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = encoded["pooler_output"]
        regularised = self.drop(pooled)
        logits = self.out(regularised)
        return logits


class TwitterSentimentDataset(Dataset):
    """Map-style dataset that tokenises one DataFrame row per item.

    Args:
        df: DataFrame with ``text``, ``target`` and ``no`` columns.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_len: fixed token length every item is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        """Return the tokenised sample at position ``item``.

        Returns a dict with the raw ``text``, 1-D ``input_ids`` and
        ``attention_mask`` tensors of length ``max_len``, and the ``targets``
        and ``no`` values as long tensors.
        """
        # Positional lookup: the sampler yields 0..len-1, which only coincides
        # with the index labels when the frame's index has been reset.
        row = self.df.iloc[item]
        text = row["text"]
        target = row["target"]
        no = row["no"]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Pad to a fixed length. padding=True pads to the longest sequence
            # in the call, which for a single text means no padding at all, so
            # items would have different lengths and could not be stacked into
            # a batch.
            padding="max_length",
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True,
            return_tensors="pt",
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.long),
            "no": torch.tensor(no, dtype=torch.long),
        }

def create_data_loader(df, tokenizer, MAX_LEN, batch_size, num_workers=4):
    """Wrap ``df`` in a TwitterSentimentDataset and return a DataLoader over it.

    Args:
        df: DataFrame with the columns TwitterSentimentDataset reads.
        tokenizer: tokenizer forwarded to the dataset.
        MAX_LEN: maximum token length forwarded to the dataset.
        batch_size: number of samples per batch.
        num_workers: worker processes used for loading; defaults to 4 as
            before, pass 0 to load in the main process.

    Returns:
        A DataLoader that yields batches in row order (no shuffling).
    """
    ds = TwitterSentimentDataset(df, tokenizer, MAX_LEN)
    # shuffle=False is the DataLoader default; it is spelled out because the
    # evaluation code relies on predictions coming back in row order.
    return DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)



# Fine-tuned classifier weights, resolved relative to the current working directory.
PATH = "best_model_state.bin"
model = SentimentClassifier(len(class_names)).to(device)
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH, map_location=device))
# Switch to inference mode: a freshly built nn.Module is in training mode, which keeps
# the dropout layer active and makes predictions non-deterministic.
model.eval()

def evalSingleSentence(sentence):
  """Classify a single sentence and return the predicted class index.

  Args:
    sentence: raw text to classify.

  Returns:
    The index of the highest-scoring logit as a Python int.
  """
  # One-row frame with the columns the dataset reads; target and no are dummy values.
  evaluationDF = pd.DataFrame({'target': [0], 'text': [sentence], 'no': [0]})

  eval_data_loader = create_data_loader(evaluationDF, tokenizer, MAX_LEN, 1)
  # Make sure dropout is disabled so the same sentence always gets the same label.
  model.eval()
  with torch.no_grad():
    for d in tqdm(eval_data_loader):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      _, preds = torch.max(outputs, dim=1)
      # The loader holds exactly one batch of one sample, so return on the first pass.
      return preds.item()




Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Input CSV handed to evalFile; a relative path, so it resolves against the working directory.
filepath = 'keyword_tweets_OpIran_clean.csv'

In [None]:
def evalFile(filepath):
  """Classify every row of a tweet CSV and write the labelled frame to disk.

  The file is read without a header and must have exactly three columns, which
  are named ``no``, ``Time`` and ``text``. The first row (presumably the
  original header line) and rows with missing values are dropped. The result is
  written to ``filepath + '_flag.csv'`` with the predicted class in ``target``.

  Args:
    filepath: path of the CSV file to classify.
  """
  df = pd.read_csv(filepath, header=None, encoding="ISO-8859-1")
  df.columns = ["no", "Time", "text"]
  df = df.drop(0).dropna().reset_index(drop=True)
  # Placeholder label: the dataset reads a target column for every row.
  df['target'] = 0

  eval_data_loader = create_data_loader(df, tokenizer, MAX_LEN, 1)

  # Make sure dropout is disabled so predictions are deterministic.
  model.eval()

  # Collect predictions in loader order. The loader does not shuffle, so the
  # i-th prediction belongs to the i-th row of df. Indexing a buffer by the
  # `no` column instead overflows and misaligns rows once dropna() has removed
  # rows, because `no` keeps its original numbering.
  predictions = []
  with torch.no_grad():
    for d in tqdm(eval_data_loader):
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      _, preds = torch.max(outputs, dim=1)
      predictions.extend(preds.cpu().tolist())

  # Stored as floats, matching the 0.0 / 1.0 values written previously.
  df['target'] = np.asarray(predictions, dtype=float)
  df.to_csv(filepath + '_flag.csv')






In [None]:
# Classify the whole file; evalFile writes the result to filepath + '_flag.csv'.
evalFile(filepath)

  cpuset_checked))


  0%|          | 0/99918 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Load the tweet CSV without a header, name its three columns, drop the first row
# and any row with missing values, renumber the index and add a placeholder label.
df = (
    pd.read_csv(
        "keyword_tweets_UkraineRussianWar_70k_rows_clean.csv",
        header=None,
        encoding="ISO-8859-1",
    )
    .set_axis(["no", "Time", "text"], axis="columns")
    .drop(0)
    .dropna()
    .reset_index(drop=True)
    .assign(target=0)
)
df


Unnamed: 0,no,Time,text,target
0,0.0,2022-10-16 12:43:16,putin russia looses wants us stop sanctions wo...,0
1,1.0,2022-10-16 12:42:29,rt belarusian media report least 6 mig 31s dag...,0
2,2.0,2022-10-16 12:42:06,ukraine russian jets jets carrying hypersonic ...,0
3,3.0,2022-10-16 12:40:55,rt himars missle hit russian positions direction,0
4,4.0,2022-10-16 12:40:45,stick pin arse deflate ego horror unspeakable ...,0
...,...,...,...,...
69869,69936.0,2022-10-07 22:51:27,last certain,0
69870,69937.0,2022-10-07 22:51:01,consequences shelling zaporizhzhia russian arm...,0
69871,69938.0,2022-10-07 22:50:29,last certain,0
69872,69939.0,2022-10-07 22:50:14,birthday loser via visit maps memes news other...,0


In [None]:
# Zero-initialised float buffer for the per-row predictions.
# NOTE(review): the buffer holds len(df) + 1 entries, one more than df has rows; a column
# assignment such as df['target'] = result needs exactly len(df) values — confirm the size.
result = np.zeros(shape=(len(df) + 1))
result

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Batch size 1: the inference loop reads each batch as a single sample.
eval_data_loader = create_data_loader(df, tokenizer, MAX_LEN, 1)

In [None]:
# Make sure dropout is disabled so predictions are deterministic.
model.eval()

# Collect predictions in loader order. The loader does not shuffle, so the i-th
# prediction belongs to the i-th row of df. Writing into a buffer indexed by the
# `no` column raises IndexError here, because `no` keeps its original numbering
# (up to 69939) while df has fewer rows after dropna().
predictions = []
with torch.no_grad():
  for d in tqdm(eval_data_loader):
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, preds = torch.max(outputs, dim=1)
    predictions.extend(preds.cpu().tolist())

# One float score per row of df, in row order, ready to be assigned as a column.
result = np.asarray(predictions, dtype=float)

  0%|          | 0/69874 [00:00<?, ?it/s]

IndexError: ignored

In [None]:
# Inspect the prediction array after the inference loop.
result

array([0., 0., 1., ..., 0., 1., 0.])

In [None]:
# Frame before the predictions are attached; target still holds the placeholder 0.
df

Unnamed: 0,no,Time,text,target
0,0.0,2022-10-16 12:43:16,putin russia looses wants us stop sanctions wo...,0
1,1.0,2022-10-16 12:42:29,rt belarusian media report least 6 mig 31s dag...,0
2,2.0,2022-10-16 12:42:06,ukraine russian jets jets carrying hypersonic ...,0
3,3.0,2022-10-16 12:40:55,rt himars missle hit russian positions direction,0
4,4.0,2022-10-16 12:40:45,stick pin arse deflate ego horror unspeakable ...,0
...,...,...,...,...
69869,69936.0,2022-10-07 22:51:27,last certain,0
69870,69937.0,2022-10-07 22:51:01,consequences shelling zaporizhzhia russian arm...,0
69871,69938.0,2022-10-07 22:50:29,last certain,0
69872,69939.0,2022-10-07 22:50:14,birthday loser via visit maps memes news other...,0


In [None]:
# Replace the placeholder target column with the predicted labels.
df['target'] = result

In [None]:
# Frame with the predicted labels in the target column.
df

Unnamed: 0,no,Time,text,target
0,0.0,2022-10-16 12:43:16,putin russia looses wants us stop sanctions wo...,0.0
1,1.0,2022-10-16 12:42:29,rt belarusian media report least 6 mig 31s dag...,0.0
2,2.0,2022-10-16 12:42:06,ukraine russian jets jets carrying hypersonic ...,1.0
3,3.0,2022-10-16 12:40:55,rt himars missle hit russian positions direction,0.0
4,4.0,2022-10-16 12:40:45,stick pin arse deflate ego horror unspeakable ...,0.0
...,...,...,...,...
69869,69936.0,2022-10-07 22:51:27,last certain,0.0
69870,69937.0,2022-10-07 22:51:01,consequences shelling zaporizhzhia russian arm...,1.0
69871,69938.0,2022-10-07 22:50:29,last certain,0.0
69872,69939.0,2022-10-07 22:50:14,birthday loser via visit maps memes news other...,1.0


In [None]:
# Write the labelled frame one directory above the working directory; the DataFrame
# index is written as the first column (to_csv default).
df.to_csv('../UkraineRussianWar_70k_rows_clean_flag.csv')