# Final Assignment

## Sentiment analysis

In [15]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [16]:
#getting the datasets
from sklearn.datasets import load_files
import pathlib

cwd = pathlib.Path.cwd()
airline_tweets_folder = cwd.joinpath('airlinetweets')
airline_dataset = load_files(airline_tweets_folder)
# DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
# DATASET_ENCODING = "ISO-8859-1"

# you'll need to download the db at https://www.kaggle.com/datasets/kazanova/sentiment140
# train_dataset = pd.read_csv("./1milliontweets.csv",encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

test_dataset = pd.read_csv("./sentiment-topic-test.tsv", sep='\t')

#### Using Multinomial Nayve-Bayes

In [17]:
sentences = test_dataset.get("text").values
golden_labels = test_dataset.get("sentiment").values

count_vec = CountVectorizer(min_df=1,tokenizer=nltk.word_tokenize,stop_words=stopwords.words('english'))

tfidf_transformer = TfidfTransformer()
train_count_vec = count_vec.fit_transform(airline_dataset.data)
train_tfidf = tfidf_transformer.fit_transform(train_count_vec)

sentences_counts = count_vec.transform(sentences)
sentences_tfidf = tfidf_transformer.fit_transform(sentences_counts)

clf = MultinomialNB().fit(train_tfidf,airline_dataset.target)

pred = clf.predict(sentences_tfidf)




In [27]:
def get_predicted_label(sentiment: float) -> str:
    if sentiment > 0:
        if sentiment > 1:
            return "positive"
        return "neutral"
    return "negative"
        

predicted_tolabel = []
for sentence,(golden,predicted) in zip(sentences,zip(golden_labels,pred)):
    print(sentence)
    print("GOLDEN LABEL:", golden)
    predicted_label = get_predicted_label(predicted)
    predicted_tolabel.append(predicted_label)
    print("PREDICTED:", predicted_label)

I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL: negative
PREDICTED: negative
Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.
GOLDEN LABEL: neutral
PREDICTED: negative
The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!
GOLDEN LABEL: positive
PREDICTED: positive
Zendaya slayed in Dune 2, as she does in all her movies.
GOLDEN LABEL: positive
PREDICTED: negative
While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame.
GOLDEN LABEL: negative
PREDICTED: positive
My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time.
GOLDEN LABEL: neutral
PREDICTED: neutral
He said that The Great Gatsby is the best novell ever, and I was about to throw hands.
GOLDEN LABEL: negative
PREDICTED: positive
I could not look away from this train wrck of a mov

In [28]:
from sklearn.metrics import classification_report

results = classification_report(golden_labels,predicted_tolabel)


In [29]:
print(results)

              precision    recall  f1-score   support

    negative       0.40      0.50      0.44         4
     neutral       1.00      0.33      0.50         3
    positive       0.25      0.33      0.29         3

    accuracy                           0.40        10
   macro avg       0.55      0.39      0.41        10
weighted avg       0.53      0.40      0.41        10



#### Using a Support-Vector Machine

In [31]:
from sklearn import svm

lin_classifier = svm.LinearSVC()
lin_classifier.fit(train_tfidf,airline_dataset.target)



In [32]:
pred = lin_classifier.predict(sentences_tfidf)

In [33]:
pred_labels = [get_predicted_label(number) for number in pred]

In [38]:
print(pred_labels)
print(golden_labels)
results_svm = classification_report(golden_labels.tolist(),pred_labels)

['negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative']
['negative' 'neutral' 'positive' 'positive' 'negative' 'neutral'
 'negative' 'negative' 'neutral' 'positive']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
print(results_svm)

              precision    recall  f1-score   support

    negative       0.38      0.75      0.50         4
     neutral       0.00      0.00      0.00         3
    positive       0.50      0.33      0.40         3

    accuracy                           0.40        10
   macro avg       0.29      0.36      0.30        10
weighted avg       0.30      0.40      0.32        10



### Using a transformer

In [35]:
from transformers import pipeline

In [53]:
labels = []

sentimentenglish = pipeline("sentiment-analysis", 
                            model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
                            top_k=None)
results = sentimentenglish(list(sentences))

def get_top_sentiment(list: result) -> str:
    top_sentiment = result[0]
    for sentiment in result[1:]:

i = 0
for result in results:
    top_sentiment = result[0]["label"]
    
    top_sentiment = result[0]["label"]
    print(sentences[i])
    print("GOLDEN LABEL: ",golden_labels[i])
    print("PREDICTED SENTIMENTS:")
    for sentiment in result:
        print(sentiment["label"]," - ", "{:.2f}%".format(sentiment["score"]*100))

I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
SENTIMENTS:
negative  -  39.02%
neutral  -  38.83%
positive  -  22.15%
I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
SENTIMENTS:
positive  -  40.77%
negative  -  37.60%
neutral  -  21.63%
I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
SENTIMENTS:
positive  -  43.59%
negative  -  43.17%
neutral  -  13.24%
I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
SENTIMENTS:
positive  -  42.89%
negative  -  37.33%
neutral  -  19.77%
I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
SENTIMENTS:
negative  -  40.35%
positive  -  39.99%
neutral  -  19.67%
I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
SENTIMENTS:
negative  -  64.63%
positive  -  22.45%

## Named Entity Recognition

In [4]:
from simpletransformers.ner import NERModel

  torch.utils._pytree._register_pytree_node(


In [5]:
englishmodel = NERModel(
        model_type="bert",
        model_name="dslim/bert-base-NER",
        use_cuda=False
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
ner_test_dataset = pd.read_csv("./NER-test.tsv", sep='\t')

In [7]:
tokens_ner = ner_test_dataset.get("token")

In [8]:
predictions, raw_output = englishmodel.predict(tokens_ner)

  0%|          | 0/1 [00:00<?, ?it/s]

  torch.utils._pytree._register_pytree_node(


Running Prediction:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
predictions

[[{'I': 'O'}],
 [{'would': 'O'}],
 [{"n't": 'O'}],
 [{'be': 'O'}],
 [{'caught': 'O'}],
 [{'dead': 'O'}],
 [{'watching': 'O'}],
 [{'the': 'O'}],
 [{'NFL': 'B-ORG'}],
 [{'if': 'O'}],
 [{'it': 'O'}],
 [{'were': 'O'}],
 [{"n't": 'O'}],
 [{'for': 'O'}],
 [{'Taylor': 'B-PER'}],
 [{'Swift': 'B-PER'}],
 [{'.': 'O'}],
 [{'Chris': 'B-PER'}],
 [{"O'Donnell": 'B-PER'}],
 [{'stated': 'O'}],
 [{'that': 'O'}],
 [{'while': 'O'}],
 [{'filming': 'O'}],
 [{'for': 'O'}],
 [{'this': 'O'}],
 [{'movie': 'O'}],
 [{',': 'O'}],
 [{'he': 'O'}],
 [{'felt': 'O'}],
 [{'like': 'O'}],
 [{'he': 'O'}],
 [{'was': 'O'}],
 [{'in': 'O'}],
 [{'a': 'O'}],
 [{'Toys': 'O'}],
 [{"''": 'O'}],
 [{'R': 'O'}],
 [{"''": 'O'}],
 [{'Us': 'O'}],
 [{'commercial': 'O'}],
 [{'.': 'O'}],
 [{'The': 'O'}],
 [{'whole': 'O'}],
 [{'game': 'O'}],
 [{'was': 'O'}],
 [{'a': 'O'}],
 [{'rollercoaster': 'O'}],
 [{'ride': 'O'}],
 [{',': 'O'}],
 [{'but': 'O'}],
 [{'Los': 'B-ORG'}],
 [{'Angeles': 'B-LOC'}],
 [{'Lakers': 'B-ORG'}],
 [{'ultimately': 'O'}],

## Topic analysis

In [10]:
!pip install simpletransformers





In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import matplotlib.pyplot as plt 
import seaborn as sn 

  torch.utils._pytree._register_pytree_node(


In [2]:
from sklearn.datasets import fetch_20newsgroups

# load only a sub-selection of the categories (3 in our case)
categories = ['rec.sport.baseball', 'talk.politics.misc', 'talk.religion.misc'] 

# remove the headers, footers and quotes (to avoid overfitting)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=categories, random_state=42)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=categories, random_state=42)

In [3]:
train_df = pd.DataFrame({
    'text': newsgroups_train.data,
    'labels': newsgroups_train.target
})

test_df = pd.DataFrame({
    'text': newsgroups_test.data,
    'labels': newsgroups_test.target
})

In [6]:
from sklearn.model_selection import train_test_split

train, dev = train_test_split(train_df, test_size=0.1, random_state=0, 
                               stratify=train_df[['labels']])

In [7]:
# Model configuration # https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model 
model_args = ClassificationArgs()

model_args.overwrite_output_dir=True # overwrite existing saved models in the same directory
model_args.evaluate_during_training=True # to perform evaluation while training the model
# (eval data should be passed to the training method)

model_args.num_train_epochs=10 # number of epochs
model_args.train_batch_size=32 # batch size
model_args.learning_rate=4e-6 # learning rate
model_args.max_seq_length=256 # maximum sequence length
# Note! Increasing max_seq_len may provide better performance, but training time will increase. 
# For educational purposes, we set max_seq_len to 256.

# Early stopping to combat overfitting: https://simpletransformers.ai/docs/tips-and-tricks/#using-early-stopping
model_args.use_early_stopping=True
model_args.early_stopping_delta=0.01 # "The improvement over best_eval_loss necessary to count as a better checkpoint"
model_args.early_stopping_metric='eval_loss'
model_args.early_stopping_metric_minimize=True
model_args.early_stopping_patience=2
model_args.evaluate_during_training_steps=32 # how often you want to run validation in terms of training steps (or batches)

In [8]:
# Checking steps per epoch
steps_per_epoch = int(np.ceil(len(train) / float(model_args.train_batch_size)))
print('Each epoch will have {:,} steps.'.format(steps_per_epoch)) # 64 steps = validating 2 times per epoch

Each epoch will have 41 steps.


In [9]:
model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args=model_args, use_cuda=False) # CUDA is enabled

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
_, history = model.train_model(train, eval_df=dev,average='macro') 

  0%|          | 0/2 [00:00<?, ?it/s]

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/41 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  torch.utils._pytree._register_pytree_node(


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


TypeError: 'str' object is not callable