# Final Assignment

## Sentiment analysis

In [15]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [16]:
#getting the datasets
from sklearn.datasets import load_files
import pathlib

cwd = pathlib.Path.cwd()
airline_tweets_folder = cwd.joinpath('airlinetweets')
airline_dataset = load_files(airline_tweets_folder)
# DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
# DATASET_ENCODING = "ISO-8859-1"

# you'll need to download the db at https://www.kaggle.com/datasets/kazanova/sentiment140
# train_dataset = pd.read_csv("./1milliontweets.csv",encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

test_dataset = pd.read_csv("./sentiment-topic-test.tsv", sep='\t')

#### Using Multinomial Nayve-Bayes

In [17]:
sentences = test_dataset.get("text").values
golden_labels = test_dataset.get("sentiment").values

count_vec = CountVectorizer(min_df=1,tokenizer=nltk.word_tokenize,stop_words=stopwords.words('english'))

tfidf_transformer = TfidfTransformer()
train_count_vec = count_vec.fit_transform(airline_dataset.data)
train_tfidf = tfidf_transformer.fit_transform(train_count_vec)

sentences_counts = count_vec.transform(sentences)
sentences_tfidf = tfidf_transformer.fit_transform(sentences_counts)

clf = MultinomialNB().fit(train_tfidf,airline_dataset.target)

pred = clf.predict(sentences_tfidf)




In [27]:
def get_predicted_label(sentiment: float) -> str:
    if sentiment > 0:
        if sentiment > 1:
            return "positive"
        return "neutral"
    return "negative"
        

predicted_tolabel = []
for sentence,(golden,predicted) in zip(sentences,zip(golden_labels,pred)):
    print(sentence)
    print("GOLDEN LABEL:", golden)
    predicted_label = get_predicted_label(predicted)
    predicted_tolabel.append(predicted_label)
    print("PREDICTED:", predicted_label)

I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL: negative
PREDICTED: negative
Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.
GOLDEN LABEL: neutral
PREDICTED: negative
The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!
GOLDEN LABEL: positive
PREDICTED: positive
Zendaya slayed in Dune 2, as she does in all her movies.
GOLDEN LABEL: positive
PREDICTED: negative
While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame.
GOLDEN LABEL: negative
PREDICTED: positive
My uncle's brother's neighbor's cat's veterinarian David reads the communist manifesto in his spare time.
GOLDEN LABEL: neutral
PREDICTED: neutral
He said that The Great Gatsby is the best novell ever, and I was about to throw hands.
GOLDEN LABEL: negative
PREDICTED: positive
I could not look away from this train wrck of a mov

In [28]:
from sklearn.metrics import classification_report

results = classification_report(golden_labels,predicted_tolabel)


In [29]:
print(results)

              precision    recall  f1-score   support

    negative       0.40      0.50      0.44         4
     neutral       1.00      0.33      0.50         3
    positive       0.25      0.33      0.29         3

    accuracy                           0.40        10
   macro avg       0.55      0.39      0.41        10
weighted avg       0.53      0.40      0.41        10



#### Using a Support-Vector Machine

In [31]:
from sklearn import svm

lin_classifier = svm.LinearSVC()
lin_classifier.fit(train_tfidf,airline_dataset.target)



In [32]:
pred = lin_classifier.predict(sentences_tfidf)

In [33]:
pred_labels = [get_predicted_label(number) for number in pred]

In [38]:
print(pred_labels)
print(golden_labels)
results_svm = classification_report(golden_labels.tolist(),pred_labels)

['negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative']
['negative' 'neutral' 'positive' 'positive' 'negative' 'neutral'
 'negative' 'negative' 'neutral' 'positive']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
print(results_svm)

              precision    recall  f1-score   support

    negative       0.38      0.75      0.50         4
     neutral       0.00      0.00      0.00         3
    positive       0.50      0.33      0.40         3

    accuracy                           0.40        10
   macro avg       0.29      0.36      0.30        10
weighted avg       0.30      0.40      0.32        10



### Using a transformer

In [35]:
from transformers import pipeline

In [56]:
predicted_labels = []

sentimentenglish = pipeline("sentiment-analysis", 
                            model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
                            top_k=None)
results = sentimentenglish(list(sentences))

def get_top_sentiment(list: result) -> str:
    top_sentiment = result[0]["label"]
    top_score = result[0]["score"]
    for sentiment in result[1:]:
        if sentiment["score"] > top_score:
            top_score = sentiment["score"]
            top_sentiment = sentiment["label"]
    return top_sentiment

i = 0
for result in results:
    top_sentiment = get_top_sentiment(result)
    print(sentences[i])
    print("GOLDEN LABEL: ",golden_labels[i])
    print("TOP SENTIMENT: ", top_sentiment)
    predicted_labels.append(top_sentiment)
    print("PREDICTED SENTIMENTS:")
    for sentiment in result:
        print(sentiment["label"]," - ", "{:.2f}%".format(sentiment["score"]*100))
    i += 1

I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
GOLDEN LABEL:  negative
TOP SENTIMENT:  negative
PREDICTED SENTIMENTS:
negative  -  39.02%
neutral  -  38.83%
positive  -  22.15%
Chris O'Donnell stated that while filming for this movie, he felt like he was in a Toys ''R'' Us commercial.
GOLDEN LABEL:  neutral
TOP SENTIMENT:  positive
PREDICTED SENTIMENTS:
positive  -  40.77%
negative  -  37.60%
neutral  -  21.63%
The whole game was a rollercoaster ride, but Los Angeles Lakers ultimately persevered and won!
GOLDEN LABEL:  positive
TOP SENTIMENT:  positive
PREDICTED SENTIMENTS:
positive  -  43.59%
negative  -  43.17%
neutral  -  13.24%
Zendaya slayed in Dune 2, as she does in all her movies.
GOLDEN LABEL:  positive
TOP SENTIMENT:  positive
PREDICTED SENTIMENTS:
positive  -  42.89%
negative  -  37.33%
neutral  -  19.77%
While my favorite player was playing this match and started off strongggg, it went downhill after Messi's injyry midgame.
GOLDEN LABEL:  negative

In [58]:
print(classification_report(golden_labels,predicted_labels))

              precision    recall  f1-score   support

    negative       0.50      0.75      0.60         4
     neutral       0.00      0.00      0.00         3
    positive       0.50      0.67      0.57         3

    accuracy                           0.50        10
   macro avg       0.33      0.47      0.39        10
weighted avg       0.35      0.50      0.41        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Named Entity Recognition

In [59]:
from simpletransformers.ner import NERModel

  torch.utils._pytree._register_pytree_node(


In [60]:
englishmodel = NERModel(
        model_type="bert",
        model_name="dslim/bert-base-NER",
        use_cuda=False
)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
ner_test_dataset = pd.read_csv("./NER-test.tsv", sep='\t')

In [62]:
tokens_ner = ner_test_dataset.get("token")

In [63]:
predictions, raw_output = englishmodel.predict(tokens_ner)

  0%|          | 0/1 [00:00<?, ?it/s]

  torch.utils._pytree._register_pytree_node(


Running Prediction:   0%|          | 0/2 [00:00<?, ?it/s]

In [64]:
predictions

[[{'I': 'O'}],
 [{'would': 'O'}],
 [{"n't": 'O'}],
 [{'be': 'O'}],
 [{'caught': 'O'}],
 [{'dead': 'O'}],
 [{'watching': 'O'}],
 [{'the': 'O'}],
 [{'NFL': 'B-ORG'}],
 [{'if': 'O'}],
 [{'it': 'O'}],
 [{'were': 'O'}],
 [{"n't": 'O'}],
 [{'for': 'O'}],
 [{'Taylor': 'B-PER'}],
 [{'Swift': 'B-PER'}],
 [{'.': 'O'}],
 [{'Chris': 'B-PER'}],
 [{"O'Donnell": 'B-PER'}],
 [{'stated': 'O'}],
 [{'that': 'O'}],
 [{'while': 'O'}],
 [{'filming': 'O'}],
 [{'for': 'O'}],
 [{'this': 'O'}],
 [{'movie': 'O'}],
 [{',': 'O'}],
 [{'he': 'O'}],
 [{'felt': 'O'}],
 [{'like': 'O'}],
 [{'he': 'O'}],
 [{'was': 'O'}],
 [{'in': 'O'}],
 [{'a': 'O'}],
 [{'Toys': 'O'}],
 [{"''": 'O'}],
 [{'R': 'O'}],
 [{"''": 'O'}],
 [{'Us': 'O'}],
 [{'commercial': 'O'}],
 [{'.': 'O'}],
 [{'The': 'O'}],
 [{'whole': 'O'}],
 [{'game': 'O'}],
 [{'was': 'O'}],
 [{'a': 'O'}],
 [{'rollercoaster': 'O'}],
 [{'ride': 'O'}],
 [{',': 'O'}],
 [{'but': 'O'}],
 [{'Los': 'B-ORG'}],
 [{'Angeles': 'B-LOC'}],
 [{'Lakers': 'B-ORG'}],
 [{'ultimately': 'O'}],

## Topic analysis

In [94]:
data = pd.read_csv('./abcnews-date-text.csv');
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

In [95]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bebulcao/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [96]:
def lemmatize_stemming(text):
    return lemmatizer.lemmatize(text)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
           # result.append(token)
            result.append(lemmatize_stemming(token))
    return result

In [97]:
processed_docs = documents['headline_text'].map(preprocess)
### print the first 10 results
processed_docs[:10]

0          [decides, community, broadcasting, licence]
1                         [witness, aware, defamation]
2           [call, infrastructure, protection, summit]
3                          [staff, aust, strike, rise]
4              [strike, affect, australian, traveller]
5               [ambitious, olsson, win, triple, jump]
6          [antic, delighted, record, breaking, barca]
7    [aussie, qualifier, stosur, waste, memphis, ma...
8             [aust, address, security, council, iraq]
9                       [australia, locked, timetable]
Name: headline_text, dtype: object

In [98]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [99]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [100]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [101]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [102]:
# Save model to disk.
temp_file = "./model"
lda_model.save(temp_file)

### Analysing the detected topics

In [103]:
%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=bow_corpus, dictionary=dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

In [104]:
for sentence in sentences:
    print(sentence)
    bow_vector = dictionary.doc2bow(preprocess(sentence))
    for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
        print("Score: {}\t Topic_id {}\t Topic: {}".format(score, index, lda_model.print_topic(index, 5)))
    print()

I wouldn't be caught dead watching the NFL if it weren't for Taylor Swift.
Score: 0.34987887740135193	 Topic_id 9	 Topic: 0.040*"trump" + 0.019*"vaccine" + 0.017*"australia" + 0.015*"test" + 0.014*"open"
Score: 0.18338239192962646	 Topic_id 2	 Topic: 0.039*"sydney" + 0.030*"election" + 0.016*"lockdown" + 0.012*"andrew" + 0.011*"state"
Score: 0.1833508163690567	 Topic_id 1	 Topic: 0.024*"canberra" + 0.023*"restriction" + 0.023*"life" + 0.019*"water" + 0.016*"police"
Score: 0.1833455115556717	 Topic_id 4	 Topic: 0.036*"police" + 0.030*"woman" + 0.027*"court" + 0.024*"death" + 0.023*"donald"
Score: 0.016676638275384903	 Topic_id 6	 Topic: 0.021*"crash" + 0.017*"house" + 0.015*"bushfire" + 0.015*"dy" + 0.014*"adelaide"
Score: 0.016673153266310692	 Topic_id 0	 Topic: 0.050*"covid" + 0.045*"coronavirus" + 0.037*"victoria" + 0.033*"case" + 0.024*"child"
Score: 0.016673153266310692	 Topic_id 3	 Topic: 0.042*"queensland" + 0.025*"south" + 0.017*"north" + 0.016*"australia" + 0.015*"indigenous"
S