In [1]:
import nltk
from nltk.stem import PorterStemmer

import warnings
# Installs a blanket "ignore" filter: every Python warning raised after this
# point in the kernel session is silenced.
# NOTE(review): this also hides warnings that may be meaningful (deprecations,
# numerical issues); consider narrowing the filter by category or module.
warnings.filterwarnings('ignore')

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score

import imblearn.under_sampling
import imblearn.over_sampling
import imblearn.pipeline

from IPython.display import display

In [2]:
import sys
# Put the parent directory on the module search path. The entry is relative,
# so it is resolved against the kernel's current working directory.
# Presumably this is what makes the `citation_sentiment_analysis` package
# importable below when the notebook runs from its own folder - confirm.
sys.path.append('../')

from citation_sentiment_analysis.datasets.athar import (
    download_and_read_athar_txt_with_sentiment_label,
    filter_long_sentences_from_athar
)
from citation_sentiment_analysis.preprocessing.token_filter import (
    get_default_words_to_include,
    keep_sentence_list_tokens_in
)
from citation_sentiment_analysis.utils.plot import configure_default_plot_style
from citation_sentiment_analysis.utils.vectorizer import transform_to_counts

In [3]:
# Apply the project's default plot styling (helper imported from
# citation_sentiment_analysis.utils.plot); called once, before any figures.
configure_default_plot_style()

In [4]:
# Fetch the NLTK resources this notebook relies on (no re-download when the
# package is already up to date, as the log below shows).
# 'punkt' provides the tokenizer models behind nltk.word_tokenize, used below.
nltk.download('punkt')
# NOTE(review): 'stopwords' is not referenced directly in the visible cells;
# presumably a project helper needs it - confirm.
# As the last expression, this call's return value is the cell's output.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Read the sentiment-labelled Athar citation data (downloading it if needed)
# and run it through the project's long-sentence filter. The exact filtering
# criteria live in citation_sentiment_analysis.datasets.athar.
athar_df = filter_long_sentences_from_athar(
    download_and_read_athar_txt_with_sentiment_label()
)

# Preview the first five rows.
athar_df.head(5)

Unnamed: 0,source_paper_id,target_paper_id,sentiment,citation_text,sentiment_label
0,A00-1043,A00-2024,o,We analyzed a set of articles and identified s...,neutral
1,H05-1033,A00-2024,o,Table 3: Example compressions Compression AvgL...,neutral
2,I05-2009,A00-2024,o,5.3 Related works and discussion Our two-step ...,neutral
3,I05-2009,A00-2024,o,(1999) proposed a summarization system based o...,neutral
4,I05-2009,A00-2024,o,We found that the deletion of lead parts did n...,neutral


In [6]:
# Vocabulary of words to keep, supplied by the project's token-filter module;
# it is passed to keep_sentence_list_tokens_in further down.
words_to_include = get_default_words_to_include()

# Display how many entries the vocabulary holds.
len(words_to_include)

109442

In [7]:
# Take the raw citation sentences and split each one into word tokens.
citation_texts = athar_df['citation_text']
citation_tokens = list(map(nltk.word_tokenize, citation_texts))

# Corpus summary: total number of tokens and number of distinct tokens.
print('total tokens: %d, unique: %d' % (
    sum(map(len, citation_tokens)),
    len(set().union(*citation_tokens)),
))

# Sanity check: first 20 tokens of the first sentence.
print(citation_tokens[0][:20])

total tokens: 384256, unique: 19323
['We', 'analyzed', 'a', 'set', 'of', 'articles', 'and', 'identified', 'six', 'major', 'operations', 'that', 'can', 'be', 'used', 'for', 'editing', 'the', 'extracted', 'sentences']


In [8]:
# Filter every tokenised sentence against the `words_to_include` vocabulary
# using the project's token-filter helper.
citation_filtered_tokens = keep_sentence_list_tokens_in(
    citation_tokens,
    words_to_include,
)

# Same corpus summary as before filtering, to show how much was removed.
print('total tokens: %d, unique: %d' % (
    sum(map(len, citation_filtered_tokens)),
    len(set().union(*citation_filtered_tokens)),
))

# Sanity check: first 20 surviving tokens of the first sentence.
print(citation_filtered_tokens[0][:20])

total tokens: 169508, unique: 6205
['analyzed', 'set', 'articles', 'and', 'identified', 'six', 'major', 'operations', 'that', 'can', 'used', 'for', 'editing', 'the', 'extracted', 'sentences', 'including', 'removing', 'extraneous', 'phrases']


In [9]:
# Reduce every filtered token to its Porter stem, sentence by sentence.
ps = PorterStemmer()
citation_stemmed_tokens = [
    list(map(ps.stem, sentence_tokens))
    for sentence_tokens in citation_filtered_tokens
]

# Sanity check: first 20 stems of the first sentence.
print(citation_stemmed_tokens[0][:20])

['analyz', 'set', 'articl', 'and', 'identifi', 'six', 'major', 'oper', 'that', 'can', 'use', 'for', 'edit', 'the', 'extract', 'sentenc', 'includ', 'remov', 'extran', 'phrase']


In [10]:
# Feature matrix: the project's count vectoriser applied to the stemmed tokens.
X_all = transform_to_counts(citation_stemmed_tokens)

# Binary target: True where the sentiment code is 'n'
# (presumably the negative class - confirm against the dataset docs).
y_all = athar_df['sentiment'].eq('n')

# Show both shapes, one per line, to confirm the row counts line up.
print(X_all.shape, y_all.shape, sep='\n')

(8699, 3617)
(8699,)


In [11]:
# Baseline: Bernoulli naive Bayes scored by ROC AUC with 4-fold
# cross-validation over the full data set.
scores = cross_val_score(
    estimator=BernoulliNB(),
    X=X_all,
    y=y_all,
    scoring='roc_auc',
    cv=4,
)

# Report the mean and spread of the per-fold scores.
print('roc_auc mean: %s, std: %s' % (scores.mean(), scores.std()))

roc_auc mean: 0.770593509736653, std: 0.0432709152384011
