## Data

In [1]:
import pandas as pd # data manipulation

In [2]:
# read data: sample IMF data is provided in the workbook
# NOTE: the path is absolute (filesystem root); point it at wherever imf.xlsx is stored
imf_workbook = '/imf.xlsx'
df = pd.read_excel(imf_workbook) # index_col is left at its default (None)
df.head()

Unnamed: 0,Sentence,Sentiment,Author
0,The following information has become available...,0,Western Hemisphere Department
1,Preliminary national accounts data for the thi...,1,
2,This information does not change the thrust of...,0,
3,The second quarter data were revised upward to...,1,
4,"Investment remained strong, rising by 26 perce...",1,


In [3]:
# drop the Author column
# errors = 'ignore' keeps this cell safe to re-run: once 'Author' has been
# removed, a second execution is a no-op instead of raising KeyError
df = df.drop(columns = ['Author'], errors = 'ignore')
df.head()

Unnamed: 0,Sentence,Sentiment
0,The following information has become available...,0
1,Preliminary national accounts data for the thi...,1
2,This information does not change the thrust of...,0
3,The second quarter data were revised upward to...,1
4,"Investment remained strong, rising by 26 perce...",1


In [4]:
# class distribution of the Sentiment labels in the full dataset
df['Sentiment'].value_counts()

 0    144625
 1     86934
-1     57106
Name: Sentiment, dtype: int64

In [5]:
# remove neutral tagged sentences: keep only rows labelled positive (1) or negative (-1)
mask = df['Sentiment'].isin([1, -1])
df = df.loc[mask]
df.head()

Unnamed: 0,Sentence,Sentiment
1,Preliminary national accounts data for the thi...,1
3,The second quarter data were revised upward to...,1
4,"Investment remained strong, rising by 26 perce...",1
6,Preliminary industrial output numbers fiom bot...,-1
7,Consumer prices rose by 0.2 percent in Decembe...,-1


In [6]:
# class distribution after keeping only the 1 / -1 labelled sentences
df['Sentiment'].value_counts()

 1    86934
-1    57106
Name: Sentiment, dtype: int64

In [7]:
# convert -1s to 0s so the labels are binary (0 / 1)
df = df.assign(Sentiment = df['Sentiment'].replace({-1: 0}))
df.head()

Unnamed: 0,Sentence,Sentiment
1,Preliminary national accounts data for the thi...,1
3,The second quarter data were revised upward to...,1
4,"Investment remained strong, rising by 26 perce...",1
6,Preliminary industrial output numbers fiom bot...,0
7,Consumer prices rose by 0.2 percent in Decembe...,0


In [8]:
# class distribution after recoding -1 as 0
df['Sentiment'].value_counts()

1    86934
0    57106
Name: Sentiment, dtype: int64

In [9]:
# downsample data: keep a reproducible random 1% of the rows
# (df is rebound here, so re-running this cell on its own samples the sample again)
sample_fraction = 0.01
df = df.sample(frac = sample_fraction, random_state = 42)
df.head()

Unnamed: 0,Sentence,Sentiment
78784,Inflation was higher than expected in the firs...,0
206368,Such precision might endanger the success for ...,0
123613,"In this regard, I welcome the emphasis being p...",1
241397,They found that the proposed program represent...,1
20911,"First, according to the authorities' projectio...",1


In [10]:
# class distribution of the downsampled data
df['Sentiment'].value_counts()

1    873
0    567
Name: Sentiment, dtype: int64

In [11]:
# number of rows left after downsampling
len(df)

1440

## Preprocess

In [12]:
def cleanData(text):
    """Normalise one sentence: cast to str, trim surrounding whitespace, lowercase.

    text is passed through str() first, so non-string values are accepted.
    Returns the stripped, lower-cased string.
    """
    # further cleaning steps that are not implemented yet:
    # remove stop words # nltk library
    # remove punctuation marks # regex
    # ...
    return str(text).strip().lower()

In [13]:
# sample: the surrounding dashes make the stripped leading/trailing whitespace visible
sample_text = '  It Will CONTAIN Things  '
print(f'-{cleanData(sample_text)}-')

-it will contain things-


In [14]:
# normalise every sentence with cleanData (str cast, strip, lowercase)
df['Sentence'] = df['Sentence'].apply(cleanData)
df.head()

Unnamed: 0,Sentence,Sentiment
78784,inflation was higher than expected in the firs...,0
206368,such precision might endanger the success for ...,0
123613,"in this regard, i welcome the emphasis being p...",1
241397,they found that the proposed program represent...,1
20911,"first, according to the authorities' projectio...",1


In [15]:
# cleaned sentences as a NumPy array (the input texts for the classifiers below)
sentences = df['Sentence'].to_numpy()
sentences

array(['inflation was higher than expected in the first months of 1996.',
       'such precision might endanger the success for implementa- tion of the program.',
       'in this regard, i welcome the emphasis being placed on structural reforms aimed at improving the efficiency and competitiveness of the economy.',
       ...,
       'with the support of unicef mauritania has recently launched a study on social protection that will review the social safety nets currently in place, assess priority needs, and issue recommendations to guide the action of the government and its partners in preparing a national social welfare strategy for mauritania for the 2010-12 period.',
       'fiscal performance has been remarkable under the current program.',
       'the program fosters private sector development through privatization and encouraging domestic and foreign investment with improved governance, transparency, and administrative efficiency.'],
      dtype=object)

In [16]:
# sentiment labels taken from the same frame, so they share the row order of `sentences`
labels = df['Sentiment'].to_numpy()
labels

array([0, 0, 1, ..., 1, 1, 1])

## Sentiment classification

In [17]:
from sklearn.feature_extraction.text import CountVectorizer # for Bag-of-Words
from sklearn.model_selection import train_test_split
import numpy as np

In [18]:
# extract features with Bag-of-Words
# ngram_range = (1,1): unigrams only
# NOTE(review): the vectorizer is fitted on every sentence, i.e. before the
# train/test split performed in a later cell, so the vocabulary also contains
# words that occur only in sentences that end up in the test split.
count_vec = CountVectorizer(ngram_range = (1,1))
count_data = count_vec.fit_transform(sentences)
instances = count_data.toarray() # densify the fit_transform result into a NumPy array

In [19]:
# dense bag-of-words count matrix built from `sentences`
instances

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
# rows = sentences, columns = features (terms) produced by the CountVectorizer
print('(Number of instances, Number of features):', instances.shape)

(Number of instances, Number of features): (1440, 4417)


In [21]:
# split data to train and test instances
# A single call applies the same shuffled indices to all three arrays, so the
# feature rows, labels and raw sentences are guaranteed to stay aligned
# (no reliance on two separate calls agreeing on test_size / random_state).
(X_train, X_test,
 y_train, y_test,
 sentence_train, sentence_test) = train_test_split(
    instances, labels, sentences, test_size = 0.1, random_state = 42)

In [22]:
# report the shape of every array produced by the split
split_arrays = {
    'train instances:': X_train,
    'test instances:': X_test,
    'train labels:': y_train,
    'test labels:': y_test,
    'train sentences:': sentence_train,
    'test sentences:': sentence_test,
}
for caption, array in split_arrays.items():
    print(caption, array.shape)

train instances: (1296, 4417)
test instances: (144, 4417)
train labels: (1296,)
test labels: (144,)
train sentences: (1296,)
test sentences: (144,)


### SVM

In [23]:
from sklearn import svm
from sklearn import metrics

In [24]:
# create a Support Vector Machines classifier
# LinearSVC is a linear-kernel SVM; C = 0.1 sets the regularization parameter
# (smaller C means stronger regularization) and random_state fixes the seed
clf = svm.LinearSVC(random_state = 42, C = 0.1)

In [25]:
# train the model using the training set (bag-of-words rows and their labels)
clf.fit(X_train, y_train)

In [26]:
# predict the classes for the test set
y_pred = clf.predict(X_test)

y_pred # predicted label for each row of X_test

array([0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0])

In [27]:
# calculate accuracy of the model: share of test rows whose predicted label equals the true label
svm_accuracy = metrics.accuracy_score(y_test, y_pred)
svm_accuracy

0.7430555555555556

### Pre-trained BERT (RoBERTa variant)

In [28]:
!pip install transformers
from transformers import pipeline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [29]:
# build a sentiment-analysis pipeline around a pre-trained checkpoint; the recorded
# run downloaded the model files on this call (pytorch_model.bin is about 1.42 GB)
sentiment_classification = pipeline(task = 'sentiment-analysis', model = 'siebert/sentiment-roberta-large-english')

Downloading (…)lve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [30]:
# sample: sanity check on a clearly positive sentence
sample_sentence_1 = 'flowers smell so good!'
sentiment_classification(sample_sentence_1)

[{'label': 'POSITIVE', 'score': 0.9985612034797668}]

In [31]:
# sample: sanity check on a clearly negative sentence
sample_sentence_2 = 'i hate rainy weathers'
sentiment_classification(sample_sentence_2)

[{'label': 'NEGATIVE', 'score': 0.9985837936401367}]

In [32]:
# find BERT labels
# map the pipeline's string labels onto the 0/1 encoding used by y_test
label_of_sentiment = {'POSITIVE': 1, 'NEGATIVE': 0}

# Classify all test sentences with one pipeline call: given a list, the
# pipeline returns one {'label': ..., 'score': ...} dict per input, in input order.
# truncation = True clips any input longer than the model's maximum sequence
# length instead of letting the forward pass fail on it.
label_scores = sentiment_classification(list(sentence_test), truncation = True)

bert_labels = [label_of_sentiment[result['label']] for result in label_scores]
bert_scores = [result['score'] for result in label_scores]

# predictions
print(bert_labels[0:5])
print(bert_scores[0:5])

[0, 1, 1, 0, 1]
[0.9904823303222656, 0.9958587288856506, 0.9889670014381409, 0.9962180256843567, 0.9977511763572693]


In [33]:
# calculate accuracy of the pre-trained model against the same y_test used for the SVM
bert_accuracy = metrics.accuracy_score(y_test, bert_labels)
bert_accuracy

0.75