# Read Data

In [1]:
import pandas as pd

In [2]:
train_text = pd.read_parquet('train.parquet')

In [3]:
print(train_text.shape)
train_text.head()

(1000, 2)


Unnamed: 0,text,label
0,− Scope 3: Optional scope that includes indire...,1
1,The Group is not aware of any noise pollution ...,0
2,Global climate change could exacerbate certain...,0
3,Setting an investment horizon is part and parc...,0
4,Climate change the physical impacts of climate...,0


In [4]:
test_text = pd.read_parquet('test.parquet')

In [5]:
print(test_text.shape)
test_text.head()

(320, 2)


Unnamed: 0,text,label
0,Sustainable strategy ‘red lines’ For our susta...,0
1,"Verizon’s environmental, health and safety man...",1
2,"In 2019, the Company closed a series of transa...",1
3,"In December 2020, the AUC approved the Electri...",0
4,"Finally, there is a reputational risk linked t...",0


In [6]:
test_text['label'].value_counts()

label
1    163
0    106
2     51
Name: count, dtype: int64

# Preprocessing

In [7]:
import nltk
# nltk.download('punkt')
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string

In [8]:
def get_wordnet_pos(tag):
    """Translate a POS tag into the one-letter POS code passed to the lemmatizer.

    Only the first character of ``tag`` is inspected:
    ``J`` -> ``'a'``, ``V`` -> ``'v'``, ``N`` -> ``'n'``, ``R`` -> ``'r'``.
    Every other tag (including the empty string) falls back to ``'n'``.
    """
    first_letter_to_code = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    # tag[:1] is '' for an empty tag, which is not a key, so the default applies.
    return first_letter_to_code.get(tag[:1], 'n')

In [9]:
def preprocessing(df):
    """Tokenise, stop-word filter and lemmatise the ``text`` column of ``df``.

    Each text is lower-cased, the typographic apostrophe (’) is replaced by a
    plain one, and the result is tokenised with ``nltk.word_tokenize``. Tokens
    found in ``ENGLISH_STOP_WORDS`` are dropped; the remaining tokens are
    lemmatised using the POS tag assigned by ``pos_tag``.

    Parameters
    ----------
    df : DataFrame with a ``text`` column of strings. It is not modified.

    Returns
    -------
    A copy of ``df`` with two extra columns, each holding one token list per row:
    ``processed`` (lemmas, punctuation tokens kept) and
    ``processed_wo_punct`` (the same lemmas with punctuation tokens removed).
    """
    df_new = df.copy()

    # Sets give constant-time membership tests; these lookups run once per token.
    stop_words = set(ENGLISH_STOP_WORDS)
    # ASCII punctuation plus the extra symbols and tokenizer quote tokens
    # that the tokenised text can contain.
    punctuations = set(string.punctuation) | {'−', '—', '‘', '``', "''", '""', '•'}

    lemmatizer = WordNetLemmatizer()
    processed = []
    processed_wo_punct = []
    for text in df['text']:
        tokens = nltk.word_tokenize(text.lower().replace('’', "'"))

        # Tag the full token sequence first, then filter stop words, so the
        # tagger sees the complete sentence context.
        lemmas = [
            lemmatizer.lemmatize(word, pos = get_wordnet_pos(tag))
            for word, tag in pos_tag(tokens)
            if word not in stop_words
        ]
        processed.append(lemmas)
        processed_wo_punct.append([word for word in lemmas if word not in punctuations])

    df_new['processed'] = processed
    df_new['processed_wo_punct'] = processed_wo_punct

    return df_new

In [10]:
train_text_processed = preprocessing(train_text)

In [11]:
# NOTE(review): this prints the shape of the raw train_text frame, while the
# preview on the next line is of train_text_processed — confirm which was intended.
print(train_text.shape)
train_text_processed.head()

(1000, 2)


Unnamed: 0,text,label,processed,processed_wo_punct
0,− Scope 3: Optional scope that includes indire...,1,"[−, scope, 3, :, optional, scope, include, ind...","[scope, 3, optional, scope, include, indirect,..."
1,The Group is not aware of any noise pollution ...,0,"[group, aware, noise, pollution, negatively, i...","[group, aware, noise, pollution, negatively, i..."
2,Global climate change could exacerbate certain...,0,"[global, climate, change, exacerbate, certain,...","[global, climate, change, exacerbate, certain,..."
3,Setting an investment horizon is part and parc...,0,"[set, investment, horizon, parcel, policy, foc...","[set, investment, horizon, parcel, policy, foc..."
4,Climate change the physical impacts of climate...,0,"[climate, change, physical, impact, climate, c...","[climate, change, physical, impact, climate, c..."


In [12]:
test_text_processed = preprocessing(test_text)

In [13]:
# NOTE(review): this prints the shape of the raw test_text frame, while the
# preview on the next line is of test_text_processed — confirm which was intended.
print(test_text.shape)
test_text_processed.head()

(320, 2)


Unnamed: 0,text,label,processed,processed_wo_punct
0,Sustainable strategy ‘red lines’ For our susta...,0,"[sustainable, strategy, ‘, red, line, ', susta...","[sustainable, strategy, red, line, sustainable..."
1,"Verizon’s environmental, health and safety man...",1,"[verizon, 's, environmental, ,, health, safety...","[verizon, 's, environmental, health, safety, m..."
2,"In 2019, the Company closed a series of transa...",1,"[2019, ,, company, close, series, transaction,...","[2019, company, close, series, transaction, re..."
3,"In December 2020, the AUC approved the Electri...",0,"[december, 2020, ,, auc, approve, electricity,...","[december, 2020, auc, approve, electricity, di..."
4,"Finally, there is a reputational risk linked t...",0,"[finally, ,, reputational, risk, link, possibi...","[finally, reputational, risk, link, possibilit..."


# Feature Extraction

## Bag of Words

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [15]:
count_vect = CountVectorizer()

In [16]:
def bag_of_words_training(df):
        
    bow_train_counts = count_vect.fit_transform([' '.join(text) for text in df['processed_wo_punct']])
    transformer_train = TfidfTransformer().fit(bow_train_counts)
    bow_train_tfidf = transformer_train.transform(bow_train_counts)
    bagofwords = pd.DataFrame.sparse.from_spmatrix(
        bow_train_tfidf, columns = count_vect.get_feature_names_out()
    )
    
    bagofwords = bagofwords.add_prefix('bow_')
    
    return bagofwords

In [17]:
train_bow = bag_of_words_training(train_text_processed)
train_bow.shape

(1000, 5145)

In [18]:
def bag_of_words_testing(df):
        
    bow_test_counts = count_vect.transform([' '.join(text) for text in df['processed_wo_punct']])
    transformer_test = TfidfTransformer().fit(bow_test_counts)
    bow_test_tfidf = transformer_test.transform(bow_test_counts)
    bagofwords = pd.DataFrame.sparse.from_spmatrix(
        bow_test_tfidf, columns = count_vect.get_feature_names_out()
    )
    
    bagofwords = bagofwords.add_prefix('bow_')
    
    return bagofwords

In [19]:
test_bow = bag_of_words_testing(test_text_processed)
test_bow.shape

(320, 5145)

## Word2Vec (Doc2Vec)

In [20]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [21]:
# Train Doc2Vec on the training documents only; each token list is tagged with its row position.
# NOTE(review): workers = 4 trains on several threads — gensim documents multi-threaded
# training as not exactly repeatable between runs; confirm whether that matters here.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_text_processed['processed_wo_punct'])]
model = Doc2Vec(documents, vector_size = 100, window = 2, min_count = 1, workers = 4)

In [22]:
# Infer one document vector per training row and lay them out as columns w2v_0 .. w2v_{size-1}.
train_vectors = [
    model.infer_vector(tokens)
    for tokens in train_text_processed['processed_wo_punct']
]
train_d2v = pd.DataFrame(
    train_vectors,
    columns = [f'w2v_{i}' for i in range(model.vector_size)],
)

In [23]:
train_d2v.shape

(1000, 100)

In [24]:
# Infer one document vector per test row with the same Doc2Vec model, using the same w2v_* column names.
test_vectors = [
    model.infer_vector(tokens)
    for tokens in test_text_processed['processed_wo_punct']
]
test_d2v = pd.DataFrame(
    test_vectors,
    columns = [f'w2v_{i}' for i in range(model.vector_size)],
)

In [25]:
test_d2v.shape

(320, 100)

## Sentiment

In [26]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [27]:
sid = SentimentIntensityAnalyzer()

In [28]:
train_sentiment = pd.DataFrame()

In [29]:
# VADER scores for every raw training text.
# polarity_scores() returns all four scores in one dict, so it is called once
# per text (the previous version re-scored each text four times).
train_sentiment = pd.DataFrame(
    [sid.polarity_scores(text) for text in train_text_processed['text']],
    # Explicit column list fixes the column order and keeps the columns
    # present even when there are no rows.
    columns = ['neg', 'neu', 'pos', 'compound'],
)

In [30]:
train_sentiment.shape

(1000, 4)

In [31]:
test_sentiment = pd.DataFrame()

In [32]:
# VADER scores for every raw test text.
# polarity_scores() returns all four scores in one dict, so it is called once
# per text (the previous version re-scored each text four times).
test_sentiment = pd.DataFrame(
    [sid.polarity_scores(text) for text in test_text_processed['text']],
    # Explicit column list fixes the column order and keeps the columns
    # present even when there are no rows.
    columns = ['neg', 'neu', 'pos', 'compound'],
)

In [33]:
test_sentiment.shape

(320, 4)

## Concat Feature

In [34]:
train_feature = pd.concat([train_bow, train_d2v, train_sentiment], axis = 1)

In [35]:
train_feature.shape

(1000, 5249)

In [36]:
train_feature_vector = train_feature.to_numpy()

In [37]:
test_feature = pd.concat([test_bow, test_d2v, test_sentiment], axis = 1)

In [38]:
test_feature.shape

(320, 5249)

In [39]:
test_feature_vector = test_feature.to_numpy()

# Model Training & Testing

In [40]:
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import KFold
from math import sqrt

In [41]:
kfold = KFold(n_splits = 10)

## Dummy

In [42]:
from sklearn.dummy import DummyClassifier

In [43]:
dummy_clf = DummyClassifier(strategy = "most_frequent").fit(train_feature_vector, train_text['label'])

### Cross-Validation (Train Data)

In [44]:
# 10-fold cross-validation of the majority-class baseline on the training data.
# Every feature family is rebuilt inside each fold from that fold's training split.
# Intermediate objects use fold_* names so that the notebook-level `model`,
# `documents`, `train_bow`, `train_d2v`, `train_sentiment`, `test_bow`, `test_d2v`
# and `test_sentiment` built in earlier cells are not overwritten by fold data.
dummy_scores = []

for train_index, val_index in kfold.split(train_text_processed):

    cross_train = train_text_processed.iloc[train_index].reset_index(drop = True)
    cross_val = train_text_processed.iloc[val_index].reset_index(drop = True)

    # Bag of words: training call first (it refits the shared vectorizer), then validation.
    fold_train_bow = bag_of_words_training(cross_train)
    fold_val_bow = bag_of_words_testing(cross_val)

    # Doc2Vec trained on the fold's training documents only.
    fold_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(cross_train['processed_wo_punct'])]
    fold_d2v_model = Doc2Vec(fold_documents, vector_size = 100, window = 2, min_count = 1, workers = 4)
    fold_d2v_columns = [f'w2v_{i}' for i in range(fold_d2v_model.vector_size)]
    fold_train_d2v = pd.DataFrame(
        [fold_d2v_model.infer_vector(tokens) for tokens in cross_train['processed_wo_punct']],
        columns = fold_d2v_columns,
    )
    fold_val_d2v = pd.DataFrame(
        [fold_d2v_model.infer_vector(tokens) for tokens in cross_val['processed_wo_punct']],
        columns = fold_d2v_columns,
    )

    # VADER sentiment: one polarity_scores() call per text yields all four scores.
    fold_sentiment_columns = ['neg', 'neu', 'pos', 'compound']
    fold_train_sentiment = pd.DataFrame(
        [sid.polarity_scores(text) for text in cross_train['text']],
        columns = fold_sentiment_columns,
    )
    fold_val_sentiment = pd.DataFrame(
        [sid.polarity_scores(text) for text in cross_val['text']],
        columns = fold_sentiment_columns,
    )

    # Same column order as the full-data feature matrix: bag of words, Doc2Vec, sentiment.
    features_train_vector = pd.concat([fold_train_bow, fold_train_d2v, fold_train_sentiment], axis = 1).to_numpy()
    features_test_vector = pd.concat([fold_val_bow, fold_val_d2v, fold_val_sentiment], axis = 1).to_numpy()

    fold_clf = DummyClassifier(strategy = "most_frequent").fit(features_train_vector, cross_train['label'])
    y_pred = fold_clf.predict(features_test_vector)

    dummy_scores.append(accuracy_score(cross_val['label'], y_pred))

In [45]:
np.mean(dummy_scores)

0.40800000000000003

In [46]:
# Half-width of a normal-approximation 95% interval (z = 1.96) for a proportion:
# p is the mean cross-validation accuracy and n is the number of training rows.
1.96 * sqrt((np.mean(dummy_scores) * (1 - np.mean(dummy_scores))) / len(train_text))

0.0304611998713117

### Prediction

In [47]:
y_pred_dummy = dummy_clf.predict(test_feature_vector)

In [48]:
accuracy_dummy = dummy_clf.score(test_feature_vector, test_text['label'])
confusion_dummy = confusion_matrix(test_text['label'], y_pred_dummy)
classification_dummy = classification_report(test_text['label'], y_pred_dummy)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
accuracy_dummy

0.509375

In [50]:
accuracy_dummy_interval = 1.96 * sqrt((accuracy_dummy * (1 - accuracy_dummy)) / len(test_text))

In [51]:
accuracy_dummy_interval

0.054774034661022326

In [52]:
print(confusion_dummy)

[[  0 106   0]
 [  0 163   0]
 [  0  51   0]]


In [53]:
print(classification_dummy)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       106
           1       0.51      1.00      0.67       163
           2       0.00      0.00      0.00        51

    accuracy                           0.51       320
   macro avg       0.17      0.33      0.22       320
weighted avg       0.26      0.51      0.34       320



In [54]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.17, the rounded macro-avg precision typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.17 * (1 - 0.17)) / len(test_text))

0.04115708322998606

In [55]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.33, the rounded macro-avg recall typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.33 * (1 - 0.33)) / len(test_text))

0.05151995244563022

In [56]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.22, the rounded macro-avg f1-score typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.22 * (1 - 0.22)) / len(test_text))

0.04538786181348489

## Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
LR_clf = LogisticRegression().fit(train_feature_vector, train_text['label'])

### Cross-Validation (Train Data)

In [59]:
# 10-fold cross-validation of logistic regression on the training data.
# Every feature family is rebuilt inside each fold from that fold's training split.
# Intermediate objects use fold_* names so that the notebook-level `model`,
# `documents`, `train_bow`, `train_d2v`, `train_sentiment`, `test_bow`, `test_d2v`
# and `test_sentiment` built in earlier cells are not overwritten by fold data.
LR_scores = []

for train_index, val_index in kfold.split(train_text_processed):

    cross_train = train_text_processed.iloc[train_index].reset_index(drop = True)
    cross_val = train_text_processed.iloc[val_index].reset_index(drop = True)

    # Bag of words: training call first (it refits the shared vectorizer), then validation.
    fold_train_bow = bag_of_words_training(cross_train)
    fold_val_bow = bag_of_words_testing(cross_val)

    # Doc2Vec trained on the fold's training documents only.
    fold_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(cross_train['processed_wo_punct'])]
    fold_d2v_model = Doc2Vec(fold_documents, vector_size = 100, window = 2, min_count = 1, workers = 4)
    fold_d2v_columns = [f'w2v_{i}' for i in range(fold_d2v_model.vector_size)]
    fold_train_d2v = pd.DataFrame(
        [fold_d2v_model.infer_vector(tokens) for tokens in cross_train['processed_wo_punct']],
        columns = fold_d2v_columns,
    )
    fold_val_d2v = pd.DataFrame(
        [fold_d2v_model.infer_vector(tokens) for tokens in cross_val['processed_wo_punct']],
        columns = fold_d2v_columns,
    )

    # VADER sentiment: one polarity_scores() call per text yields all four scores.
    fold_sentiment_columns = ['neg', 'neu', 'pos', 'compound']
    fold_train_sentiment = pd.DataFrame(
        [sid.polarity_scores(text) for text in cross_train['text']],
        columns = fold_sentiment_columns,
    )
    fold_val_sentiment = pd.DataFrame(
        [sid.polarity_scores(text) for text in cross_val['text']],
        columns = fold_sentiment_columns,
    )

    # Same column order as the full-data feature matrix: bag of words, Doc2Vec, sentiment.
    features_train_vector = pd.concat([fold_train_bow, fold_train_d2v, fold_train_sentiment], axis = 1).to_numpy()
    features_test_vector = pd.concat([fold_val_bow, fold_val_d2v, fold_val_sentiment], axis = 1).to_numpy()

    fold_clf = LogisticRegression().fit(features_train_vector, cross_train['label'])
    y_pred = fold_clf.predict(features_test_vector)

    LR_scores.append(accuracy_score(cross_val['label'], y_pred))

In [60]:
np.mean(LR_scores)

0.727

In [61]:
# Half-width of a normal-approximation 95% interval (z = 1.96) for a proportion:
# p is the mean cross-validation accuracy and n is the number of training rows.
1.96 * sqrt((np.mean(LR_scores) * (1 - np.mean(LR_scores))) / len(train_text))

0.02761242824526666

### Prediction

In [62]:
y_pred_LR = LR_clf.predict(test_feature_vector)

In [63]:
accuracy_LR = LR_clf.score(test_feature_vector, test_text['label'])
confusion_LR = confusion_matrix(test_text['label'], y_pred_LR)
classification_LR = classification_report(test_text['label'], y_pred_LR)

In [64]:
accuracy_LR

0.759375

In [65]:
accuracy_LR_interval = 1.96 * sqrt((accuracy_LR * (1 - accuracy_LR)) / len(test_text))

In [66]:
accuracy_LR_interval

0.046835979071082466

In [67]:
print(confusion_LR)

[[ 88  14   4]
 [ 21 124  18]
 [  2  18  31]]


In [68]:
print(classification_LR)

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       106
           1       0.79      0.76      0.78       163
           2       0.58      0.61      0.60        51

    accuracy                           0.76       320
   macro avg       0.72      0.73      0.73       320
weighted avg       0.76      0.76      0.76       320



In [85]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.72, the rounded macro-avg precision typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.72 * (1 - 0.72)) / len(test_text))

0.04919560956020364

In [86]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.73; the report above shows 0.73 for both macro-avg recall and macro-avg f1-score.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.73 * (1 - 0.73)) / len(test_text))

0.048643452796856425

## Decision Tree

In [70]:
from sklearn.tree import DecisionTreeClassifier

In [71]:
DT_clf = DecisionTreeClassifier().fit(train_feature_vector, train_text['label'])

### Cross-Validation (Train Data)

In [72]:
# 10-fold cross-validation of the decision tree on the training data.
# Every feature family is rebuilt inside each fold from that fold's training split.
# Intermediate objects use fold_* names so that the notebook-level `model`,
# `documents`, `train_bow`, `train_d2v`, `train_sentiment`, `test_bow`, `test_d2v`
# and `test_sentiment` built in earlier cells are not overwritten by fold data.
DT_scores = []

for train_index, val_index in kfold.split(train_text_processed):

    cross_train = train_text_processed.iloc[train_index].reset_index(drop = True)
    cross_val = train_text_processed.iloc[val_index].reset_index(drop = True)

    # Bag of words: training call first (it refits the shared vectorizer), then validation.
    fold_train_bow = bag_of_words_training(cross_train)
    fold_val_bow = bag_of_words_testing(cross_val)

    # Doc2Vec trained on the fold's training documents only.
    fold_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(cross_train['processed_wo_punct'])]
    fold_d2v_model = Doc2Vec(fold_documents, vector_size = 100, window = 2, min_count = 1, workers = 4)
    fold_d2v_columns = [f'w2v_{i}' for i in range(fold_d2v_model.vector_size)]
    fold_train_d2v = pd.DataFrame(
        [fold_d2v_model.infer_vector(tokens) for tokens in cross_train['processed_wo_punct']],
        columns = fold_d2v_columns,
    )
    fold_val_d2v = pd.DataFrame(
        [fold_d2v_model.infer_vector(tokens) for tokens in cross_val['processed_wo_punct']],
        columns = fold_d2v_columns,
    )

    # VADER sentiment: one polarity_scores() call per text yields all four scores.
    fold_sentiment_columns = ['neg', 'neu', 'pos', 'compound']
    fold_train_sentiment = pd.DataFrame(
        [sid.polarity_scores(text) for text in cross_train['text']],
        columns = fold_sentiment_columns,
    )
    fold_val_sentiment = pd.DataFrame(
        [sid.polarity_scores(text) for text in cross_val['text']],
        columns = fold_sentiment_columns,
    )

    # Same column order as the full-data feature matrix: bag of words, Doc2Vec, sentiment.
    features_train_vector = pd.concat([fold_train_bow, fold_train_d2v, fold_train_sentiment], axis = 1).to_numpy()
    features_test_vector = pd.concat([fold_val_bow, fold_val_d2v, fold_val_sentiment], axis = 1).to_numpy()

    fold_clf = DecisionTreeClassifier().fit(features_train_vector, cross_train['label'])
    y_pred = fold_clf.predict(features_test_vector)

    DT_scores.append(accuracy_score(cross_val['label'], y_pred))

In [73]:
np.mean(DT_scores)

0.663

In [74]:
# Half-width of a normal-approximation 95% interval (z = 1.96) for a proportion:
# p is the mean cross-validation accuracy and n is the number of training rows.
1.96 * sqrt((np.mean(DT_scores) * (1 - np.mean(DT_scores))) / len(train_text))

0.029297312668570812

### Prediction

In [75]:
y_pred_DT = DT_clf.predict(test_feature_vector)

In [76]:
accuracy_DT = DT_clf.score(test_feature_vector, test_text['label'])
confusion_DT = confusion_matrix(test_text['label'], y_pred_DT)
classification_DT = classification_report(test_text['label'], y_pred_DT)

In [77]:
accuracy_DT

0.615625

In [78]:
accuracy_DT_interval = 1.96 * sqrt((accuracy_DT * (1 - accuracy_DT)) / len(test_text))

In [79]:
accuracy_DT_interval

0.05329871918533198

In [80]:
print(confusion_DT)

[[77 24  5]
 [31 98 34]
 [ 2 27 22]]


In [81]:
print(classification_DT)

              precision    recall  f1-score   support

           0       0.70      0.73      0.71       106
           1       0.66      0.60      0.63       163
           2       0.36      0.43      0.39        51

    accuracy                           0.62       320
   macro avg       0.57      0.59      0.58       320
weighted avg       0.62      0.62      0.62       320



In [87]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.59, the rounded macro-avg recall typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.59 * (1 - 0.59)) / len(test_text))

0.053888862485675086

In [82]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.58, the rounded macro-avg f1-score typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.58 * (1 - 0.58)) / len(test_text))

0.054077888272379866

In [83]:
# 1.96 * sqrt(p * (1 - p) / n) with p = 0.57, the rounded macro-avg precision typed in from the report above.
# NOTE(review): a macro-averaged metric is not a single proportion over n samples,
# so this half-width is presumably only a rough guide — confirm it is intended.
1.96 * sqrt((0.57 * (1 - 0.57)) / len(test_text))

0.05424412871454384

In [91]:
from scipy.stats import ttest_rel
from scipy.stats import ttest_1samp

In [92]:
# One-sample t-test of the per-fold cross-validation accuracies in DT_scores
# against the single held-out test accuracy accuracy_DT; the p-value is displayed.
t_DT, p_DT = ttest_1samp(DT_scores, accuracy_DT)
p_DT

0.11938379764249754