# Set-up and loading the data

In [None]:
import os

from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer, util, models
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Absolute, machine-specific folder that holds the structured data files;
# it is joined with file names when loading the report CSV below.
# NOTE(review): adjust this path when running on another machine.
path_data = 'C:\\Users\\tnguyen10\\OneDrive - Deloitte (O365D)\\Documents\\GitHub\\Thesis\\data_structured'

In [None]:
# Hugging Face pipeline built from the SST-2 fine-tuned DistilBERT checkpoint;
# it is used further down to produce a sentiment label and score per sentence.
# No task name is passed, so the task is presumably inferred from the
# checkpoint -- TODO confirm for the installed transformers version.
pipe_sentiment = pipeline(model = 'distilbert-base-uncased-finetuned-sst-2-english')
# Sentence-embedding model used to encode the report sentences.
sent_embedder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

In [None]:
# Load the article dataset from 'art.pkl' in the current working directory.
# Later cells read its 'embeddings' column.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_article = pd.read_pickle('art.pkl')

The article dataset already has most of the pre-processing we want applied in its pickled state, so we apply the same changes to the report dataset.

In [None]:
# Load the sentence-level report data from the structured-data folder.
df_report = pd.read_csv(os.path.join(path_data, 'report_sentences.csv'))

# Keep only sentences with more than 5 and fewer than 100 words.
# The explicit .copy() makes df_report an independent frame, so the
# 'embeddings' column assigned to it later does not write into a slice of the
# CSV frame (which is what triggers pandas' SettingWithCopyWarning).
df_report = df_report[
    (df_report['word count'] > 5) & (df_report['word count'] < 100)
].copy()

# Rename 'fname' to 'company' (presumably to match the article data -- confirm).
df_report = df_report.rename(columns={'fname': 'company'})

In [None]:
# Encode every report sentence with the sentence-embedding model.
report_sent = df_report['sentence'].tolist()
report_embeddings = sent_embedder.encode(report_sent)
# Store one embedding per row; list(...) splits the encoder output into
# per-sentence items so each cell of the column holds a single vector.
# assumes encode() returns one vector per input sentence, in order -- TODO confirm
df_report['embeddings'] = list(report_embeddings)

In [None]:
# Save the report frame, including the 'embeddings' column, to 'rep.pkl'
# in the current working directory.
df_report.to_pickle('rep.pkl')

# Analyzing the data

## K-Means clustering

We apply k-means clustering to the embeddings to identify the sustainability topic clusters within the text. Following Boelders' approach, we first align (overlap) the two sets of embeddings to remove the inherent difference in language between the reports and the articles (*not sure if this is necessary - maybe I can use those metrics to see whether it improves the clusters or not*).

### Overlapping the data

We first calculate the centroid of each set of embeddings, as Boelders did, and take the difference between the two (report centroid minus article centroid). This difference is then added to one set of embeddings (the article ones), which moves the article centroid onto the report centroid, removing the inherent difference and creating new embeddings.

In [None]:
# Centroid of the article embeddings.
# The 'embeddings' column stores one vector per row, so the column itself is
# an object-dtype array. Stacking the rows into a single 2-D numeric matrix
# before averaging avoids relying on object-wise addition (which would
# concatenate instead of add if the cells were Python lists) and matches how
# the report centroid is computed from a 2-D array.
# assumes every cell holds a vector of the same length -- TODO confirm against art.pkl
article_centroid = np.stack(df_article['embeddings'].tolist()).mean(axis=0)
article_centroid

In [None]:
# Centroid of the report embeddings: mean over axis 0 of the encoder output.
# assumes report_embeddings is a 2-D (n_sentences, dim) array -- TODO confirm
report_centroid = np.mean(report_embeddings, axis = 0)
# Bare expression so the notebook displays the value.
report_centroid

In [None]:
# Offset between the two centroids (report minus article). Adding it to an
# article vector shifts the article data towards the report centroid.
dif = report_centroid - article_centroid


def difference(org_vec):
    """Return ``org_vec`` shifted by the notebook-level offset ``dif``."""
    shifted = org_vec + dif
    return shifted

In [None]:
# Work on copies so the columns added below do not end up on the
# originally loaded frames.
df_art = df_article.copy()
df_rep = df_report.copy()

In [None]:
# Only the article vectors are shifted by the centroid offset; the report
# vectors are carried over unchanged, so both frames expose the same
# 'new_embeddings' column.
df_art['new_embeddings'] = df_art['embeddings'].apply(difference)
df_rep['new_embeddings'] = df_rep['embeddings']

### Running the algorithm and plotting a word cloud

In [None]:
# Combine the article and report frames row-wise in preparation for the clustering.
# NOTE(review): concat keeps each frame's own row labels (ignore_index is not
# set), so index values may repeat in df_comb -- confirm that label-based
# lookups later on do not depend on a unique index.
df_comb = pd.concat([df_art, df_rep])

In [None]:
# Pickle the combined dataset.
# NOTE(review): this path is relative to the current working directory (unlike
# the absolute path_data above) -- confirm a 'data_structured' folder exists there.
df_comb.to_pickle('data_structured/comb.pkl')

In [None]:
def plot_clusters(df, min_k, max_k):
    """Plot k-means inertia against the number of clusters.

    One KMeans model is fitted on the vectors in ``df['new_embeddings']`` for
    every k in ``range(min_k, max_k)``; ``max_k`` itself is therefore not
    fitted. The inertia of each fit is drawn as a line plot.

    Parameters
    ----------
    df : DataFrame with a 'new_embeddings' column (one vector per row).
    min_k : first number of clusters to try.
    max_k : exclusive upper bound on the number of clusters.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # Turn the column of per-row vectors into a single array for scikit-learn.
    points = np.array(df['new_embeddings'].tolist())
    k_values = range(min_k, max_k)

    # Fixed random_state keeps the fits reproducible between runs.
    inertias = [
        KMeans(n_clusters=k, init='k-means++', random_state=1542, n_init=10)
        .fit(points)
        .inertia_
        for k in k_values
    ]

    plt.plot(k_values, inertias, '-o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Scree plot')
    return plt.show()


In [None]:
def add_clusters(df, k):
    """Fit k-means with ``k`` clusters and store the labels on ``df``.

    The vectors in ``df['new_embeddings']`` are clustered and the resulting
    label of each row is written to a 'clusters' column. ``df`` is modified
    in place and also returned.
    """
    points = np.array(df['new_embeddings'].tolist())
    # Same settings (including the seed) as used for the scree plot.
    model = KMeans(n_clusters=k, init='k-means++', random_state=1542, n_init=10)
    df['clusters'] = model.fit(points).labels_
    return df

In [None]:
def plot_wordcloud(df, cluster):
    """Show a word cloud built from the sentences of one cluster.

    Parameters
    ----------
    df : DataFrame with a 'clusters' column and a 'sentence' column.
    cluster : cluster label whose sentences are used.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    # Lower-case the sentences of the requested cluster and merge them into
    # one text; str.join consumes the Series directly.
    sentences = df.loc[df['clusters'] == cluster, 'sentence'].str.lower()
    cluster_text = ' '.join(sentences)

    word_cloud = WordCloud(
        collocation_threshold=2,
        width=1000,
        height=500,
        background_color='white',
    ).generate(cluster_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(word_cloud)
    plt.axis('off')
    # show must actually be called; returning the bare function object leaves
    # the figure un-rendered outside inline notebook backends.
    return plt.show()

In [None]:
# Scree plot for k = 2 .. 19 (the upper bound 20 is excluded by range()).
plot_clusters(df_comb, 2, 20)

In [None]:
# Cluster with k = 10 and store each row's label in the 'clusters' column.
# NOTE(review): k is presumably read off the scree plot above -- confirm.
df_comb = add_clusters(df_comb, 10)

In [None]:
# Word cloud of the sentences assigned to cluster label 5.
plot_wordcloud(df_comb, 5)

## Sentiment Analysis

In [None]:
# Run the sentiment pipeline once per sentence and reuse the result for both
# columns; calling it separately for the label and for the score would double
# the inference cost. Each call returns a list whose first item is a dict
# with the keys 'label' and 'score'.
sentiment_results = [pipe_sentiment(sentence)[0] for sentence in df_comb['sentence']]
df_comb['sentiment'] = [result['label'] for result in sentiment_results]
df_comb['sentiment_score'] = [result['score'] for result in sentiment_results]

Similarly to Kang and Kim, we reverse the scores of negative sentences (a score s becomes 1 - s), so that all scores are on the same scale.

In [None]:
def reverse_score(row):
    """Return the row's sentiment score, flipped to ``1 - score`` when the
    row's 'sentiment' label is exactly 'NEGATIVE'.

    ``row`` must support lookup by the keys 'sentiment' and 'sentiment_score'.
    """
    is_negative = row['sentiment'] == 'NEGATIVE'
    score = row['sentiment_score']
    return 1 - score if is_negative else score

In [None]:
# Apply reverse_score row by row and overwrite 'sentiment_score' with the result.
# NOTE: the column is overwritten in place while 'sentiment' stays unchanged,
# so running this cell a second time flips the NEGATIVE scores back again.
df_comb['sentiment_score'] = df_comb.apply(reverse_score, axis = 1)

### Score calculation