# Statistics of the de Volksbank Dataset

This notebook provides the code to extract the statistics of the annotated and translated dataset. 

Additionally, the code at the end is used to split the data and to check the distribution of the resulting sets.

*Note: the output of some of the cells is hidden because the data is not allowed to be shared.*

In [None]:
# import the needed packages
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
from collections import Counter
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Read in the complete annotated dataset.
# The filepath can be changed to also get statistics about the merged datasets.
# TODO: set this to the filepath of the dataset before running the notebook.
file_path = ""
if not file_path:
    # Fail with a clear message instead of an obscure read error further down.
    raise ValueError("Set file_path to the location of the dataset before running this notebook.")

# The file is semicolon-separated with a header row. Every column is read as a
# string and empty fields stay empty strings instead of being converted to NaN.
new_df = pd.read_csv(file_path, delimiter=';', header=0, dtype=str, keep_default_na=False, encoding='latin1', quotechar='"')

## Closer look at the first 5 rows of the data: 

In [None]:
# Show the first five rows of the loaded dataset (head() defaults to 5 rows).
new_df.head()

## Some descriptive statistics: 

In [None]:
# Summary statistics per column. All columns were read with dtype=str, so this
# is the summary pandas gives for non-numeric data rather than mean/std/quantiles.
new_df.describe()

## Closer look at the annotations of the topics and subtopics using 'groupby': 

In [None]:
# Overview of the distribution of the English topics:
# number of rows per 'CsatTopicEn' value, sorted from least to most frequent.
new_df.groupby('CsatTopicEn').size().sort_values()

# Distribution in a Plot

In [None]:
# Count the rows per English topic and show the counts as a horizontal bar chart.
groups = pd.DataFrame()
groups['total'] = new_df.groupby('CsatTopicEn').size()
# Constant label column marking which version of the data these counts belong to.
groups['dataversion'] = 'original'
# Sort ascending so the most frequent topic ends up at the top of the chart.
groups = groups.sort_values(by=['total'])
groups.plot(kind='barh', legend=False)
plt.tight_layout()
# uncomment if you wish to save the figure
# plt.savefig(r'\figures\distribution_topics_original.pdf', dpi = 300)

In [None]:
# Overview of the distribution of the English subtopics:
# number of rows per 'CsatSubtopicEn' value, sorted from least to most frequent.
new_df.groupby('CsatSubtopicEn').size().sort_values()

## Closer look at the feedback statements: 

In [None]:
# Average number of tokens per feedback statement

# One token count per row of the 'Sentence_new_improved' column.
lengths = [len(word_tokenize(statement)) for statement in new_df["Sentence_new_improved"]]

mean = sum(lengths) / len(lengths)
# Label and value go on separate lines, rounded to two decimals.
print("Mean length of sentence: ", round(mean, 2), sep="\n")

In [None]:
# Total number of tokens over all feedback statements

tokens = []
for sentence in new_df['Sentence_new_improved'].astype(str):
    # extend() adds the tokens of this statement directly, without building a
    # throwaway list of None values as a side-effect comprehension would.
    tokens.extend(word_tokenize(sentence))

print(len(tokens))

In [None]:
# Find the longest and the shortest feedback sequence

def FindMaxLength(lst):
    '''
    Find the longest tokenized feedback sequence.
    :param lst: a list of token lists, one per feedback sequence
    :return: the longest token list (the first one in case of a tie) and its number of tokens
    '''
    longest = max(lst, key=len)
    return longest, len(longest)

def FindMinLength(lst):
    '''
    Find the shortest tokenized feedback sequence.
    :param lst: a list of token lists, one per feedback sequence
    :return: the shortest token list (the first one in case of a tie) and its number of tokens
    '''
    shortest = min(lst, key=len)
    return shortest, len(shortest)

In [None]:
# Report the longest and the shortest feedback statement (measured in tokens).
feedback_statements = list(new_df['Sentence_new_improved'])

# Tokenize every statement exactly once; both lookups below work on the same list,
# so there is no need to tokenize the corpus a second time.
toks = [nltk.word_tokenize(s) for s in feedback_statements]

print('Max length token:', FindMaxLength(toks))
print('Min length token:', FindMinLength(toks))

In [None]:
# Count how often every n-gram (lower-cased, tokens joined by a space) occurs in the feedback
n = 2  # size of the n-grams
ngram_frequencies = Counter()
for sentence in new_df['Sentence_new_improved'].astype(str):
    tokens_list = list(map(str.lower, word_tokenize(sentence)))
    # Zipping the token list with its own shifted copies yields every window of n
    # consecutive tokens; statements shorter than n tokens yield no window at all.
    ngrams = [" ".join(window) for window in zip(*(tokens_list[start:] for start in range(n)))]
    ngram_frequencies.update(ngrams)


# the 20 most frequent n-grams together with their counts
print(ngram_frequencies.most_common(20))

In [None]:
# n-grams without the stopwords and without punctuation
stops = set(stopwords.words('english'))
# uncomment if you wish to see the stopwords
#print(stops)

punctuation_chars = set(string.punctuation)

frequent_ngrams = ngram_frequencies.most_common(200)
# The loop variables are deliberately not called `tokens` or `filter`: those names
# would overwrite the token list of an earlier cell and shadow the builtin filter().
for ngram, freq in frequent_ngrams:
    # Skip the n-gram when any of its tokens is a stopword or consists only of
    # punctuation characters. Checking character by character also catches
    # multi-character tokens such as "..." that a substring test would miss.
    skip = any(
        token in stops or all(char in punctuation_chars for char in token)
        for token in ngram.split()
    )
    if not skip:
        print(ngram, freq)

## Data split
The dataset is split into training, validation, and test sets:
* 80% training
* 10% validation
* 10% test

In [None]:
# First hold out 20% of the data, then cut that part in half:
# 80% training, 10% test and 10% validation. Fixed seeds keep the split reproducible.
train, test_temp = train_test_split(new_df, test_size=0.20, random_state=5)
test, validation = train_test_split(test_temp, test_size=0.50, random_state=0)

# Save the data.
# The split files are read back with encoding='latin1' in this notebook, so they are
# written with that same encoding; the to_csv default (utf-8) would garble non-ASCII
# characters on that round trip.
train.to_csv("/data/train.csv", index=False, sep=';', encoding='latin1')
validation.to_csv("/data/valid.csv", index=False, sep=';', encoding='latin1')
test.to_csv("/data/test.csv", index=False, sep=';', encoding='latin1')

### We explore the stats of the training set to see the distribution of the training examples.

In [None]:
# Read the training split back in. Forward slashes match the path the split was
# saved to ("/data/train.csv") and also resolve on non-Windows systems, where a
# backslash is not a path separator.
train_df = pd.read_csv("/data/train.csv", delimiter=';', header=0, dtype=str, keep_default_na=False, encoding='latin1')

In [None]:
# Number of training examples per English topic, printed and drawn as a bar chart
groups = train_df.groupby(by='CsatTopicEn').size()
print(groups)
groups.plot(kind='bar')

In [None]:
# Read the test split back in. Forward slashes match the path the split was
# saved to ("/data/test.csv") and also resolve on non-Windows systems, where a
# backslash is not a path separator.
test_df = pd.read_csv("/data/test.csv", delimiter=';', header=0, dtype=str, keep_default_na=False, encoding='latin1')

In [None]:
# Number of test examples per English topic, printed and drawn as a bar chart
groups = test_df.groupby(by='CsatTopicEn').size()
print(groups)
groups.plot(kind='bar')

In [None]:
# Read the validation split back in. Forward slashes match the path the split was
# saved to ("/data/valid.csv") and also resolve on non-Windows systems, where a
# backslash is not a path separator.
valid_df = pd.read_csv("/data/valid.csv", delimiter=';', header=0, dtype=str, keep_default_na=False, encoding='latin1')

In [None]:
# Number of validation examples per English topic, printed and drawn as a bar chart
groups = valid_df.groupby(by='CsatTopicEn').size()
print(groups)
groups.plot(kind='bar')

## End of the Notebook.