In [None]:
# default_exp core

# SGNN

> Implementation of Self-Governing Neural Networks for speech act classification

Implementation of the [SGNN paper](https://www.aclweb.org/anthology/D19-1402.pdf) for speech act classification.
This repository is inspired by Guillaume Chevalier's [implementation](https://github.com/guillaume-chevalier/SGNN-Self-Governing-Neural-Networks-Projection-Layer), as well as his [discussion](https://github.com/guillaume-chevalier/SGNN-Self-Governing-Neural-Networks-Projection-Layer/issues/1) with [Sava Kalbachou](https://github.com/thinline).
This version implements some things differently from Guillaume's code, and extends beyond the projection layer all the way to a fully trainable network.

The network is trained to classify the [SwDA corpus](https://web.stanford.edu/~jurafsky/ws97/) utterances according to their speech act. The corpus was pre-processed using Christopher Potts's related [project](https://github.com/cgpotts/swda/); the pre-processed data is included in [`data/swda-acttags-and-text.csv`](data/swda-acttags-and-text.csv) for repeatability.

In [None]:
#hide
from nbdev.showdoc import *

We first load the data from the CSV file.

In [None]:
# export
import pandas as pd

data_filepath = 'data/swda-acttags-and-text.csv'

# Load the pre-processed corpus and drop rows whose utterance text or tag is
# missing. pandas reads empty (or NA-like) CSV fields as NaN, and a NaN document
# makes CountVectorizer fail with
# "np.nan is an invalid document, expected byte or unicode string".
# Dropping here, before X and y are derived, keeps texts and labels aligned.
data = pd.read_csv(data_filepath).dropna(subset=['Text', 'DamslActTag'])

In [None]:
# Preview the first rows of the loaded DataFrame (act tag and utterance text columns)
data.head()

Unnamed: 0,DamslActTag,Text
0,ad,"So, uh, describe your family budget."
1,sd,"Well, I've, uh, for a lot of years I, I've pre..."
2,sd,"and, uh, just recently, uh, we, we set up a bu..."
3,sd,"and, and we're trying to stick to it."
4,sd,We just bought a new house.


Divide the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Targets are the speech-act tags; inputs are the raw utterance texts.
y = data.DamslActTag
X = data.Text

# Hold out 20% of the utterances for testing. The fixed `random_state` makes the
# shuffle repeatable, so restarting the kernel and re-running yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


See the data division

In [None]:
# Show the first training texts followed by their matching tags.
print(X_train.head(), y_train.head(), sep=" \n ")

104567        The last one was back down to Lubbock, Texas,
196416             I think you would have to start younger.
139376    I come from a state that has, well I originall...
48102     it's not the evaporator, the big filter on one...
132485    And my other child, um, uh, is in, big into tr...
Name: Text, dtype: object 
 104567    sd
196416    sv
139376    sd
48102     sd
132485    sd
Name: DamslActTag, dtype: object


## Let's build the SGNN pipeline for processing the data

In [None]:
#export

# CountVectorizer settings.
# Character n-gram sizes to extract, as (min_n, max_n).
char_ngram_range = (1, 4)

# Parameters addressed to the pipeline step named 'char_term_frequency',
# written in the '<step>__<parameter>' form accepted by `set_params`.
char_term_frequency_params = {
    'char_term_frequency__{}'.format(option): setting
    for option, setting in [
        ('analyzer', 'char'),
        ('lowercase', False),
        ('ngram_range', char_ngram_range),
        ('strip_accents', None),
        ('min_df', 2),
        ('max_df', 0.99),
        ('max_features', 10_000_000),
    ]
}

In [None]:
#export 
import scipy.sparse as sp

# Number of projections (T) and size of each projection (d):
# 80 * 14 = 1120-dimensional word projections.
T = 80
d = 14

# Parameters for the T hashers inside the 'union' step, in '<step>__<parameter>'
# form: first every hasher's `n_components`, then every hasher's `dense_output`.
# `dense_output` stays False (original note: "only AFTER hashing").
hashing_feature_union_params = {
    'union__sparse_random_projection_hasher_{}__{}'.format(t, option): setting
    for option, setting in (('n_components', d), ('dense_output', False))
    for t in range(T)
}

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.random_projection import SparseRandomProjection

# Collect the step parameters to apply to the pipeline.
params = dict()
params.update(char_term_frequency_params)
# params.update(hashing_feature_union_params)

# Only the character n-gram counting step is active; the union of
# random-projection hashers is left commented out.
pipeline = Pipeline([
    ("char_term_frequency", CountVectorizer()),
#     ('union', FeatureUnion([
#         ('sparse_random_projection_hasher_{}'.format(t), SparseRandomProjection())
#         for t in range(T)
#     ]))
])
pipeline.set_params(**params)

result = pipeline.fit_transform(X_train)

# `result` is a SciPy sparse matrix, and `len()` on one raises
# "TypeError: sparse matrix length is ambiguous"; `shape[0]` gives the row count,
# which should match the number of training texts.
print(result.shape[0], len(X_train))
print(result[0].shape)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
# Inspect the container type that holds the training texts
type(X_train)