In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
# Fetch the NLTK resources used by the tokenization and stop-word cells.
# 'punkt_tab' is the tokenizer data that newer NLTK releases look up inside
# word_tokenize(); 'punkt' is kept so older releases keep working.
# nltk.download() signals a failed download by returning False instead of
# raising, so requesting a resource unknown to the installed version is
# harmless.
# 'wordnet' is retained from the original cell (WordNetLemmatizer is imported
# above, although the lemmatization cell shown in this notebook uses spaCy).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Raw input passage for the preprocessing cells that follow: academic prose on
# speech-act theory with a few emoji inserted. The string starts and ends with
# a newline because of the triple-quote layout.
text_data = """
Searle revised the speech acts classification and claimed that all speech acts fall into five categories: (1) Representative/Assertive: Speech act that expresses speaker’s belief and that commits the speaker to the truth of what is asserted (i.e., words fit the world. Example: 😂 Statements); (2) Directive: Speech act that expresses speaker’s wish and making an attempt to get the hearer to do something (i.e., world fits the words. Example: Requests); (3) Commissive: Speech act that expresses speaker’s intention and marking the commitment for the speaker to engage in future action (I.e., world fits the words. Example: Promise); (4) Expressive: Speech act that expresses speaker’s psychological states which has no direction of fit between the world and words (Example: Apologies) and (5) Declaration: Speech act that brings change in (institutional) reality and has bilateral fit between world and words (Example: Baptizing). 😂
👍 A number of studies have applied speech acts analysis in CMC environments. Vásquez (2011) studied complaints on the travel website TripAdvisor and concluded that complaints co-occurred more frequently with advice and recommendations and they were considered mostly indirect in nature. Other studies focused on users’ self-representation in CMC environments. By examining away messages in Instant Messenger (IM), Nastri et al. (2006) found that they were constructed primarily with assertives, followed by expressives and commissives, but seldom with directives. The authors concluded that away messages tended to reflect both informational and entertainment goals. Similarly, Carr et al. (2012) investigated self-presentation in Facebook status messages and found that they were mostly constructed with expressives, followed by assertives. 😍 Their findings demonstrated differences in how users expressed themselves in alternate media. Given that text-based speech acts often co-occur with emoticons and emojis in CMC, some studies have investigated the relationship between speech acts and emoticon usage in message construction. Dresner and Herring (2010) examined the pragmatic function of emoticons and argued that the primary function of emoticon was not to convey emotion but to indicate an illocutionary force, which is the intended effect of the utterance. While their study provided a more nuanced understanding of the functions of emoticons, their study was not situated in a particular CMC setting. In light of this, Skovholt et al. (2014) investigated the communicative functions of emoticons in workplace emails by adopting speech act theory and politeness theory. Through identification of speech acts followed by emoticons in workplace emails, they found that emoticons contributed to modifying the propositional content and the illocutionary force of speech acts, which corresponded with Drenser and Herring’s results (2010). 
😍 More recently, the popularity of emoji use have attracted scholars’ interests. Ge-Stadnyk (2021) examined and compared how social media influencers on Weibo (a Chinese Microblogging site) and Twitter used emoji sequences when engaging in self-presentation. The study identified a variety of text-based speech acts, emoji functions, and functional relations by conducting speech act and pragmatic function analyses and claimed that emoji sequences functioning as ‘emphasis on text’ was most employed in connection with accompanying texts in both Weibo and Twitter data (p. 378). To our best knowledge, studies on speech acts with emoji usage in self-help online discussion forums is sparse. This study expands the current research scope by examining the text-based speech acts and the communicative functions of emoji in an online self-help discussion forum related to COVID-19, with the aim to investigate how Hong Kong forum users framed their COVID-19 experiences, expressed their emotions and seek socioemotional support from others amid a global health crisis. 😍
"""

### **Text normalization**

In [None]:
# Lowercase the raw passage. Nothing else is altered: punctuation, emoji and
# line breaks are carried over unchanged.
normalized_text = str.lower(text_data)
normalized_text

'\nsearle revised the speech acts classification and claimed that all speech acts fall into five categories: (1) representative/assertive: speech act that expresses speaker’s belief and that commits the speaker to the truth of what is asserted (i.e., words fit the world. example: 😂 statements); (2) directive: speech act that expresses speaker’s wish and making an attempt to get the hearer to do something (i.e., world fits the words. example: requests); (3) commissive: speech act that expresses speaker’s intention and marking the commitment for the speaker to engage in future action (i.e., world fits the words. example: promise); (4) expressive: speech act that expresses speaker’s psychological states which has no direction of fit between the world and words (example: apologies) and (5) declaration: speech act that brings change in (institutional) reality and has bilateral fit between world and words (example: baptizing). 😂\n👍 a number of studies have applied speech acts analysis in cmc

### **Tokenization**

In [None]:
# Split the lowercased passage into word and punctuation tokens using NLTK's
# English tokenizer (the language is spelled out; 'english' is the default).
tokens = nltk.word_tokenize(normalized_text, language='english')
tokens

['searle',
 'revised',
 'the',
 'speech',
 'acts',
 'classification',
 'and',
 'claimed',
 'that',
 'all',
 'speech',
 'acts',
 'fall',
 'into',
 'five',
 'categories',
 ':',
 '(',
 '1',
 ')',
 'representative/assertive',
 ':',
 'speech',
 'act',
 'that',
 'expresses',
 'speaker',
 '’',
 's',
 'belief',
 'and',
 'that',
 'commits',
 'the',
 'speaker',
 'to',
 'the',
 'truth',
 'of',
 'what',
 'is',
 'asserted',
 '(',
 'i.e.',
 ',',
 'words',
 'fit',
 'the',
 'world',
 '.',
 'example',
 ':',
 '😂',
 'statements',
 ')',
 ';',
 '(',
 '2',
 ')',
 'directive',
 ':',
 'speech',
 'act',
 'that',
 'expresses',
 'speaker',
 '’',
 's',
 'wish',
 'and',
 'making',
 'an',
 'attempt',
 'to',
 'get',
 'the',
 'hearer',
 'to',
 'do',
 'something',
 '(',
 'i.e.',
 ',',
 'world',
 'fits',
 'the',
 'words',
 '.',
 'example',
 ':',
 'requests',
 ')',
 ';',
 '(',
 '3',
 ')',
 'commissive',
 ':',
 'speech',
 'act',
 'that',
 'expresses',
 'speaker',
 '’',
 's',
 'intention',
 'and',
 'marking',
 'the',
 'co

### **Stop-word removal**

In [None]:
# Remove every token found in NLTK's English stop-word list. The list is
# turned into a set so each membership check is a hash lookup.
# Punctuation and emoji tokens are not in that list, so they are kept.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda token: token not in stop_words, tokens))
filtered_tokens

['searle',
 'revised',
 'speech',
 'acts',
 'classification',
 'claimed',
 'speech',
 'acts',
 'fall',
 'five',
 'categories',
 ':',
 '(',
 '1',
 ')',
 'representative/assertive',
 ':',
 'speech',
 'act',
 'expresses',
 'speaker',
 '’',
 'belief',
 'commits',
 'speaker',
 'truth',
 'asserted',
 '(',
 'i.e.',
 ',',
 'words',
 'fit',
 'world',
 '.',
 'example',
 ':',
 '😂',
 'statements',
 ')',
 ';',
 '(',
 '2',
 ')',
 'directive',
 ':',
 'speech',
 'act',
 'expresses',
 'speaker',
 '’',
 'wish',
 'making',
 'attempt',
 'get',
 'hearer',
 'something',
 '(',
 'i.e.',
 ',',
 'world',
 'fits',
 'words',
 '.',
 'example',
 ':',
 'requests',
 ')',
 ';',
 '(',
 '3',
 ')',
 'commissive',
 ':',
 'speech',
 'act',
 'expresses',
 'speaker',
 '’',
 'intention',
 'marking',
 'commitment',
 'speaker',
 'engage',
 'future',
 'action',
 '(',
 'i.e.',
 ',',
 'world',
 'fits',
 'words',
 '.',
 'example',
 ':',
 'promise',
 ')',
 ';',
 '(',
 '4',
 ')',
 'expressive',
 ':',
 'speech',
 'act',
 'expresses',


### **Stemming**

In [None]:
# Reduce each stop-word-filtered token to its Porter stem. Stems are crude
# suffix-stripped forms (e.g. 'classification' -> 'classif'), not dictionary
# words.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
stemmed_tokens

['searl',
 'revis',
 'speech',
 'act',
 'classif',
 'claim',
 'speech',
 'act',
 'fall',
 'five',
 'categori',
 ':',
 '(',
 '1',
 ')',
 'representative/assert',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 '’',
 'belief',
 'commit',
 'speaker',
 'truth',
 'assert',
 '(',
 'i.e.',
 ',',
 'word',
 'fit',
 'world',
 '.',
 'exampl',
 ':',
 '😂',
 'statement',
 ')',
 ';',
 '(',
 '2',
 ')',
 'direct',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 '’',
 'wish',
 'make',
 'attempt',
 'get',
 'hearer',
 'someth',
 '(',
 'i.e.',
 ',',
 'world',
 'fit',
 'word',
 '.',
 'exampl',
 ':',
 'request',
 ')',
 ';',
 '(',
 '3',
 ')',
 'commiss',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 '’',
 'intent',
 'mark',
 'commit',
 'speaker',
 'engag',
 'futur',
 'action',
 '(',
 'i.e.',
 ',',
 'world',
 'fit',
 'word',
 '.',
 'exampl',
 ':',
 'promis',
 ')',
 ';',
 '(',
 '4',
 ')',
 'express',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 '’',
 'psycholog',
 'state',
 'direct',
 'fit',
 'worl

### **Lemmatization**

In [None]:
# Lemmatize the stop-word-filtered tokens with spaCy's small English pipeline.
#
# The Doc is built directly from `filtered_tokens` rather than by joining them
# into a string and letting spaCy tokenize it again. Re-tokenizing split
# tokens such as 'representative/assertive' into three pieces, so
# `lemmatized_tokens` no longer lined up one-to-one with `filtered_tokens`.
nlp = spacy.load('en_core_web_sm')
doc = spacy.tokens.Doc(nlp.vocab, words=filtered_tokens)
# Apply each loaded pipeline component in order to the pre-tokenized Doc;
# this bypasses only the tokenizer.
for _component_name, component in nlp.pipeline:
    doc = component(doc)
lemmatized_tokens = [token.lemma_ for token in doc]
lemmatized_tokens

['searle',
 'revise',
 'speech',
 'act',
 'classification',
 'claim',
 'speech',
 'act',
 'fall',
 'five',
 'category',
 ':',
 '(',
 '1',
 ')',
 'representative',
 '/',
 'assertive',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 "'",
 'belief',
 'commit',
 'speaker',
 'truth',
 'assert',
 '(',
 'i.e.',
 ',',
 'word',
 'fit',
 'world',
 '.',
 'example',
 ':',
 '😂',
 'statement',
 ')',
 ';',
 '(',
 '2',
 ')',
 'directive',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 "'",
 'wish',
 'make',
 'attempt',
 'get',
 'hearer',
 'something',
 '(',
 'i.e.',
 ',',
 'world',
 'fit',
 'word',
 '.',
 'example',
 ':',
 'request',
 ')',
 ';',
 '(',
 '3',
 ')',
 'commissive',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 "'",
 'intention',
 'mark',
 'commitment',
 'speaker',
 'engage',
 'future',
 'action',
 '(',
 'i.e.',
 ',',
 'world',
 'fit',
 'word',
 '.',
 'example',
 ':',
 'promise',
 ')',
 ';',
 '(',
 '4',
 ')',
 'expressive',
 ':',
 'speech',
 'act',
 'express',
 'speaker',
 "'",
 '

### **Text Encoding**

In [None]:
# Map every distinct lemma to an integer id. LabelEncoder numbers the sorted
# unique values from 0 upward, so an id reflects sort order, not frequency.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# it is used here simply as a token -> id lookup.
label_encoder = LabelEncoder().fit(lemmatized_tokens)
encoded_labels = label_encoder.transform(lemmatized_tokens)
encoded_labels

array([188, 185, 203,  22,  50,  49, 203,  22, 100, 104,  46,  19,   1,
         7,   2, 181,   6,  35,  19, 203,  22,  97, 202,   0,  42,  54,
       202, 215,  34,   1, 127,   3, 227, 103, 229,   5,  94,  19, 232,
       206,   2,  20,   1,   8,   2,  77,  19, 203,  22,  97, 202,   0,
       226, 145,  36, 115, 121, 200,   1, 127,   3, 229, 103, 227,   5,
        94,  19, 182,   2,  20,   1,  15,   2,  53,  19, 203,  22,  97,
       202,   0, 138, 146,  55, 202,  89, 113,  23,   1, 127,   3, 229,
       103, 227,   5,  94,  19, 169,   2,  20,   1,  17,   2,  98,  19,
       203,  22,  97, 202,   0, 172, 205,  76, 103, 229, 227,   1,  94,
        19,  31,   2,   1,  18,   2,  73,  19, 203,  22,  44,  47,   1,
       136,   2, 173,  43, 103, 229, 227,   1,  94,  19,  40,   2,   5,
       232, 231, 156, 208,  32, 203,  22,  30,  51,  91,   5, 223,   1,
        11,   2, 208,  58, 213, 224, 214,  59,  58,  52,   4, 157, 110,
        25, 175,  62, 152, 132, 154,   5, 208, 105, 220, 230, 19

### **Vectorization and Embeddings**

In [None]:
# Build a TF-IDF representation of the lemmatized passage. The lemmas are
# re-joined into one string because TfidfVectorizer takes raw documents and
# applies its own tokenization.
# NOTE(review): the corpus holds a single document, so every term gets the
# same IDF weight and the scores are just L2-normalized term counts. Fit on
# several documents for IDF to carry information.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(
    [str.join(' ', lemmatized_tokens)]
)
vectors.toarray()

array([[0.05202269, 0.02601134, 0.05202269, 0.02601134, 0.02601134,
        0.02601134, 0.02601134, 0.02601134, 0.02601134, 0.44219284,
        0.02601134, 0.02601134, 0.02601134, 0.02601134, 0.07803403,
        0.02601134, 0.02601134, 0.05202269, 0.02601134, 0.02601134,
        0.02601134, 0.02601134, 0.07803403, 0.02601134, 0.02601134,
        0.02601134, 0.05202269, 0.02601134, 0.07803403, 0.02601134,
        0.02601134, 0.02601134, 0.02601134, 0.02601134, 0.02601134,
        0.02601134, 0.05202269, 0.02601134, 0.10404537, 0.05202269,
        0.05202269, 0.02601134, 0.02601134, 0.05202269, 0.02601134,
        0.05202269, 0.05202269, 0.02601134, 0.02601134, 0.02601134,
        0.05202269, 0.02601134, 0.02601134, 0.02601134, 0.02601134,
        0.02601134, 0.05202269, 0.02601134, 0.02601134, 0.02601134,
        0.02601134, 0.02601134, 0.02601134, 0.02601134, 0.05202269,
        0.05202269, 0.02601134, 0.02601134, 0.02601134, 0.05202269,
        0.15606806, 0.02601134, 0.20809075, 0.05

### **Padding/Truncation**

In [None]:
# Turn the id sequence into a fixed-length row suitable as model input.
#
# pad_sequences fills short sequences with 0 by default, but LabelEncoder
# also assigns id 0 to a real token, so a padded position would be
# indistinguishable from that token. Shifting every id up by one reserves 0
# exclusively for padding. To decode a padded row, drop the zeros and subtract
# 1 before calling label_encoder.inverse_transform.
# `encoded_labels` is the NumPy array returned by LabelEncoder, so `+ 1` is an
# element-wise shift.
MAX_SEQUENCE_LENGTH = 100  # rows are cut / padded at the end to this length
sequences = [encoded_labels + 1]
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)
padded_sequences

array([[188, 185, 203,  22,  50,  49, 203,  22, 100, 104,  46,  19,   1,
          7,   2, 181,   6,  35,  19, 203,  22,  97, 202,   0,  42,  54,
        202, 215,  34,   1, 127,   3, 227, 103, 229,   5,  94,  19, 232,
        206,   2,  20,   1,   8,   2,  77,  19, 203,  22,  97, 202,   0,
        226, 145,  36, 115, 121, 200,   1, 127,   3, 229, 103, 227,   5,
         94,  19, 182,   2,  20,   1,  15,   2,  53,  19, 203,  22,  97,
        202,   0, 138, 146,  55, 202,  89, 113,  23,   1, 127,   3, 229,
        103, 227,   5,  94,  19, 169,   2,  20,   1]], dtype=int32)