<a href="https://colab.research.google.com/github/corvusMidnight/thesis/blob/main/data_frame_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A notebook for feature engineering

In [None]:
##### IMPORTS #####
%%capture
!pip install -r requirements.txt | grep -v 'already satisfied'
!pip install demoji

!pip install lazypredict
!pip install plotly
!pip install emoji
!pip install tokenizer
!pip install transformers
!pip install simpletransformers
!pip install happytransformer

#Basic imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import re
import seaborn as sns
import string
import operator
import plotly.express as px
from collections import Counter
from time import time
import pickle
from scipy import stats
import demoji

#NLP imports
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

#Baseline with LazyClassifier
from lazypredict.Supervised import LazyClassifier, LazyRegressor

#nltk
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword=set(stopwords.words('italian'))
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, FreqDist
stemmer = nltk.SnowballStemmer("italian")
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk import ngrams, FreqDist
nltk.download('punkt')

#Classifiers
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

#Evaluation tools
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score, log_loss


#Pipeline
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

#Pipeline visualization
from sklearn import set_config
from sklearn.utils import estimator_html_repr

#Pipeline display mode
set_config(display='diagram')


#Imputers
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

#Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from collections import Counter
from collections import defaultdict
import unicodedata as uni
from google.colab import drive 
import emoji
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize
from tokenizer import *
from transformers import AutoTokenizer, AutoModel
from simpletransformers.classification import ClassificationModel, ClassificationArgs

#Model tuning
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV

#Neural and pretrained models
import torch
from torch import nn 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from happytransformer import HappyTextClassification

from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

## Linguistic standardness

In [None]:
# this file contains functions that can be imported in other .py scripts

# Lookup table of chat-speak abbreviations.
# Layout of almost every entry: full form -> abbreviated spelling(s).
# NOTE(review): values are either a single string or a list of strings,
# so any consumer has to handle both shapes.
# NOTE(review): the 'dev' entry looks inverted compared to the other entries
# (short form as key, full form as value) - confirm before relying on it.
abbrev_dict = {
    'aspetta': ['asp', 'aspe'],
    'anche': 'anke',
    'bene': 'bn',
    'birthday': ['bday', 'b-day'],
    'boyfriend': ['bf', 'boy'],
    'bravo': 'brv',
    'capito': 'cpt',
    'che': 'ke',
    'ci sei': ['c6', 'c 6'],
    'come': 'cm',
    'comunque': 'cmq',
    'con': 'cn',
    'cosa': 'cs',
    'dev': 'developer',
    'easy': 'EZ',
    'Facebook': 'fb',
    'forever': '4ever',
    'fuck': ['fck', 'fk'],
    'fucking': ['fcking', 'fking'],
    'grazie': ['gratz', 'gz'],
    'great': 'gr8',
    "I don't know": 'dunno',
    'inderdaad': 'idd',
    'Instagram': 'insta',
    'messaggio': ['mex', 'messa'],
    'nevermind': 'nvm',
    'non': 'nn',
    'niente': 'nnt',
    'thanks': ['thx', 'tx', 'thks', 'thnx'],
    'please': 'plz',
    'per sempre': 'xseo',
    'perché': ['xche', 'xché', 'xchè'],
    'qualcuno': 'qcn',
    'rosicare': 'rosik',
    'scusa': 'scs',
    'sei un': '6 1',
    'sorry': ['sry', 'srry'],
    'ti voglio bene per sempre': ['tvbxseo'],
    'too easy': '2EZ',
    'tremendo': '3mendo',
    'va bene': ['vbne', 'vbbe', 'vbbè'],
    'vaffanculo': ['vaffa', 'fanculo'],
    'what': ['wat', 'wht', 'wut'],
}


# English chat acronyms: spelled-out phrase -> acronym.
# Every value is a single string.
# (The key 'love you so much' used to be listed twice; a dict literal silently
# keeps only one of them, so the repeated line has been dropped.)
acronym_dict_EN = {
    'also known as': 'AKA',
    'as soon as possible': 'asap',
    'at the moment': 'atm',
    'away from keyboard': 'AFK',
    'be right back': 'brb',
    'best friend for life': 'bffl',
    'best friend forever': 'bff',
    'by the way': 'btw',
    'for the win': 'ftw',
    'fuck my life': 'fml',
    "I don't know": 'idk',
    'I know, right': 'ikr',
    'I love you': 'ily',
    'in my humble opinion': 'imho',
    'in my opinion': 'imo',
    'in real life': 'irl',
    'laughing my ass off': 'lmao',
    'laughing my fucking ass off': 'lmfao',
    'laughing out loud': 'lol',
    'love you so much': 'lysm',
    'love you': 'ly',
    'never want to lose you': 'nwly',
    'no problem': 'np',
    'oh my god': 'omg',
    'rolling on the floor laughing': 'rofl',
    'see you': 'cu',
    'shut the fuck up': 'stfu',
    'talk to you later': 'ttyl',
    'to be honest': 'tbh',
    'what the fuck': 'wtf',
    'what the hell': 'wth',
    'you are welcome': 'yw',
    'you only live once': 'yolo',
}


# Italian chat acronyms: spelled-out phrase -> acronym(s).
# NOTE(review): the values here mix plain strings and one-element lists,
# whereas acronym_dict_EN only uses plain strings - consumers must handle both.
acronym_dict_IT = {
    'cresci bene che ripasso': 'cbcr',
    'figli di puttana': 'fdp',
    'ti amo di bene': ['tadb'],
    'ti voglio bene': ['tvb'],
}

##### FEATURE: EMOTICONS #####


# we define 'normal' unicode chars, i.e. non-emoji
###normal_unicode = ('VARIATION', 'SELECTOR', 'SELECTOR-16', 'GREEK', 'ARROW', 'HANGUL', 'DAGGER','LATIN', 'DOT', 'MACRON', 'DIFFERENTIAL', 'ELLIPSIS', 'ORDINAL','INDICATOR', 'DASH', 'TILDE','DIAERESIS','APOSTROPHE','LINE','LETTER', 'SPACE', 'DIGIT', 'HYPHEN-MINUS', 'COMMA', 'COLON', 'AMPERSAND', 'COMMERCIAL','STOP', 'ACCENT', 'PARENTHESIS', 'SOLIDUS', 'MARK', 'FULL STOP','SEMICOLON','ASTERISK', 'BRACKET','SIGN')
# Words that occur in the Unicode *names* of ordinary (non-emoji) characters.
# A character is treated as "not an emoji" as soon as its full name, or any
# whitespace-separated part of its name, appears in this tuple.
# 'JOINER' / 'NON-JOINER' cover ZERO WIDTH JOINER and ZERO WIDTH NON-JOINER:
# like VARIATION SELECTOR-16 they are invisible glue code points inside emoji
# sequences and must not be counted as emoji themselves.
normal_unicode = (
	'DRAWINGS', 'BLOCK', 'SYLLABLE', 'HANGUL', 'QUOTATION', 'MINUS', 'COMBINING', 'INDICATOR',
	'HEBREW', 'CENT', 'TILDE', 'ORDINAL', 'DIFFERENTIAL', 'COMMA', 'LATIN', 'COLON',
	'VOWEL', 'NUMERO', 'APOSTROPHE', 'LESS-THAN', 'AFGHANI', 'LETTER', 'PARENTHESIS', 'SELECTOR-16',
	'AMPERSAND', 'DEGREE', 'HYPHEN-MINUS', 'EURO', 'PLUS-MINUS', 'ACCENT', 'OHM', 'BRACKET',
	'ARABIC', 'SUPERSCRIPT', 'NOMISMA', 'SECTION', 'QUESTION', 'ELLIPSIS', 'DAGGER', 'COMMERCIAL',
	'EURO-CURRENCY', 'MINUS-OR-PLUS', 'CYRILLIC', 'ASTERISK', 'GREATER-THAN', 'CURRENCY', 'EQUALS', 'PERSIAN',
	'STOP', 'PERCENT', 'SUBSCRIPT', 'TRADE', 'DIAERESIS', 'LINEAR', 'LINE', 'PER',
	'DOT', 'DASH', 'COPYRIGHT', 'DIVISION', 'SOLIDUS', 'ARMENIAN', 'MICRO', 'TAI',
	'SPACE', 'DIGIT', 'KELVIN', 'SEMICOLON', 'NUMBER', 'PLUS', 'OUNCE', 'ROMAN',
	'POUND', 'MULTIPLICATION', 'GREEK', 'EXCLAMATION', 'SELECTOR', 'DOLLAR', 'MODIFIER', 'VARIATION',
	'JOINER', 'NON-JOINER',
)


# remark: 'VARIATION SELECTOR-16': ️ Variation Selector-16:
# "An invisible codepoint which specifies that the preceding character should be displayed with emoji presentation. Only required if the preceding character defaults to text presentation."

# remark2: 'ARROW' was removed from the list, because too many smileys contain an arrow and real arrows are rare in CMC

# Names of Facebook plain-text emoticons (the part in front of '-emoticon')
# that emoticons() files under the 'Faces variants' category.
# Only membership is tested, so each name is listed once
# ("confused" used to appear twice).
facebook_faces = [
	"smile",
	"frown",
	"unsure",
	"grin",
	"tongue",
	"wink",
	"gasp",
	"upset",
	"cry",
	"confused_rev",
	"confused",
	"grumpy",
	"glasses",
	"sunglasses",
	"devil",
	"angel",
	"kiss",
	"kiki",
	"squint",
	"pacman",
	"colonthree",
]





##### REGEXES #####

# Separator pattern used by split_textNEW() to cut a text into "clean" tokens:
# one or more consecutive characters out of ? ! , . : ; ( ) " * or whitespace.
punct = re.compile(r'[\?\!,.:;()"\*\s]+') 
# remark: \s added to punct in order to still split on whitespace too
# remark: a single quotation mark isn't added, in order to still detect abbreviations like 'k, 't, ...


##### FUNCTIONS #####

def split_textNEW(text):
	"""
	tokenize a text in several complementary ways

	input:
		- 'text': a text string

	output (a tuple, in this order):
		- tokens: whitespace-separated tokens, punctuation still attached
		- clean_tokens: tokens obtained by splitting on the module-level 'punct' pattern
		- clean_tokens_lower: like clean_tokens, but taken from the lowercased text
		- tokens_lower: like tokens, but taken from the lowercased text
		- text_lower: the lowercased text itself
	"""

	lowered_text = text.lower()

	def on_whitespace(source):
		# plain whitespace split: punctuation stays glued to the words
		return list(filter(None, source.split()))

	def on_punctuation(source):
		# splitting on 'punct' can leave empty strings at the edges; drop them
		return list(filter(None, punct.split(source)))

	return (
		on_whitespace(text),
		on_punctuation(text),
		on_punctuation(lowered_text),
		on_whitespace(lowered_text),
		lowered_text,
	)

"""script to count several non-standard Italian features in a social media corpus"""



##### REGEXES #####

# Patterns that recognise special token types (numbers, links, handles, ...).
# Patterns anchored with ^...$ are meant to be applied to a single token.

# a bare number, optionally followed by one of the suffixes de / st / ste / e
# NOTE(review): these suffixes do not look Italian - confirm they are still wanted
numbers_and_ordinals = re.compile(r'^\d+(de|st|ste|e)?$')
# English ordinals such as 2nd, 3rd, 4th
ordinals_EN = re.compile(r'^\d+(nd|th|rd)$')
# anything that looks like "<word>.<known top-level domain>", with optional http:// and www. in front
# NOTE(review): unanchored, so a token such as 'casa.dei' also matches (via '.de') - confirm this is acceptable
hyperlink = re.compile(r'(http:\/\/)?(www\.)?\w+\.(com|be|org|nl|ac|fm|gov|me|net|webs|de|co|uk|it|fr|to|nu|tk).*')
# e-mail address; the hyphen is placed last in the character class so that it is a literal '-'
# (written as '0-9-_' it formed the range '9' to '_', which also accepted : ; < = > ? @ [ \ ] ^)
email = re.compile(r'[a-zA-Z0-9_.-]+@\w+\.\w+')
# time references written as one or two digits, a 'u', and optionally two more digits (e.g. 14u30)
hour_refs = re.compile(r'^\d{1,2}u(\d{2})?$')
# hashtags: '#' followed by word characters only
hashtags = re.compile(r'^#\w+$')
# mentions: '@' followed by ASCII letters, optionally ending in ':'
ats = re.compile(r'^@[a-zA-Z]+:?$')
# file names with a known document / image extension
files = re.compile(r'^.+\.(doc|docx|pdf|ppt|pptx|xml|jpg|png|jpeg|psd)$')
# an amount directly followed by a currency symbol
money = re.compile(r'^\d+(€|\$|£)$')
# page references such as p.12 or P12-15
pages = re.compile(r'^(p|P)\.?\d+(\-\d+)?$')
# distances in m or km
meter = re.compile(r'^\d+k?m$')

# Module-level emoticon / kiss patterns.
# NOTE: emoticons() and letter_flooding() compile their own local patterns under some of
# these same names, so changing a pattern here does not change those functions.

# Western-style emoticons in both reading directions, e.g. ':)' and '(:'
western_emoticons_all = re.compile(r'(<?[:;xX\|=8]\'?[-^o]?[\)\(xXpPSsdDoOvV\|\/\*$@#\[\]])|([\)\(][-^o]?\'?[:;]>?)') # including some MSN emoticons consisting of letters and punct marks
# the same word character at least twice in a row (e.g. 'xx', 'oo'): used to reject false emoticon hits
not_western_emoticons = re.compile(r'(\w)\1{1,}')
# a token that consists of the letter 'x' only
flooding_x = re.compile(r'^x+$')
# a token that consists of repeated x...o... groups, optionally ending in 'x' (e.g. 'xoxo', 'xxoox')
flooding_xo = re.compile(r'^(x+o+)+x?$')


##### FUNCTIONS FOR FEATURE DETECTION #####


##### FEATURE: FLOODING #####


def letter_flooding(tokens):
	"""
	this function analyzes the use of letter flooding in a text, i.e. the repetition of a letter

	input:
		- 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

	output:
		- the absolute number of (words containing) letter flooding in the text
		- the relative number of (words containing) letter flooding in the text
		  (absolute number divided by len(tokens); 0 for an empty token list)
		- a freq dict (defaultdict(int)) holding, per flooded word, its number of occurrences
		  and, per flooded letter, a general '<letter>-variants' counter

	remark: hyperlinks, e-mail addresses and tokens in which emoticons() finds an emoticon
	are skipped; the remaining tokens are lowercased and split on non-word characters.
	"""

	# we create a container to store the flooding occurrences
	flooding_letters = defaultdict(int)

	# letter flooding: the same letter three or more times in a row
	# (a letter that is merely doubled occurs frequently in regular spelling)
	flooding_alpha = re.compile(r'([a-z])\1{2,}')
	# we will, however, not count the repetition of the letter 'x' or the combi 'xo' as flooding,
	# as this repetition does not represent a lengthening of the sound /ks/ (kisses and hugs)
	flooding_x = re.compile(r'^x+$')
	flooding_xo = re.compile(r'^(x+o+)+x?$')
	# compiled once instead of once per token
	non_word = re.compile(r'\W+')

	# we only want to work with clean tokens
	clean_tokens = []
	for token in tokens:
		token_lower = token.lower()

		# we do not want to count hyperlinks and emoticons as flooding
		if hyperlink.search(token_lower) or email.search(token_lower) or (emoticons([token])[0] > 0):
			continue

		clean_tokens += [item.lower() for item in non_word.split(token) if item != ""]

	# we store the occurrences of flooding (clean tokens are already lowercased)
	for token in clean_tokens:

		# kisses / hugs such as 'xxxx' or 'xoxooo' are no letter flooding
		if flooding_x.search(token) or flooding_xo.search(token):
			continue

		# inside other words, a flooded 'x' is ignored as well
		flooded_letters_without_x = [letter for letter in flooding_alpha.findall(token) if letter != 'x']
		if not flooded_letters_without_x:
			continue

		# we add 1 to a general counter per flooded letter
		for letter in flooded_letters_without_x:
			flooding_letters[letter + '-variants'] += 1
		# we store the clean token
		flooding_letters[token] += 1

	# all flooded words (the general '-variants' counters are left out so that nothing is counted twice)
	number_flooding_letters = sum(count for key, count in flooding_letters.items() if not key.endswith('-variants'))

	# the percentage of flooded words:
	if len(tokens) != 0:
		percentage_flooding_letters = float(number_flooding_letters)/len(tokens)
	else:
		percentage_flooding_letters = 0

	# we return these numbers and the freq dict
	return(number_flooding_letters,percentage_flooding_letters,flooding_letters)


def punct_flooding(tokens):
	"""
	this function analyzes the use of punctuation flooding in a text, i.e. the repetition of a punctuation mark

	input:
		- 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

	output:
		- the absolute number of punctuation flooding
		- the relative number of punctuation flooding (flooded groups divided by all groups; 0 if there are none)
		- a freq dict (defaultdict(int)) of the flooded punctuation groups, plus a general
		  '<mark>-variants' counter per flooded mark
		- the number of (groups of) question / exclamation marks in the text
	"""

	# we create a container to store the flooding occurrences
	flooding_punctuation = defaultdict(int)

	# punctuation flooding: repetition of two or more times a question or exclamation mark
	flooding_nonalphanumeric = re.compile(r'(\?|!)\1{1,}')
	# (in the Netlog data, letters with accent marks were often replaced by '???';
	#  that special case is not applicable here)

	# non-flooded groups, needed to compare flooded punct.marks to the total amount of these punct.marks:
	# single question and excl. marks
	single_punct = re.compile(r'^[\?!]$')
	# unconv. combinations of question and excl. marks without flooding (e.g. '?!?!' but not '?!??').
	# Both alternatives sit inside ONE group, so ^ and $ apply to both of them; written as
	# '^A|B$' the anchors each bound only one alternative and '!?!!' was counted twice.
	unconv_combis_without_flooding_regex = re.compile(r'^(?:(?:!\?)+!?|(?:\?!)+\??)$')

	# we only want to store the actual punctuation marks and not the letters attached
	word_chars = re.compile(r'\w+')
	clean_punc = []
	for token in tokens:
		clean_punc += [item for item in word_chars.split(token) if item != ""]

	nr_nonflooded_punct = 0
	for group in clean_punc:
		flooded_marks = flooding_nonalphanumeric.findall(group)
		if flooded_marks:
			# we add 1 to the general counter per flooded punct mark
			for punctmark in flooded_marks:
				flooding_punctuation[punctmark+'-variants'] += 1
			# we store the occurrence
			flooding_punctuation[group] += 1
		elif single_punct.match(group) or unconv_combis_without_flooding_regex.match(group):
			# a group without flooding that consists of question / excl. marks only
			nr_nonflooded_punct += 1

	# now we can calculate the total nr of occurrences of excl. and question marks
	# (the general '-variants' counters are left out so that nothing is counted twice)
	number_flooding_punct = sum(count for key, count in flooding_punctuation.items() if not key.endswith('-variants'))
	total_punct = nr_nonflooded_punct + number_flooding_punct

	# the percentage of flooded punctuation:
	if total_punct != 0:
		percentage_flooding_punct = float(number_flooding_punct)/total_punct
	else:
		percentage_flooding_punct = 0

	# we return these numbers and the freq dict
	return(number_flooding_punct,percentage_flooding_punct,flooding_punctuation, total_punct)




def emoticons(tokens):
	"""
	this function analyzes the use of emoticons in a text

	input:
		- 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

	output:
		- the absolute number of emoticons in the text
		- the relative number of emoticons in the text: nr. of emoticons divided by the nr. of tokens,
		  where a token that holds several emoticons (e.g. ':):)') counts as that many tokens,
		  so that the ratio cannot exceed 1 because of glued emoticons; 0 for an empty token list
		- a freq dict (defaultdict(int)) of the emoticons, plus general '... variants' counters per category

	remarks:
		- tokens that match the hyperlink pattern are not searched for emoticons
		- handlers for MSN, Netlog, 'recovered' and L/J emoticons existed in earlier versions
		  of this function but were disabled as not relevant for this corpus
	"""

	# we define different sorts of emoticons
	# we also detect emoticons that may be attached to words, to make sure not to miss any (e.g. 'haha:D')

	### Western emoticons ### (in two directions, e.g. ':)' and '(:')
	western_emoticons_all = re.compile(r'(<?[:;xX\|=8]\'?[-^o]?[\)\(xXpPSsdDoOvV\|\/\*$@#\[\]])|([\)\(][-^o]?\'?[:;]>?)') # including some MSN emoticons consisting of letters and punct marks
	# however, we do not want to detect e.g. 'xx' or 'oo' as smileys
	not_western_emoticons = re.compile(r'(\w)\1{1,}')
	# neither do we want to detect smileys as 'XO' in kisses like 'xoxoxo'
	flooding_x = re.compile(r'^x+$')
	flooding_xo = re.compile(r'^(x+o+)+x?$')
	# we also do not want to recognize smileys in hyperlinks (e.g. ':/' is no smiley in this situation: https:/www. ...)
	hyperlink = re.compile(r'(http:\/\/)?(www\.)?\w+\.(com|be|org|nl|ac|fm|gov|me|net|webs|de|co|uk|it|fr|to|nu|tk).*')

	### Asian emoticons ###
	asian_emoticons = re.compile(r'(\^_*\^)|(\^o?\^)|([oO0]_+[oO0])|(T_+T)|([\-\—]_+[\-\—])|(\-\.\-)|(_O_)|(<_+<)|(<\.<)|(>_+>)|(>\.>)|(n_+n)|(n\.n)|(n,n)|(u_+u)|(u\.u)|(u,u)')

	### Hearts ###
	hearts = re.compile(r'(<+3+)|(x3+)|(X3+)')

	### Facebook emoticons ### i.e. in plain text format: e.g. 'wink-emoticon'
	fb_emoticon_indicator = re.compile(r'\w+-emoticon')

	### manual emoticon indicators ### i.e. emoticons that are transcribed with an indicator: 'XEMOTICONX'
	emoticon_indicator = re.compile(r'XEMOTICONX(?:_[a-zA-Z_]+)?')
	emoticon_indicator_detailed = re.compile(r'XEMOTICONX_(?:[a-zA-Z_]+)')

	def flatten(matches):
		# as these regexes contain several groups joined by | (OR), findall returns tuples
		# with one slot per group; we keep the non-empty slots as plain strings
		flat = []
		for match in matches:
			if isinstance(match, tuple):
				flat.extend(part for part in match if part != '')
			elif match != '':
				flat.append(match)
		return flat

	# we will store the emoticons in a container
	# NOTE(review): hearts are counted under 'Heart variants' (indicator + ASCII hearts) AND under
	# 'Hearts variants' (Facebook + unicode emoji). The keys are kept as they were because the
	# readers of this dict are not visible here - confirm which spelling is expected and unify.
	emoticon_counts = defaultdict(int)

	# tokens that contain more than one emoticon count as more than one token in the ratio
	# (to avoid that for instance the utterance ':):)' contains 200% emoticons (2 emots divided by 1 word))
	extra_tokens = 0

	for item in tokens:
		# we will only look for emoticons in words that are no hyperlinks
		if hyperlink.search(item):
			continue

		nr_emoticons_found = 0

		# manual indicators
		# we do not want to miss any occurrences (in case spaces are ommitted and we have XEMOTICONXXEMOTICONX)
		for occ in emoticon_indicator.findall(item):
			if emoticon_indicator_detailed.search(occ):
				if 'face' in occ:
					emoticon_counts['Faces variants'] += 1
				elif (('heart' in occ) or ('kiss' in occ)):
					emoticon_counts['Heart variants'] += 1
				else:
					emoticon_counts['Pictogram variants'] += 1
			else:
				emoticon_counts['Unknown variants'] += 1

			emoticon_counts[occ] += 1
			nr_emoticons_found += 1

		# Western emoticons
		# we do not want to miss any occurrences (in case spaces are ommitted and we have hello:) )
		for occ in flatten(western_emoticons_all.findall(item)):
			occ_lower = occ.lower()
			# no smileys: 'xs' and doubled characters such as 'xx', 'oo', 'Xx'
			if (occ_lower == 'xs') or not_western_emoticons.search(occ_lower):
				continue
			# we do not count flooding of kisses as emoji (e.g. XX or XO);
			# checked on the lowercased form so that 'XO' is treated like 'xo'
			if flooding_x.search(occ_lower) or flooding_xo.search(occ_lower):
				continue
			emoticon_counts['Faces variants'] += 1
			emoticon_counts[occ] += 1
			nr_emoticons_found += 1

		# Facebook emoticons in plain text
		# we do not want to miss any occurrences (in case spaces are ommitted and we have hellogrin-emoticon)
		for occ in fb_emoticon_indicator.findall(item):
			meaningful_part_name = occ.replace("-emoticon", "")
			if ((meaningful_part_name in facebook_faces) or ("face" in meaningful_part_name)):
				emoticon_counts['Faces variants'] += 1
			elif (('heart' in meaningful_part_name) or ('kiss' in meaningful_part_name)):
				emoticon_counts['Hearts variants'] += 1
			else:
				emoticon_counts['Pictogram variants'] += 1

			emoticon_counts[occ] += 1
			nr_emoticons_found += 1

		# Asian emoticons
		# we do not want to miss any occurrences (in case spaces are ommitted and we have hello^^)
		for occ in flatten(asian_emoticons.findall(item)):
			emoticon_counts['Faces variants'] += 1
			emoticon_counts[occ] += 1
			nr_emoticons_found += 1

		# hearts
		# we do not want to miss any occurrences (in case spaces are ommitted and we have hello<3)
		for occ in flatten(hearts.findall(item)):
			emoticon_counts['Heart variants'] += 1
			emoticon_counts[occ] += 1
			nr_emoticons_found += 1

		# we also count unicode emoji, which occur, for example, in WhatsApp
		# we loop over the characters of the word/token
		for char in item:
			# we get the unicode name of the character
			# however: some control characters do not have a name - these are no emoji
			try:
				charname = uni.name(char)
			except ValueError:
				continue

			# if the name, or a part of the name (e.g. 'LATIN' in 'LATIN SMALL LETTER A'),
			# is in our list of regular unicode characters, it is no emoji
			if (charname in normal_unicode) or any(part in normal_unicode for part in charname.split()):
				continue

			# everything else is considered an emoji
			charname_lower = charname.lower()
			if 'face' in charname_lower:
				emoticon_counts['Faces variants'] += 1
			elif (('heart' in charname_lower) or ('kiss' in charname_lower)):
				emoticon_counts['Hearts variants'] += 1
			else:
				emoticon_counts['Pictogram variants'] += 1

			# the character itself (not its name) is the key in the freq dict
			emoticon_counts[char] += 1
			nr_emoticons_found += 1

		# if a token consists of multiple smileys (e.g. ':D:D'), every smileys after the first
		# adds one token to the total
		extra_tokens += max(0, nr_emoticons_found - 1)

	# percentage emoticons in the text: nr. of emoticons divided by number of words (including emoticons)
	# we will count all emoticons that occur, but we exclude the general counters (so as not to count each emoticon twice)
	nr_emoticons = sum(count for key, count in emoticon_counts.items() if not key.endswith('variants'))
	length_tokens = len(tokens) + extra_tokens
	if length_tokens != 0:
		percentage_emoticons = float(nr_emoticons)/length_tokens
	else:
		percentage_emoticons = 0

	# we return these numbers and the freq dict
	return(nr_emoticons,percentage_emoticons,emoticon_counts)


##### FEATURE: UNCONVENTIONAL CAPITALIZATION #####

def unconv_capitalization(tokens):
    """
    Count tokens written with unconventional capitalization.

    Three families are recognised after stripping punctuation from a token:
    all caps ('HELLO'), inverse caps ('hELLO') and alternating caps
    ('HeLlO' / 'hElLo'). Inverse and alternating caps only count for words
    longer than two characters.

    input:
        - 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

    output:
        - the absolute number of unconv. capitalized words in the text
        - the relative number (absolute number divided by len(tokens); 0 for an empty list)
        - a freq dict holding each counted word plus one '... variants' counter per family
    """

    # punctuation that may be glued to a word
    strip_punct = re.compile(r'[\!\?\.;\:,\/\-\(\)\'\"\-]+')

    # the capitalization families
    all_caps = re.compile(r'^[A-Z]{2,}$')
    inverse_caps = re.compile(r'^[a-z][A-Z]+$')
    alternating_caps = re.compile(r'(^([a-z][A-Z])+[a-z]?$)|(^([A-Z][a-z])+[A-Z]?$)')

    # emoticons such as 'XD' or 'xP' must not be mistaken for capitalization
    western_emoticons_all = re.compile(r'(<?[:;xXB\|=]\'?[-^o]?[\)\(xXpPSsdDVoO\|\/\*$\[\]])|([\)\(][-^o]?\'?[:;]>?)')

    unconv_caps = defaultdict(int)

    def _record(word, family):
        # count the word itself first, then its family counter
        unconv_caps[word] += 1
        unconv_caps[family] += 1

    def _rescued(lowered):
        # an emoticon look-alike is still counted when one of the module-level
        # patterns (not_western_emoticons, flooding_x, flooding_xo) matches it
        return (not_western_emoticons.search(lowered)
                or flooding_x.search(lowered)
                or flooding_xo.search(lowered))

    for token in tokens:
        word = strip_punct.sub('', token)
        lowered = word.lower()

        if all_caps.search(word):
            # placeholder tokens for emoticons / names are never counted
            is_placeholder = lowered.startswith('xemoticonx') or (lowered == 'xnaamx')
            if not (western_emoticons_all.search(word) or is_placeholder):
                # words listed in abbrev_dict may legitimately be written in caps
                if lowered not in abbrev_dict:
                    _record(word, 'allcaps variants')
            elif _rescued(lowered) and not is_placeholder:
                _record(word, 'allcaps variants')
            continue

        # two-letter words such as 'En' are regular capitalization
        if len(word) <= 2:
            continue

        if inverse_caps.search(word):
            family = 'inverse caps variants'
        elif alternating_caps.search(word):
            family = 'alternating caps variants'
        else:
            continue

        if not western_emoticons_all.search(word) or _rescued(lowered):
            _record(word, family)

    # family counters are excluded so that no word is counted twice
    nr_unconvcaps = sum(freq for key, freq in unconv_caps.items() if not key.endswith('variants'))

    if len(tokens) != 0:
        percentage_unconvcaps = float(nr_unconvcaps) / len(tokens)
    else:
        percentage_unconvcaps = 0

    return (nr_unconvcaps, percentage_unconvcaps, unconv_caps)


##### FEATURE: KISSES AND HUGS #####

def kisses_and_hugs(tokens_lower):
    """
    Count tokens that consist solely of 'x' (kisses) or of 'xo' sequences (kisses and hugs).

    input:
        - 'tokens_lower': a lowercased text split on whitespaces (i.e. a list of the tokens of a lowercased text)

    output:
        - the absolute number of matching tokens
        - that number divided by len(tokens_lower) (0 for an empty list)
        - a freq dict with each matching token plus the 'x-variants' / 'xoxo-variants' family counters
    """

    # a token made only of the letter 'x'
    only_x = re.compile(r'^x+$')
    # one or more 'x..o..' groups, optionally closed by a single 'x'
    x_and_o = re.compile(r'^(x+o+)+x?$')

    kisses_dict = defaultdict(int)

    for token in tokens_lower:
        if only_x.search(token):
            family = 'x-variants'
        elif x_and_o.search(token):
            family = 'xoxo-variants'
        else:
            continue
        # family counter first, then the individual spelling
        kisses_dict[family] += 1
        kisses_dict[token] += 1

    # the family counters are left out of the total so nothing is counted twice
    family_keys = ['x-variants', 'xoxo-variants']
    nr_kisses = sum(freq for key, freq in kisses_dict.items() if key not in family_keys)

    if len(tokens_lower) != 0:
        percentage_kisses = float(nr_kisses) / len(tokens_lower)
    else:
        percentage_kisses = 0

    return (nr_kisses, percentage_kisses, kisses_dict)



##### FEATURE: UNCONVENTIONAL PUNCTUATION #####

# remark: the analysis of ALL kinds of unconventional punct. would lead to a considerable overlap with the detected emoticons and punctuation flooding
# which is why, in this function, we will only analyze combinations of ? and ! (e.g. 'hello?!?!')

def unconv_combis(tokens):
    """
    Count punctuation runs that mix question and exclamation marks (e.g. 'hello?!?!').

    input:
        - 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

    output:
        - the absolute number of mixed '?'/'!' combinations
        - that number divided by all counted occurrences of '?' and '!'
          (mixed combinations + single marks + repeated identical marks); 0 if there are none
        - a freq dict of the mixed combinations
    """

    # a run containing at least one '!' and one '?'
    mixed_marks = re.compile(r'(![!?]*\?[!?]*)|(\?[!?]*![!?]*)')
    # a lone '?' or '!'
    single_punct = re.compile(r'^[\?!]$')
    # the same mark repeated ('??', '!!!')
    flooded_punct = re.compile(r'^(\?|!)\1{1,}$')
    # used to cut the word characters away from a token
    word_chars = re.compile(r'\w+')

    # keep only the non-word chunks of every token
    clean_punc = [chunk
                  for token in tokens
                  for chunk in word_chars.split(token)
                  if chunk != ""]

    unconv_combis_dict = defaultdict(int)
    other_occurrences = 0

    for chunk in clean_punc:
        if mixed_marks.search(chunk):
            unconv_combis_dict[chunk] += 1
        if single_punct.match(chunk) or flooded_punct.match(chunk):
            other_occurrences += 1

    nr_unconv_combis = sum(unconv_combis_dict.values())
    total_occ = other_occurrences + nr_unconv_combis

    if total_occ != 0:
        percentage_unconv_combis = float(nr_unconv_combis) / total_occ
    else:
        percentage_unconv_combis = 0

    return (nr_unconv_combis, percentage_unconv_combis, unconv_combis_dict)


##### FEATURE: LEETSPEAK #####



# tokens that mix letters and digits but are regular words, not leetspeak
not_leetspeak = [
    '9gag', 'ps3', 'ps4', 'ps5', 'ps1', 'ps2',
    'a1', 'a2', 'a3', 'a4', 'a5', 'c4', 'k3',
    'mp3', 'mp4', 'a12', 'e19', 'e17', 'e40', 'e313',
]

# a token needs at least one letter ...
letters = re.compile(r'[a-z]|[A-Z]+')
# ... and at least one digit or special sign to be a leetspeak candidate
leetspeak_signs = re.compile(r'[0-9@\$€£]+')


def leetspeak(tokens):
    """
    Count leetspeak words: tokens combining letters with digits or special
    signs (e.g. 'w8' = 'wait', 'wh@t' = 'what').

    input:
        - 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

    output:
        - the absolute number of leetspeak tokens
        - that number divided by len(tokens) (0 for an empty list)
        - a freq dict of the lowercased leetspeak tokens
    """

    leetspeak_words = defaultdict(int)

    for token in tokens:
        token = token.lower()

        # candidates contain both a letter and a leetspeak sign
        if not (letters.search(token) and leetspeak_signs.search(token)):
            continue

        # known regular forms are skipped
        if token in not_leetspeak:
            continue

        # time references, measures, page numbers, amounts, file names, ordinals,
        # links, e-mail addresses, @-mentions and emoticons are not leetspeak either
        is_regular_form = (hour_refs.search(token) or meter.search(token) or pages.search(token)
                           or money.search(token) or files.search(token)
                           or numbers_and_ordinals.search(token) or ordinals_EN.search(token)
                           or hyperlink.search(token) or email.search(token) or ats.search(token)
                           or (emoticons([token])[0] > 0))
        if is_regular_form:
            continue

        leetspeak_words[token] += 1

    nr_leetspeak = sum(leetspeak_words.values())
    if len(tokens) > 0:
        percentage_leetspeak = float(nr_leetspeak) / len(tokens)
    else:
        percentage_leetspeak = 0

    return (nr_leetspeak, percentage_leetspeak, leetspeak_words)

##### FEATURE: CHATSPEAK ABBREVIATIONS AND ACRONYMS ######

# we store popular* chatspeak abbreviations and acronyms in a dictionary, along with their original (longer) word(s)
# *popular among Flemish teenagers


 
def abbrev(clean_tokens_lower):
    """
    Count chatspeak abbreviations and acronyms in a text.

    Every token is looked up in three module-level dictionaries
    (abbrev_dict, acronym_dict_EN, acronym_dict_IT). Each dictionary maps a
    full word / phrase to one short form (str) or several short forms (list).
    A token found in more than one dictionary is counted once per dictionary.

    input:
        - 'clean_tokens_lower': a lowercased text split on whitespaces and punctuation marks

    output:
        - the absolute number of abbrev. and acronyms in the text
        - that number divided by len(clean_tokens_lower) (0 for an empty list)
        - a freq dict with the short forms plus '... variants' counters
          (one per family and one per full word / phrase)
    """

    abbrev_forms = defaultdict(int)

    def _is_short_form_of(token, val):
        # the dictionary value is either a single short form or a list of them
        return (((type(val) == str) and (token == val)) or ((type(val) == list) and (token in val)))

    for token in clean_tokens_lower:
        # (lookup table, name of the family counter), checked in this order
        lookups = (
            (abbrev_dict, 'abbreviations variants'),
            (acronym_dict_EN, 'acronyms variants'),
            (acronym_dict_IT, 'acronyms variants'),
        )
        for table, family_key in lookups:
            for full_form in table:
                if _is_short_form_of(token, table[full_form]):
                    # the short form itself, its family, and the full form it stands for
                    abbrev_forms[token] += 1
                    abbrev_forms[family_key] += 1
                    abbrev_forms[full_form + ' variants'] += 1
                    # only the first matching entry of a table is used
                    break

    # all '... variants' counters are left out of the total
    nr_abbrev_forms = sum(freq for key, freq in abbrev_forms.items() if not key.endswith('variants'))

    if len(clean_tokens_lower) != 0:
        percentage_abbrev_forms = float(nr_abbrev_forms) / len(clean_tokens_lower)
    else:
        percentage_abbrev_forms = 0

    return (nr_abbrev_forms, percentage_abbrev_forms, abbrev_forms)



##### FEATURE: LAUGHTER #####

def laughter(tokens_lower):
    """
    Count laughter tokens ('haha', 'hihi', 'ahah' and stretched variants).

    A token that matches several families (e.g. 'ahahah' contains both 'haha'
    and 'ahah') increases every matching family counter but is counted only
    once as a word, so the total can never exceed the number of tokens.

    input:
        - 'tokens_lower': a lowercased text split on whitespaces (i.e. a list of the tokens of a lowercased text)

    output:
        - the absolute number of laughter tokens in the text
        - that number divided by len(tokens_lower) (0 for an empty list)
        - a freq dict with each laughter word (punctuation stripped) plus the
          'haha-variants' / 'hihi-variants' / 'ahah-variants' family counters
    """

    laughter_dict = defaultdict(int)

    # family counter name -> pattern; patterns are searched anywhere in the token
    families = (
        ('haha-variants', re.compile(r'w?(h+a+){2,}h?')),
        ('hihi-variants', re.compile(r'(h+i+){2,}h?')),
        ('ahah-variants', re.compile(r'w?(a+h+){2,}h?')),
    )

    # punctuation that may be attached to the laughter word
    punc = re.compile(r'[\!\?\.;:,\/\-\(\)\'\"\-]+')

    for token in tokens_lower:
        matched = [name for name, pattern in families if pattern.search(token)]
        if not matched:
            continue

        for name in matched:
            laughter_dict[name] += 1

        # store the bare word exactly once, however many families matched
        only_word = punc.sub('', token)
        laughter_dict[only_word] += 1

    # the family counters are left out of the total
    dont_include = ['haha-variants', 'hihi-variants', 'ahah-variants']
    nr_laughter = sum(laughter_dict[key] for key in laughter_dict if key not in dont_include)

    if len(tokens_lower) != 0:
        percentage_laughter = float(nr_laughter) / len(tokens_lower)
    else:
        percentage_laughter = 0

    return (nr_laughter, percentage_laughter, laughter_dict)


##### FEATURE: DISCOURSE MARKER #####

def discourse(tokens):
    """
    Count Twitter-style discourse markers: hashtags and @-mentions.

    input:
        - 'tokens': a text split on whitespaces (i.e. a list of the tokens of a text)

    output:
        - the absolute number of markers
        - that number divided by len(tokens) (0 for an empty list)
        - a freq dict with the lowercased markers plus the
          'hashtag variants' / 'ats variants' family counters
    """
    # '#' followed by word characters only
    hashtags = re.compile(r'^#\w+$')
    # '@' followed by ASCII letters, optionally closed by a colon
    ats = re.compile(r'^@[a-zA-Z]+:?$')

    discourse_markers = defaultdict(int)

    for token in tokens:
        token_lower = token.lower()

        # matching is done on the token as written; the lowercased form is stored
        if hashtags.search(token):
            family = 'hashtag variants'
        elif ats.search(token):
            family = 'ats variants'
        else:
            continue

        discourse_markers[token_lower] += 1
        discourse_markers[family] += 1

    # family counters are left out of the total
    nr_discourse_markers = sum(freq for key, freq in discourse_markers.items() if not key.endswith('variants'))

    if len(tokens) > 0:
        percentage_discourse_markers = float(nr_discourse_markers) / len(tokens)
    else:
        percentage_discourse_markers = 0

    return (nr_discourse_markers, percentage_discourse_markers, discourse_markers)
 
 
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def italian_tokenizer(text: str) -> list:
    """Tokenize Italian text and split tokens on the apostrophe.

    nltk's word_tokenize is run with language='italian'; every resulting
    token is then split on "'" because the tokenizer does not separate
    elided forms. Empty pieces produced by that split (e.g. from a trailing
    apostrophe) are dropped.

    If the nltk "punkt" model is missing, the user is asked whether it
    should be downloaded; after a successful download the text is tokenized.

    Args:
        text (str): Any string

    Returns:
        list: A list of non-empty strings, the tokenized input text.

    Raises:
        LookupError: if "punkt" is missing and the user declines the download.
    """

    def _tokenize(raw: str) -> list:
        # filter AFTER the apostrophe split, otherwise "po'" yields an empty token
        words = word_tokenize(raw, language='italian')
        return [piece for word in words for piece in word.split("'") if piece]

    try:
        return _tokenize(text)

    except LookupError:

        x = input('Model "punkt" is not installed yet, do you want to install it? Y | N')
        if x.lower() != 'y':
            print('Please download "punkt" to used cometaNLP tokenizer')
            # propagate the error instead of returning an exception class as data
            raise

        print('Downloading model...')
        nltk.download('punkt')
        print('Downloaded!')
        return _tokenize(text)
#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def preprocessor(txt: str) -> str:
    """Normalize a raw comment into lowercase text with placeholders.

    The transformations are applied in this order:
      1. hashtags and @-mentions are replaced by the placeholder "HASH";
      2. URLs (anything starting with "http", or a word starting with "www.")
         are replaced by the placeholder "URL";
      3. the text is lowercased (placeholders become "hash" / "url");
      4. every punctuation character except the apostrophe becomes a space
         (the apostrophe is kept for the tokenizer);
      5. whitespace runs are collapsed and leading/trailing spaces removed.

    Args:
        txt (str): Any string

    Returns:
        str: The normalized text.
    """

    # hashtag placeholder
    txt = re.sub(r"#[A-Za-z0-9_]+", "HASH", txt)

    # user tags share the hashtag placeholder
    txt = re.sub(r"@[A-Za-z0-9_]+", "HASH", txt)

    # url placeholder; the dot is escaped and a word boundary required so that
    # interjections such as "awwwww" are not mistaken for a web address
    txt = re.sub(r"http\S+", "URL", txt)
    txt = re.sub(r"\bwww\.\S+", "URL", txt)

    txt = txt.lower()

    # ASCII punctuation without the apostrophe
    punctuation = set('!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~')
    txt = "".join(" " if char in punctuation else char for char in txt)

    # collapse whitespace, then trim both ends
    txt = re.sub(r"\s+", " ", txt).strip()

    return txt
#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def demojizer(text: str) -> str:
    """Replace every emoji in a comment with the placeholder ' EMOJI '.

    Args:
        text (str): Any string

    Returns:
        str: The input text with each emoji substituted by ' EMOJI '
            (surrounding spaces included).
    """
    return emoji.replace_emoji(string=text, replace=' EMOJI ')
#-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------        
def comment_length(l: list) -> int:
    """Return the length of a tokenized comment.

    Args:
        l (list): A list of tokens

    Returns:
        int: The number of items in the list
    """
    return len(l)

#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def word_counts(l: list) -> dict:
    """Count how often each token occurs in a tokenized comment.

    Args:
        l (list): Any list of strings

    Returns:
        dict: A Counter mapping every token to its number of occurrences
    """
    # the generator makes Counter tally the elements one by one
    return Counter(token for token in l)
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def type_token_ratio(tokens: list) -> float:
    """Compute the type-token ratio of a tokenized comment.

    The type-token ratio is the number of distinct tokens (types) divided by
    the total number of tokens.

    Args:
        tokens (list): Any list of strings

    Returns:
        float: The type-token ratio; 0 when the list is empty
    """
    n_types = len(set(tokens))
    n_tokens = len(tokens)

    # an empty comment has no tokens to divide by; its type count (0) is returned
    if n_tokens == 0:
        return n_types
    return n_types / n_tokens
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def content_function_ratio(tokens: list) -> float:
    """Return the share of content words in a tokenized comment.

    A token is a function word when its lowercase form is in nltk's Italian
    stopword list; every other token is a content word. The result is the
    number of content-word tokens divided by the total number of tokens.

    NOTE(review): the name suggests content / function words, but the value
    has always been normalised by len(tokens); that denominator is kept.

    Args:
        tokens (list): Any list of strings

    Returns:
        float: content-word tokens / all tokens; 0 when the list is empty
    """
    stop = set(stopwords.words('italian'))

    # count every occurrence (the former `=+ 1` stored 1 per distinct word,
    # so repeated content words were silently ignored)
    content_count = sum(1 for word in tokens if word.lower() not in stop)

    if len(tokens) != 0:
        return content_count / len(tokens)

    return content_count

def further_cleaning(tokens:list) -> list:
    """Drop the placeholder tokens 'url', 'hash' and 'emoji' (case-insensitive).

    Args:
        tokens (list): Any list of strings

    Returns:
        list: The tokens that are not placeholders, in their original order
    """
    placeholders = ['url', 'hash', 'emoji']
    kept = []
    for item in tokens:
        if item.lower() not in placeholders:
            kept.append(item)
    return kept
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def unique_emojis(string:str) -> int:
    """Return the number of distinct emojis in `string`, as reported by emoji.emoji_count."""
    return emoji.emoji_count(string=string, unique=True)

In [None]:
# Mount the drive at /content/gdrive so the dataset paths used below resolve.
# NOTE(review): `drive` is presumably google.colab's helper; its import is not visible in this section.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the Facebook training split: tab-separated, no header row, so the
# positional columns 0/1/2 are renamed to ID / text / label.
ITA_FB_TRAIN=pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Thesis/FB-folder/FB-train/haspeede_FB-train.tsv', sep='\t', header=None)
ITA_FB_TRAIN = ITA_FB_TRAIN.rename(columns={0: 'ID', 1: 'text', 2: 'label'})

# Load the test split the same way (rename ignores positions missing from the file).
ITA_FB_TEST=pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Thesis/FB-folder/FB-test/haspeede_FB-test.tsv', sep='\t', header=None)
ITA_FB_TEST = ITA_FB_TEST.rename(columns={0: 'ID', 1: 'text', 2: 'label'})




In [None]:
### Facebook

#ITA_FB_TRAIN = pd.read_csv("C:/Users/Leonardo/Desktop/Thesis-Internship/Data/FB-folder/FB-train/haspeede_FB-train.tsv", sep='\t', header=None)
#ITA_FB_TRAIN = ITA_FB_TRAIN.rename(columns={0: 'ID', 1: 'text', 2: 'HS'})


#ITA_FB_TEST = pd.read_csv("C:/Users/Leonardo/Desktop/Thesis-Internship/Data/FB-folder/FB-test/haspeede_FB-test.tsv", sep='\t', header=None)
#ITA_FB_TEST = ITA_FB_TEST.rename(columns={0: 'ID', 1: 'text', 2: 'HS'})

In [None]:
# Preview the first rows of the test frame as a sanity check of the load.
ITA_FB_TEST.head()


Unnamed: 0,ID,text
0,1,Ma....anche no!
1,2,Ma dove vivono ?
2,3,Le vai a impollinare tu le piante e gli alberi...
3,4,Ma manda li a quel paese questi zingari bugiardi
4,5,"Complimenti a chi sostiene ancora questa ""poli..."


In [None]:
# split_textNEW (defined earlier in the notebook) returns five values per text;
# zip(*...) transposes the per-row tuples into the five columns listed on the left.
ITA_FB_TRAIN['tokens'], ITA_FB_TRAIN['clean_tokens'], ITA_FB_TRAIN['clean_tokens_lower'], ITA_FB_TRAIN['tokens_lower'], ITA_FB_TRAIN['text_lower'] = zip(*ITA_FB_TRAIN['text'].map(split_textNEW))

In [None]:
# Same five token / text columns for the test split.
ITA_FB_TEST['tokens'], ITA_FB_TEST['clean_tokens'], ITA_FB_TEST['clean_tokens_lower'], ITA_FB_TEST['tokens_lower'], ITA_FB_TEST['text_lower'] = zip(*ITA_FB_TEST['text'].map(split_textNEW))

In [None]:
# emoticons() returns (count, share of tokens, freq dict) per token list; one column each.
ITA_FB_TRAIN['nr_emoticons'], ITA_FB_TRAIN['percentage_emoticons'], ITA_FB_TRAIN['emoticons'] = zip(*ITA_FB_TRAIN['tokens'].map(emoticons))

In [None]:
# Emoticon features for the test split (count, share of tokens, freq dict).
ITA_FB_TEST['nr_emoticons'], ITA_FB_TEST['percentage_emoticons'], ITA_FB_TEST['emoticons'] = zip(*ITA_FB_TEST['tokens'].map(emoticons))

In [None]:
# Unpack the three values returned by letter_flooding (defined earlier in the notebook) into columns.
ITA_FB_TRAIN['number_flooding_letters'],ITA_FB_TRAIN['percentage_flooding_letters'],ITA_FB_TRAIN['flooding_letters'] = zip(*ITA_FB_TRAIN['tokens'].map(letter_flooding))

In [None]:
# Letter-flooding features for the test split.
ITA_FB_TEST['number_flooding_letters'],ITA_FB_TEST['percentage_flooding_letters'],ITA_FB_TEST['flooding_letters'] = zip(*ITA_FB_TEST['tokens'].map(letter_flooding))

In [None]:
# punct_flooding (defined earlier in the notebook) yields four values per row, stored in four columns.
ITA_FB_TRAIN['number_flooding_punct'], ITA_FB_TRAIN['percentage_flooding_punct'], ITA_FB_TRAIN['flooding_punctuation'], ITA_FB_TRAIN['total_punct'] = zip(*ITA_FB_TRAIN['tokens'].map(punct_flooding))

In [None]:
# Punctuation-flooding features for the test split.
ITA_FB_TEST['number_flooding_punct'], ITA_FB_TEST['percentage_flooding_punct'], ITA_FB_TEST['flooding_punctuation'], ITA_FB_TEST['total_punct'] = zip(*ITA_FB_TEST['tokens'].map(punct_flooding))

In [None]:
# Distribution of the punctuation-flooding count over the test comments.
ITA_FB_TEST.number_flooding_punct.value_counts()

0    838
1    126
2     29
3      5
5      1
4      1
Name: number_flooding_punct, dtype: int64

In [None]:
# unconv_capitalization() returns (count, share of tokens, freq dict); one column each.
ITA_FB_TRAIN['nr_unconvcaps'], ITA_FB_TRAIN['percentage_unconvcaps'], ITA_FB_TRAIN['unconv_caps'] = zip(*ITA_FB_TRAIN['tokens'].map(unconv_capitalization))

In [None]:
# Unconventional-capitalization features for the test split.
ITA_FB_TEST['nr_unconvcaps'], ITA_FB_TEST['percentage_unconvcaps'], ITA_FB_TEST['unconv_caps'] = zip(*ITA_FB_TEST['tokens'].map(unconv_capitalization))

In [None]:
# kisses_and_hugs() matches lowercase 'x' / 'xo' sequences only, so it must be fed
# the lowercased tokens; with the cased tokens 'XXX' or 'XOXO' would be missed.
ITA_FB_TRAIN['nr_kisses'],ITA_FB_TRAIN['percentage_kisses'],ITA_FB_TRAIN['kisses_dict'] = zip(*ITA_FB_TRAIN['tokens_lower'].map(kisses_and_hugs))

In [None]:
# kisses_and_hugs() matches lowercase 'x' / 'xo' sequences only, so it must be fed
# the lowercased tokens; with the cased tokens 'XXX' or 'XOXO' would be missed.
ITA_FB_TEST['nr_kisses'],ITA_FB_TEST['percentage_kisses'],ITA_FB_TEST['kisses_dict'] = zip(*ITA_FB_TEST['tokens_lower'].map(kisses_and_hugs))

In [None]:
# unconv_combis() returns (count, share among all '?'/'!' occurrences, freq dict); one column each.
ITA_FB_TRAIN['nr_unconv_combis'], ITA_FB_TRAIN['percentage_unconv_combis'], ITA_FB_TRAIN['unconv_combis_dict'] = zip(*ITA_FB_TRAIN['tokens'].map(unconv_combis))

In [None]:
# Mixed '?'/'!' combination features for the test split.
ITA_FB_TEST['nr_unconv_combis'], ITA_FB_TEST['percentage_unconv_combis'], ITA_FB_TEST['unconv_combis_dict'] = zip(*ITA_FB_TEST['tokens'].map(unconv_combis))

In [None]:
# leetspeak() lowercases each token itself, so the cased tokens can be passed directly.
ITA_FB_TRAIN['nr_leetspeak'], ITA_FB_TRAIN['percentage_leetspeak'], ITA_FB_TRAIN['leetspeak_words'] = zip(*ITA_FB_TRAIN['tokens'].map(leetspeak))

In [None]:
# Leetspeak features for the test split (count, share of tokens, freq dict).
ITA_FB_TEST['nr_leetspeak'], ITA_FB_TEST['percentage_leetspeak'], ITA_FB_TEST['leetspeak_words'] = zip(*ITA_FB_TEST['tokens'].map(leetspeak))

In [None]:
# abbrev() compares tokens by equality against lowercase short forms, so it needs the
# lowercased, punctuation-split tokens; cased tokens with attached punctuation never match.
ITA_FB_TRAIN['nr_abbrev_forms'], ITA_FB_TRAIN['percentage_abbrev_forms'], ITA_FB_TRAIN['abbrev_forms'] = zip(*ITA_FB_TRAIN['clean_tokens_lower'].map(abbrev))

In [None]:
# abbrev() compares tokens by equality against lowercase short forms, so it needs the
# lowercased, punctuation-split tokens; cased tokens with attached punctuation never match.
ITA_FB_TEST['nr_abbrev_forms'], ITA_FB_TEST['percentage_abbrev_forms'], ITA_FB_TEST['abbrev_forms'] = zip(*ITA_FB_TEST['clean_tokens_lower'].map(abbrev))

In [None]:
# laughter() uses lowercase patterns, so it must receive the lowercased tokens;
# with the cased tokens 'AHAHAH' would not be detected.
ITA_FB_TRAIN['nr_laughter'], ITA_FB_TRAIN['percentage_laughter'], ITA_FB_TRAIN['laughter_dict'] = zip(*ITA_FB_TRAIN['tokens_lower'].map(laughter))

In [None]:
# laughter() uses lowercase patterns, so it must receive the lowercased tokens;
# with the cased tokens 'AHAHAH' would not be detected.
ITA_FB_TEST['nr_laughter'], ITA_FB_TEST['percentage_laughter'], ITA_FB_TEST['laughter_dict'] = zip(*ITA_FB_TEST['tokens_lower'].map(laughter))

In [None]:
# discourse() returns (count, share of tokens, freq dict). The share gets its own
# column; writing it to 'discourse_markers' let the freq dict overwrite it.
ITA_FB_TRAIN['nr_discourse_markers'], ITA_FB_TRAIN['percentage_discourse_markers'], ITA_FB_TRAIN['discourse_markers'] = zip(*ITA_FB_TRAIN['tokens'].map(discourse))

In [None]:
# discourse() returns (count, share of tokens, freq dict). The share gets its own
# column; writing it to 'discourse_markers' let the freq dict overwrite it.
ITA_FB_TEST['nr_discourse_markers'], ITA_FB_TEST['percentage_discourse_markers'], ITA_FB_TEST['discourse_markers'] = zip(*ITA_FB_TEST['tokens'].map(discourse))

## Lexical diversity

In [None]:
# Normalize the raw text of each split with preprocessor(). Each frame must be
# built from its OWN text column: deriving the test column from the training
# text would index-align training comments onto the test rows.
ITA_FB_TRAIN['tokens_2'] = ITA_FB_TRAIN['text'].apply(preprocessor)
ITA_FB_TEST['tokens_2'] = ITA_FB_TEST['text'].apply(preprocessor)


In [None]:
# Replace emojis with the ' EMOJI ' placeholder, then tokenize; after this cell
# 'tokens_2' holds a list of tokens instead of a string.
ITA_FB_TRAIN['tokens_2'] = ITA_FB_TRAIN['tokens_2'].apply(demojizer)
ITA_FB_TRAIN['tokens_2'] = ITA_FB_TRAIN['tokens_2'].apply(italian_tokenizer)

# Same two steps for the test split.
ITA_FB_TEST['tokens_2'] = ITA_FB_TEST['tokens_2'].apply(demojizer)
ITA_FB_TEST['tokens_2'] = ITA_FB_TEST['tokens_2'].apply(italian_tokenizer)

In [None]:
# Comment length = number of tokens in 'tokens_2' (placeholders included).
ITA_FB_TRAIN['comment_length'] = ITA_FB_TRAIN['tokens_2'].apply(comment_length)
ITA_FB_TEST['comment_length'] = ITA_FB_TEST['tokens_2'].apply(comment_length)

In [None]:
# 'tokens_3' = 'tokens_2' without the url / hash / emoji placeholder tokens.
ITA_FB_TRAIN['tokens_3'] = ITA_FB_TRAIN['tokens_2'].apply(further_cleaning)
ITA_FB_TEST['tokens_3'] = ITA_FB_TEST['tokens_2'].apply(further_cleaning)


In [None]:
# CFR is computed on the placeholder-free tokens ('tokens_3'),
# TTR on the tokens that still contain the placeholders ('tokens_2').
ITA_FB_TRAIN['CFR'] = ITA_FB_TRAIN['tokens_3'].apply(content_function_ratio)
ITA_FB_TRAIN['TTR'] = ITA_FB_TRAIN['tokens_2'].apply(type_token_ratio)

# Same two features for the test split.
ITA_FB_TEST['CFR'] = ITA_FB_TEST['tokens_3'].apply(content_function_ratio)
ITA_FB_TEST['TTR'] = ITA_FB_TEST['tokens_2'].apply(type_token_ratio)

In [None]:
# Number of distinct emojis, counted on the raw (unprocessed) text.
ITA_FB_TRAIN['unique_emojis'] = ITA_FB_TRAIN['text'].apply(unique_emojis)
ITA_FB_TEST['unique_emojis'] = ITA_FB_TEST['text'].apply(unique_emojis)

In [None]:
# Sanity check: compare the number of columns in the two frames.
print(len(ITA_FB_TRAIN.columns), len(ITA_FB_TEST.columns))

44 43


# Additional columns with different characteristics


In [None]:
def emoji_throw_away(text):
    """Return `text` with every emoji recognised by demoji replaced by a single space."""
    return demoji.replace(string=text, repl=' ')

# Emoji-free version of the lowercased text, for both splits.
ITA_FB_TRAIN['text_lower_no_emoji'] = ITA_FB_TRAIN['text_lower'].apply(emoji_throw_away)
ITA_FB_TEST['text_lower_no_emoji'] = ITA_FB_TEST['text_lower'].apply(emoji_throw_away)

In [None]:
# Run the same normalization (placeholders, punctuation removal, whitespace collapse)
# on the emoji-free text; the column is overwritten in place.
ITA_FB_TRAIN['text_lower_no_emoji'] = ITA_FB_TRAIN['text_lower_no_emoji'].apply(preprocessor)
ITA_FB_TEST['text_lower_no_emoji'] = ITA_FB_TEST['text_lower_no_emoji'].apply(preprocessor)
# Display the resulting training column.
ITA_FB_TRAIN['text_lower_no_emoji']

0                   io voterò no renzi deve andare a casa
1                                 poi si sentono fiere eh
2                                                   belli
3                                                  arrusi
4       sono indigeste fanno anche venire la colica in...
                              ...                        
2995                  no sono insopportabili e inquirenti
2996    matteo iniziamo a multare pesantemente i write...
2997      pur di prende soldi si venderebbe anche il culo
2998         governo golpista a morte per alto tradimento
2999                      la malpezzi non la posso vedere
Name: text_lower_no_emoji, Length: 3000, dtype: object

In [None]:

def PNR(tokens: list) -> float:
    """Return the number of punctuation characters per token.

    Every character of every token is checked against the ASCII punctuation
    set (apostrophe excluded); the total is divided by the number of tokens.

    Args:
        tokens (list): Any list of strings

    Returns:
        float: punctuation characters / tokens; 0.0 for an empty list
    """
    # '&' is written literally: the former '&amp;' entity made the letters
    # 'a', 'm' and 'p' count as punctuation
    punctuation = set('!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')

    if not tokens:
        return 0.0

    # each punctuation character is counted exactly once, over ALL tokens
    count = sum(1 for token in tokens for char in token if char in punctuation)

    return count / len(tokens)

from nltk.tokenize import WordPunctTokenizer

def emoji_throw_away(text):
    """Return `text` with every emoji recognised by demoji replaced by a single space."""
    # NOTE(review): same definition as the one in the earlier 'text_lower_no_emoji' cell
    return demoji.replace(string=text, repl=' ')

# Strip the emojis from the raw text, then tokenize THAT emoji-free text.
# Tokenizing `.text` again in the second step would overwrite 'tokens_4' and
# discard the emoji removal.
ITA_FB_TRAIN['tokens_4'] = ITA_FB_TRAIN.text.apply(emoji_throw_away)
ITA_FB_TRAIN['tokens_4'] = ITA_FB_TRAIN['tokens_4'].apply(WordPunctTokenizer().tokenize)
ITA_FB_TRAIN['PNR'] = ITA_FB_TRAIN.tokens_4.apply(PNR)

# Same three steps for the test split.
ITA_FB_TEST['tokens_4'] = ITA_FB_TEST.text.apply(emoji_throw_away)
ITA_FB_TEST['tokens_4'] = ITA_FB_TEST['tokens_4'].apply(WordPunctTokenizer().tokenize)
ITA_FB_TEST['PNR'] = ITA_FB_TEST.tokens_4.apply(PNR)

# Saving the DF for future work

In [None]:
# Write the engineered training frame to a local CSV, then copy it to the mounted Drive folder.
ITA_FB_TRAIN.to_csv('data_train.csv')
!cp data_train.csv "/content/gdrive/MyDrive/Colab Notebooks/Thesis/"

In [None]:
# Write the engineered test frame to a local CSV, then copy it to the mounted Drive folder.
ITA_FB_TEST.to_csv('data_test.csv')
!cp data_test.csv "/content/gdrive/MyDrive/Colab Notebooks/Thesis/"