In [1]:
import os, re
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import nltk

from string import digits, punctuation

from keras.preprocessing.text import Tokenizer 
from nltk.tokenize.casual import reduce_lengthening
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Read the sentipolc16 training CSV into a DataFrame and preview its first rows.

TRAIN_CSV = 'data_it/training_set_sentipolc16.csv/training_set_sentipolc16.csv'
train = pd.read_csv(TRAIN_CSV)

train.head()

Unnamed: 0,idtwitter,subj,opos,oneg,iro,lpos,lneg,top,text
0,122449983151669248,1,0,1,0,0,1,1,Intanto la partita per Via Nazionale si compli...
1,125485104863780865,1,0,1,0,0,1,1,"False illusioni, sgradevoli realtà Mario Monti..."
2,125513454315507712,1,0,1,0,0,1,1,"False illusioni, sgradevoli realtà #editoriale..."
3,125524238290522113,1,0,1,0,0,1,1,Mario Monti: Berlusconi risparmi all'Italia il...
4,125527933224886272,1,0,1,0,0,1,1,Mario Monti: Berlusconi risparmi all'Italia il...


In [3]:
# The test CSV is read without a header row; only columns 0 and 8 are kept
# and then renamed to match the corresponding training columns.
test = pd.read_csv(
    'data_it/test_set_sentipolc16_gold2000.csv/test_set_sentipolc16_gold2000.csv',
    header=None,
    usecols=[0, 8],
)
test.columns = ['idtwitter', 'text']
test.head()

Unnamed: 0,idtwitter,text
0,507074506880712705,Tra 5 minuti presentazione piano scuola del go...
1,507075789456961536,\@matteorenzi: Alle 10 appuntamento su http://...
2,507077511902425088,#labuonascuola gli #evangelisti #digitali non ...
3,507079183315787777,Riforma scuola Tutto il discorso di Renzi su ...
4,507080190225563648,.@matteorenzi @MiurSocial #labuonascuola basta...


In [4]:
len(train)

7410

In [5]:
len(test)

1998

In [6]:
# control of null elements

train.isnull().any()

idtwitter    False
subj         False
opos         False
oneg         False
iro          False
lpos         False
lneg         False
top          False
text         False
dtype: bool

In [7]:
train.isna().any()

idtwitter    False
subj         False
opos         False
oneg         False
iro          False
lpos         False
lneg         False
top          False
text         False
dtype: bool

In [8]:
test.isnull().any()

idtwitter    False
text         False
dtype: bool

In [9]:
test.isna().any()

idtwitter    False
text         False
dtype: bool

In [10]:
# Raw training tweets: the `text` column of the training frame.
x_tr = train['text']

In [11]:
# Raw test tweets: the `text` column of the test frame.
x_ts = test['text']

# Text normalization: x_1 -> without punctuation and digits

In [12]:
# Delete every character listed in string.punctuation from the raw tweets.
# The table maps ' ' onto itself and marks each punctuation character for removal.

punct_table = str.maketrans(' ', ' ', punctuation)
x_tr_1 = x_tr.str.translate(punct_table)
x_ts_1 = x_ts.str.translate(punct_table)

In [13]:
x_tr_1[:2]

0    Intanto la partita per Via Nazionale si compli...
1    False illusioni sgradevoli realtà Mario Monti ...
Name: text, dtype: object

In [14]:
# Delete every ASCII digit from the punctuation-free tweets.

digit_table = str.maketrans(' ', ' ', digits)
x_tr_1 = x_tr_1.str.translate(digit_table)
x_ts_1 = x_ts_1.str.translate(digit_table)

In [15]:
x_tr_1[:5]

0    Intanto la partita per Via Nazionale si compli...
1    False illusioni sgradevoli realtà Mario Monti ...
2    False illusioni sgradevoli realtà editoriale d...
3    Mario Monti Berlusconi risparmi allItalia il b...
4    Mario Monti Berlusconi risparmi allItalia il b...
Name: text, dtype: object

# Tokenization

In [16]:
# Upper bound on the vocabulary size handed to the Keras tokenizer.
max_features = 30000

# Build the word index from the training text only (punctuation and digits already removed).
# NOTE(review): this one tokenizer is reused further down for the stop-word-filtered,
# lemmatized and stemmed variants; tokens that never occur in x_tr_1 have no index there,
# so those variants may silently lose words -- confirm this is intended.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_tr_1.tolist())

In [17]:
# Convert each tweet into its list of integer word ids using the fitted tokenizer.
x_tr_tokenized, x_ts_tokenized = (
    tokenizer.texts_to_sequences(texts) for texts in (x_tr_1, x_ts_1)
)

In [18]:
# Bring every id sequence to one common length so the samples form a rectangular array.

max_len = 200
X_tr_1, X_ts_1 = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (x_tr_tokenized, x_ts_tokenized)
)

In [19]:
X_tr_1[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

# Text normalization: x_2 -> without URLs, abbreviations, stop words

In [20]:
# Normalisation rules as (regex, replacement) pairs, meant to be applied in list order.
#
# Ordering and anchoring matter:
#   * links are stripped FIRST, so the abbreviation rules can never fire inside a URL
#     (random short-link ids frequently contain "cm", "km" or "xk"; expanding them there
#     inserts spaces and leaves fragments that the URL rule no longer removes);
#   * texting abbreviations are matched as whole words only (\b ... \b), so ordinary
#     tokens that merely contain those letters are left untouched.

abbr = [
    # URLs
    (r'http\S+', r' '),
    (r'www\S+', r' '),
    # line breaks and tabs become plain spaces
    (r'\n', r' '),
    (r'\t', r' '),
    # texting abbreviations -> full Italian word
    (r'\bxch[èeé]\b', r'perché'),
    (r'\bxk[éeè]*\b', r'perché'),
    (r'\banke\b', r'anche'),
    (r'\b[kc]m\b', r'come'),
    # asterisk-censored swear words -> uncensored form
    (r'c\*\*\*o', r'cazzo'),
    (r'ca\*\*o', r'cazzo'),
    (r'caz\*o', r'cazzo'),
    (r'c\*\*\*\*', r'cazzo'),
    (r'f\*\*\*\*\*o', r'fanculo'),
    (r'vaf*\*\*\*\*\*o', r'vaffanculo'),
] # asterisk-word list adapted from https://www.kaggle.com/tunguz/bi-gru-lstm-dual-embedding-new-test-cleaned-5

In [21]:
def clean_comment(comment):
    """
    Normalize a single comment.

    Each (pattern, replacement) pair of the module-level ``patterns`` list is
    applied to the text in list order, every substitution working on the result
    of the previous one.

    comment: a string
    returns: the normalized string
    """
    for regex, replacement in patterns:
        comment = re.sub(regex, replacement, comment)
    return comment

In [22]:
# Pre-compile every rule of `abbr` once; clean_comment reads this list at call time.
patterns = [(re.compile(rx), rp) for rx, rp in abbr]

In [23]:
# Run the normalisation rules over every raw tweet.
x_tr_abbr = x_tr.apply(clean_comment)
x_ts_abbr = x_ts.apply(clean_comment)

In [24]:
x_tr_abbr[:5]

0    Intanto la partita per Via Nazionale si compli...
1    False illusioni, sgradevoli realtà Mario Monti...
2    False illusioni, sgradevoli realtà #editoriale...
3    Mario Monti: Berlusconi risparmi all'Italia il...
4    Mario Monti: Berlusconi risparmi all'Italia il...
Name: text, dtype: object

In [25]:
# Lowercase every normalised tweet.

x_tr_clean = x_tr_abbr.map(str.lower)
x_ts_clean = x_ts_abbr.map(str.lower)

In [26]:
x_tr_clean[:5]

0    intanto la partita per via nazionale si compli...
1    false illusioni, sgradevoli realtà mario monti...
2    false illusioni, sgradevoli realtà #editoriale...
3    mario monti: berlusconi risparmi all'italia il...
4    mario monti: berlusconi risparmi all'italia il...
Name: text, dtype: object

In [27]:
# Italian stop words from the NLTK `stopwords` corpus, materialised as a list.

list_sw = [*stopwords.words('italian')]

In [28]:
list_sw

['ad',
 'al',
 'allo',
 'ai',
 'agli',
 'all',
 'agl',
 'alla',
 'alle',
 'con',
 'col',
 'coi',
 'da',
 'dal',
 'dallo',
 'dai',
 'dagli',
 'dall',
 'dagl',
 'dalla',
 'dalle',
 'di',
 'del',
 'dello',
 'dei',
 'degli',
 'dell',
 'degl',
 'della',
 'delle',
 'in',
 'nel',
 'nello',
 'nei',
 'negli',
 'nell',
 'negl',
 'nella',
 'nelle',
 'su',
 'sul',
 'sullo',
 'sui',
 'sugli',
 'sull',
 'sugl',
 'sulla',
 'sulle',
 'per',
 'tra',
 'contro',
 'io',
 'tu',
 'lui',
 'lei',
 'noi',
 'voi',
 'loro',
 'mio',
 'mia',
 'miei',
 'mie',
 'tuo',
 'tua',
 'tuoi',
 'tue',
 'suo',
 'sua',
 'suoi',
 'sue',
 'nostro',
 'nostra',
 'nostri',
 'nostre',
 'vostro',
 'vostra',
 'vostri',
 'vostre',
 'mi',
 'ti',
 'ci',
 'vi',
 'lo',
 'la',
 'li',
 'le',
 'gli',
 'ne',
 'il',
 'un',
 'uno',
 'una',
 'ma',
 'ed',
 'se',
 'perché',
 'anche',
 'come',
 'dov',
 'dove',
 'che',
 'chi',
 'cui',
 'non',
 'più',
 'quale',
 'quanto',
 'quanti',
 'quanta',
 'quante',
 'quello',
 'quelli',
 'quella',
 'quelle',
 'q

In [29]:
# Drop Italian stop words from the whitespace-split tweets.
#
# Punctuation is only stripped in a later step, so tokens here may still carry it
# ("ma," / "non."). Each token is therefore compared with its surrounding punctuation
# removed; otherwise such stop words slip through and reappear as plain words once the
# punctuation is deleted. A frozenset gives constant-time membership tests.
_stop_words = frozenset(list_sw)

x_tr_clean = x_tr_clean.apply(
    lambda text: ' '.join(w for w in text.split() if w.strip(punctuation) not in _stop_words)
)
x_ts_clean = x_ts_clean.apply(
    lambda text: ' '.join(w for w in text.split() if w.strip(punctuation) not in _stop_words)
)

In [30]:
x_tr_clean[:5]

0    intanto partita via nazionale complica. #sacco...
1    false illusioni, sgradevoli realtà mario monti...
2    false illusioni, sgradevoli realtà #editoriale...
3    mario monti: berlusconi risparmi all'italia bi...
4    mario monti: berlusconi risparmi all'italia bi...
Name: text, dtype: object

In [31]:
# Collapse character runs of length three or more via NLTK's reduce_lengthening.

x_tr_clean = x_tr_clean.apply(reduce_lengthening)
x_ts_clean = x_ts_clean.apply(reduce_lengthening)

In [32]:
# Delete punctuation characters and digits in a single translation pass.
_strip_table = str.maketrans('', '', punctuation + digits)
x_tr_clean = x_tr_clean.str.translate(_strip_table)
x_ts_clean = x_ts_clean.str.translate(_strip_table)

In [33]:
x_tr_clean[:5]

0    intanto partita via nazionale complica saccoma...
1    false illusioni sgradevoli realtà mario monti ...
2    false illusioni sgradevoli realtà editoriale m...
3    mario monti berlusconi risparmi allitalia bias...
4    mario monti berlusconi risparmi allitalia bias...
Name: text, dtype: object

In [34]:
# Encode the cleaned tweets with the tokenizer that was fitted on x_tr_1.
x_tr_tokenized, x_ts_tokenized = (
    tokenizer.texts_to_sequences(texts) for texts in (x_tr_clean, x_ts_clean)
)

In [35]:
# Pad the encoded tweets of the second variant to the shared length `max_len`.

X_tr_2, X_ts_2 = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (x_tr_tokenized, x_ts_tokenized)
)

In [36]:
X_tr_2[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

# Lemmatization 

In [37]:
import spacy
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB, PROPN

In [38]:
# Load the spaCy pipeline registered under the name "it_core_news_sm".
nlp = spacy.load("it_core_news_sm")

In [39]:
# Lemmatizer object taken from the pipeline's vocabulary; it is later called as
# lemmatizer(token, pos_tag) and its result is joined with spaces.
# NOTE(review): `vocab.morphology.lemmatizer` looks like a spaCy 2.x-era attribute --
# confirm it exists in the installed spaCy version.
lemmatizer = nlp.vocab.morphology.lemmatizer

In [40]:
# Word-tokenize every normalised training tweet with NLTK (language='italian').
x_tr_tok = x_tr_abbr.apply(word_tokenize, language='italian')

In [41]:
# Word-tokenize every normalised test tweet with NLTK (language='italian').
x_ts_tok = x_ts_abbr.apply(word_tokenize, language='italian')

In [42]:
x_tr_tok[:5]

0    [Intanto, la, partita, per, Via, Nazionale, si...
1    [False, illusioni, ,, sgradevoli, realtà, Mari...
2    [False, illusioni, ,, sgradevoli, realtà, #, e...
3    [Mario, Monti, :, Berlusconi, risparmi, all'It...
4    [Mario, Monti, :, Berlusconi, risparmi, all'It...
Name: text, dtype: object

In [43]:
# Lowercase each token and keep only those that are not Italian stop words.

x_tr_clean_lem = x_tr_tok.apply(
    lambda toks: [low for low in (w.lower() for w in toks) if low not in list_sw]
)
x_ts_clean_lem = x_ts_tok.apply(
    lambda toks: [low for low in (w.lower() for w in toks) if low not in list_sw]
)

In [44]:
x_tr_clean_lem[:5]

0    [intanto, partita, via, nazionale, complica, ....
1    [false, illusioni, ,, sgradevoli, realtà, mari...
2    [false, illusioni, ,, sgradevoli, realtà, #, e...
3    [mario, monti, :, berlusconi, risparmi, all'it...
4    [mario, monti, :, berlusconi, risparmi, all'it...
Name: text, dtype: object

In [45]:
# Remove tokens that consist only of punctuation characters and/or digits.
#
# `punctuation` and `digits` are strings, so `w not in punctuation` is a SUBSTRING test:
# it drops only tokens that happen to be a contiguous slice of those constants
# (",", "12", "345") and keeps everything else ("...", "2011", "?!").
# Comparing character sets removes every such token regardless of its length or order.

_noise_chars = set(punctuation) | set(digits)

x_tr_clean_lem = x_tr_clean_lem.apply(
    lambda toks: [w for w in toks if not set(w) <= _noise_chars]
)
x_ts_clean_lem = x_ts_clean_lem.apply(
    lambda toks: [w for w in toks if not set(w) <= _noise_chars]
)

In [46]:
x_tr_clean_lem[:5]

0    [intanto, partita, via, nazionale, complica, s...
1    [false, illusioni, sgradevoli, realtà, mario, ...
2    [false, illusioni, sgradevoli, realtà, editori...
3    [mario, monti, berlusconi, risparmi, all'itali...
4    [mario, monti, berlusconi, risparmi, all'itali...
Name: text, dtype: object

In [47]:
# Attach a universal-tagset POS tag to every remaining training token.
# NOTE(review): nltk.pos_tag is called without a language argument and presumably uses its
# English model; the displayed tags for Italian tokens look unreliable
# (e.g. "berlusconi" -> VERB) -- confirm before trusting the lemmas derived from them.
x_tr_pos = x_tr_clean_lem.apply(nltk.pos_tag, tagset='universal')

In [48]:
# Attach a universal-tagset POS tag to every remaining test token.
# NOTE(review): nltk.pos_tag is called without a language argument and presumably uses its
# English model -- confirm the tags are meaningful for Italian text.
x_ts_pos = x_ts_clean_lem.apply(nltk.pos_tag, tagset='universal')

In [49]:
x_tr_pos[:5]

0    [(intanto, NOUN), (partita, NOUN), (via, ADP),...
1    [(false, ADJ), (illusioni, NOUN), (sgradevoli,...
2    [(false, ADJ), (illusioni, NOUN), (sgradevoli,...
3    [(mario, NOUN), (monti, NOUN), (berlusconi, VE...
4    [(mario, NOUN), (monti, NOUN), (berlusconi, VE...
Name: text, dtype: object

In [50]:
# Lemmatize each (token, tag) pair; whatever the lemmatizer returns is joined with spaces
# into one string per token.
x_tr_lem = x_tr_pos.apply(
    lambda pairs: [' '.join(lemmatizer(tok, tag)) for tok, tag in pairs]
)

In [51]:
# Lemmatize each (token, tag) pair of the test set; the lemmatizer's result is joined
# with spaces into one string per token.
x_ts_lem = x_ts_pos.apply(
    lambda pairs: [' '.join(lemmatizer(tok, tag)) for tok, tag in pairs]
)

In [52]:
x_tr_lem[:5]

0    [intanto, partito, via, nazionale, complicare,...
1    [falso, illusione, sgradevole, realtà, mario, ...
2    [falso, illusione, sgradevole, realtà, editori...
3    [mario, monte, berlusconi, risparmio, all'ital...
4    [mario, monte, berlusconi, risparmio, all'ital...
Name: text, dtype: object

In [53]:
# Encode the lemma lists with the tokenizer that was fitted on x_tr_1.
# NOTE(review): lemmas that never appear as surface forms in x_tr_1 have no id in that
# vocabulary and are presumably dropped -- confirm.
x_tr_tokenized, x_ts_tokenized = (
    tokenizer.texts_to_sequences(texts) for texts in (x_tr_lem, x_ts_lem)
)

In [54]:
# Pad the encoded lemma sequences to the shared length `max_len`.

X_tr_3, X_ts_3 = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (x_tr_tokenized, x_ts_tokenized)
)

In [55]:
X_tr_3[:5]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

# Stemming

In [56]:
# NLTK Snowball stemmer for Italian; its .stem(token) method is applied per token below.
stemmer = nltk.stem.snowball.ItalianStemmer()

In [57]:
# Stem every token of every tokenized training tweet.
x_tr_stem = x_tr_tok.apply(lambda toks: list(map(stemmer.stem, toks)))

In [58]:
# Stem every token of every tokenized test tweet.
x_ts_stem = x_ts_tok.apply(lambda toks: list(map(stemmer.stem, toks)))

In [59]:
x_tr_stem[:5]

0    [intant, la, part, per, via, nazional, si, com...
1    [fals, illusion, ,, sgradevol, realt, mar, mon...
2    [fals, illusion, ,, sgradevol, realt, #, edito...
3    [mar, mont, :, berluscon, risp, all'ital, il, ...
4    [mar, mont, :, berluscon, risp, all'ital, il, ...
Name: text, dtype: object

In [60]:
# Lowercase, remove stop words, then stem.
#
# The stop-word test has to run on the surface form: stemming shortens many words
# (the displayed stems show e.g. "nazionale" -> "nazional"), so a stop word that has
# already been stemmed may no longer equal its entry in list_sw and would survive the
# filter. Filtering the lowercased original token first and stemming only the
# survivors avoids that. The result is built from x_tr_tok / x_ts_tok directly.
# NOTE(review): assumes stemmer.stem() returns lowercase stems, as the displayed output
# suggests -- confirm.

_stop_words = frozenset(list_sw)

x_tr_clean_stem = x_tr_tok.apply(
    lambda toks: [stemmer.stem(w) for w in toks if w.lower() not in _stop_words]
)
x_ts_clean_stem = x_ts_tok.apply(
    lambda toks: [stemmer.stem(w) for w in toks if w.lower() not in _stop_words]
)

In [61]:
x_tr_clean_stem[:5]

0    [intant, part, via, nazional, complic, ., #, s...
1    [fals, illusion, ,, sgradevol, realt, mar, mon...
2    [fals, illusion, ,, sgradevol, realt, #, edito...
3    [mar, mont, :, berluscon, risp, all'ital, bias...
4    [mar, mont, :, berluscon, risp, all'ital, bias...
Name: text, dtype: object

In [62]:
# Remove stems that consist only of punctuation characters and/or digits.
#
# `punctuation` and `digits` are strings, so `w not in punctuation` is a SUBSTRING test:
# it drops only tokens that happen to be a contiguous slice of those constants
# (",", "12", "345") and keeps everything else ("...", "2011", "?!").
# Comparing character sets removes every such token regardless of its length or order.

_noise_chars = set(punctuation) | set(digits)

x_tr_clean_stem = x_tr_clean_stem.apply(
    lambda toks: [w for w in toks if not set(w) <= _noise_chars]
)
x_ts_clean_stem = x_ts_clean_stem.apply(
    lambda toks: [w for w in toks if not set(w) <= _noise_chars]
)

In [63]:
x_tr_clean_stem[:5]

0    [intant, part, via, nazional, complic, saccoma...
1    [fals, illusion, sgradevol, realt, mar, mont, ...
2    [fals, illusion, sgradevol, realt, editorial, ...
3    [mar, mont, berluscon, risp, all'ital, biasim,...
4    [mar, mont, berluscon, risp, all'ital, biasim,...
Name: text, dtype: object

In [64]:
# Encode the stem lists with the tokenizer that was fitted on x_tr_1.
# NOTE(review): that vocabulary holds unstemmed words, so stems that differ from every
# surface form presumably get no id and vanish from the sequences -- confirm.
x_tr_toks, x_ts_toks = (
    tokenizer.texts_to_sequences(texts) for texts in (x_tr_clean_stem, x_ts_clean_stem)
)

In [65]:
# Pad the encoded stem sequences to the shared length `max_len`.
X_tr_4, X_ts_4 = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (x_tr_toks, x_ts_toks)
)

# FastText

In [66]:
# word -> vector lookup built from the FastText text file.
emb_index_ft = {}

# `with` releases the handle even if a line fails to parse (a later file_ft.close()
# on the already-closed handle is a harmless no-op). The explicit UTF-8 encoding
# removes the dependency on the platform's default codec.
with open('data_it/cc.it.300.vec', encoding='utf-8') as file_ft:

    for line_no, line in enumerate(file_ft):

        fields = line.split()

        # Skip blank lines and a leading "<vocab_size> <dimension>" header line: parsed as
        # a word it would yield a one-element "vector" that cannot be stacked with the rest.
        if not fields or (line_no == 0 and len(fields) == 2):
            continue

        word = fields[0]
        # NOTE(review): `max_len` is the padded sequence length, reused here as the number
        # of vector components to keep. The file name suggests 300-dimensional vectors, so
        # this would truncate them to the first 200 values -- confirm this is intended.
        coefs = np.asarray(fields[1:max_len+1], dtype='float32')
        emb_index_ft[word] = coefs


In [67]:
# Release the handle of the FastText vectors file.
file_ft.close()

In [68]:
len(emb_index_ft)

2000000

In [69]:
# Random initialisation of the embedding matrix: rows are drawn from a normal distribution
# with the mean and standard deviation of all loaded FastText values, so words without a
# pre-trained vector start from values on the same scale.

# np.stack expects a sequence; a dict view triggers a warning on older NumPy and an error
# on newer releases, hence the explicit list().
all_embs_ft = np.stack(list(emb_index_ft.values()))
emb_ft_mean, emb_ft_std = all_embs_ft.mean(), all_embs_ft.std()

# A dedicated, seeded generator makes the saved matrix reproducible across runs without
# touching NumPy's global random state.
_emb_rng = np.random.RandomState(0)
emb_matrix_ft = _emb_rng.normal(emb_ft_mean, emb_ft_std, (max_features, max_len))

  if (await self.run_code(code, result,  async_=asy)):


In [70]:
# Overwrite the random rows with the pre-trained vector wherever one exists.
# Indices at or beyond `max_features` fall outside the matrix and are ignored.
for token, row in tokenizer.word_index.items():
    if row < max_features:
        vector = emb_index_ft.get(token)
        if vector is not None:
            emb_matrix_ft[row] = vector

In [71]:
emb_matrix_ft.shape

(30000, 200)

# Save in files

In [87]:
len(X_tr_1)

7410

In [88]:
len(X_tr_2)

7410

In [89]:
len(X_tr_3)

7410

In [90]:
len(X_tr_4)

7410

In [91]:
len(X_ts_1)

1998

In [92]:
len(X_ts_2)

1998

In [93]:
len(X_ts_3)

1998

In [94]:
len(X_ts_4)

1998

In [95]:
# Persist the padded training ids of variant 1 as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_tr_1_it.csv", X=X_tr_1)

In [96]:
# Persist the padded test ids of variant 1 as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_ts_1_it.csv", X=X_ts_1)

In [97]:
# Persist the padded training ids of variant 2 as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_tr_2_it.csv", X=X_tr_2)

In [98]:
# Persist the padded test ids of variant 2 as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_ts_2_it.csv", X=X_ts_2)

In [99]:
# Persist the padded training ids of variant 3 (lemmas) as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_tr_3_it.csv", X=X_tr_3)

In [100]:
# Persist the padded test ids of variant 3 (lemmas) as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_ts_3_it.csv", X=X_ts_3)

In [101]:
# Persist the padded training ids of variant 4 (stems) as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_tr_4_it.csv", X=X_tr_4)

In [102]:
# Persist the padded test ids of variant 4 (stems) as text (np.savetxt default format).
np.savetxt(fname="data_preproc/x_ts_4_it.csv", X=X_ts_4)

In [103]:
# Persist the FastText-initialised embedding matrix as text (np.savetxt default format).
np.savetxt(fname="data_preproc/emb_matr_ft_it.csv", X=emb_matrix_ft)