In [None]:
import pandas as pd
import re
import os
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy
import stanza
import gensim
from gensim.models.phrases import Phraser, Phrases
from collections import defaultdict
from gensim import corpora
import string

pd.set_option('display.max_colwidth', None)

Perplexity measures how much surprised is a model to observe some text
- Given a test text W its (normalized) perplexity is:
- High perplexity means that the text has got a low probability
- Low perplexity means that the text has got a high probability

• which is what we aim for, given that test texts are from the same source of training data

# Data preparation

In [None]:
data_dir = "../data"
training_sets_dir = "haspeede2_dev"
training_file = "haspeede2_dev_taskAB.tsv"

train_path = os.path.join(data_dir, training_sets_dir, training_file)

## Columns resetting
df = pd.read_table('haspeede2_dev_taskAB.tsv', header= 0)
df = df.drop('stereotype', axis = 1)
# Rename the column 'text ', there was an extraspace
df.rename(columns = {'text ': 'text'}, inplace = True)

In [None]:
def clean_df(df):
    # Lowercase tweets
    df['text'] = df['text'].apply(lambda x: x.lower())

    # Preprocess text in a single step
    df['text_processed'] = df['text'].apply(lambda x: re.sub(r'@\w+', '', x))  # Remove mentions
    df['text_processed'] = df['text_processed'].apply(lambda x: re.sub(r'\.{2,}', ' ', x))  # Remove multiple dots
    df['text_processed'] = df['text_processed'].apply(lambda x: re.sub(r'\d+', '', x))  # Remove numbers
    df['text_processed'] = df['text_processed'].apply(lambda x: re.sub(r'#', '', x))  # Remove hashtags but keep text
    df['text_processed'] = df['text_processed'].apply(lambda x: re.sub(r'url', '', x))  # Remove URLs
    df['text_processed'] = df['text_processed'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # Remove extra spaces

    custom_punctuation = string.punctuation + "’‘" + '’' + '' + '``' + "''"


    # Tokenize into words
    df['word_token'] = df['text_processed'].apply(nltk.word_tokenize)

    # Remove stopwords and punctuation
    df['tokenized'] = [[word for word in set(tweet)
         if word in word not in stopwords.words('italian') and word not in custom_punctuation]  # Remove punctuation and stopwords
        for tweet in df['word_token']]

    # Tokenize into sentences
    df['sentence_token'] = df['text_processed'].apply(nltk.sent_tokenize)
    # Drop intermediate columns
    df = df.drop('word_token', axis=1)
    return df

In [255]:
df = clean_df(df)
# n° of tokens per tweet
df['n_token'] = df['tokenized'].apply(len)
# n° of sentences per tweet
df['n_sentence'] = df['sentence_token'].apply(len)

## Badwords

In [None]:
file = open('lista_badwords.txt')
badwords = file.readlines()
badwords = [word.replace('\n', '') for word in badwords]

# How many badword per tweet?
badword_0_1 = []
for l in df['tokenized']:
    counter = 0
    for word in l:
        if word in badwords:
            counter += 1
    badword_0_1.append(counter)
df['badword'] = badword_0_1

# Creating two corpora of tweets as str
- token0: tokenized and cleaned corpus of not hs
- token0: tokenized and cleaned corpus of hs
- sent: list of lists of tokenized tweets

In [None]:
token0, token1, sent = '', '', []
custom_punctuation = string.punctuation + "’‘" + '’' + '' + '``' + "''" + '“' + '»' + '«' + '”' + '-'

for tweet, label in zip(df['text_processed'], df['hs']):
    tweet_ = tweet
    tweet_ = word_tokenize(tweet_)
    tweet_ = [word for word in tweet_ if word not in stopwords.words('italian') and word not in custom_punctuation]

    if label == 0:
        token0 += tweet
    else:
        token1 += tweet
    sent.append(tweet_)


token0 = word_tokenize(token0)
token1 = word_tokenize(token1)
token0 = [word for word in token0 if word not in stopwords.words('italian') and word not in custom_punctuation]
token1 = [word for word in token1 if word not in stopwords.words('italian') and word not in custom_punctuation]

In [304]:
freq_dist0 = nltk.FreqDist(token0)
freq_dist1 = nltk.FreqDist(token1)

print(freq_dist0.most_common(10))
print('\n')
print(freq_dist1.most_common(10))

[('rom', 1090), ('migranti', 727), ('immigrati', 399), ('italia', 336), ('roma', 300), ('campo', 289), ('italiani', 235), ('stranieri', 224), ('nomadi', 223), ('solo', 216)]


[('immigrati', 515), ('migranti', 475), ('rom', 340), ('italiani', 324), ('italia', 299), ('casa', 226), ('stranieri', 207), ('clandestini', 202), ('solo', 189), ('terroristi', 184)]


In [264]:
# Create a list of list, where each sublist contains the tokenized version of the tweet
# sents = [nltk.word_tokenize(text) for text in df['text_processed']]

## Collocations

In [306]:
# The number as expected decrease, as in the first attempt the pre-processing was less involving ad the result of this code was: (129277, 6837)
len(Phrases(sents).vocab), len(sents)

(111548, 6837)

In [308]:
bigram = Phraser(Phrases(sents))

In [310]:
collocations = list()
for key, score in bigram.phrasegrams.items():
    collocations.append((key,score))
bigrams_df = pd.DataFrame(collocations,columns = ['collocation', 'score'])
bigrams_df.sort_values(by=['score'],ascending=False)

Unnamed: 0,collocation,score
677,radical_chic,4576.328205
385,forze_dell'ordine,3541.206349
454,asia_bibi,3346.440000
684,global_compact,2845.612245
624,&_gt,2237.845679
...,...,...
309,i_numeri,10.056164
242,chi_arriva,10.043036
561,basta_con,10.021832
373,chi_ha,10.006714


In [312]:
stopword_list = stopwords.words('italian')
sents_w_bigram = bigram[sents]
trigrams = Phraser(Phrases(sents_w_bigram, connector_words=stopword_list))

In [313]:
collocations = list()
for key, score in trigrams.phrasegrams.items():
    collocations.append((key,score))
trigrams_df = pd.DataFrame(collocations,columns = ['collocation', 'score'])
trigrams_df.sort_values(by=['score'],ascending=False)
trigrams_df

Unnamed: 0,collocation,score
0,corriere_:,11.530484
1,mafia_capitale,5219.850000
2,campi_rom,262.157645
3,!_!,29.309198
4,studentessa_cinese,194874.400000
...,...,...
193,sindaco_di_riace,549.457895
194,anime_belle,14913.857143
195,global_compact,195744.375000
196,--_--,7733.111111


In [316]:
collocations_df = pd.concat([bigrams_df, trigrams_df])
collocations_df = collocations_df.drop_duplicates(subset = ["collocation"], keep='last')
collocations_df['ngram'] = collocations_df['collocation'].apply(lambda x: 'bigram' if x.count('_') == 1 else 'ngram')
collocations_df[collocations_df['ngram'] == 'ngram'].sort_values('score',ascending=False)

Unnamed: 0,collocation,score,ngram
5,studentessa_cinese_morta,50925.365854,ngram
22,ultimenotizie_news_notizie,13384.230769,ngram
126,tenta_di_investire,9490.636364,ngram
141,stile_di_vita,7765.066116,ngram
197,--_--_--_--,6444.259259,ngram
184,&_gt_;,3728.464286,ngram
46,reddito_di_cittadinanza,3222.12963,ngram
156,&_amp_;,2237.078571,ngram
65,fermato_un_ventenne,2109.030303,ngram
33,corriere_della_sera,1999.942529,ngram


# Working again on the DF
- Lemmatization

In [321]:
# Initialize the Stanza pipeline for Italian language
nlp = stanza.Pipeline('it', processors='tokenize,mwt,pos,lemma')

# Apply lemmatization with Stanza
df['lemmatized_txt'] = [[word.lemma for word in nlp(' '.join(words_list)).sentences[0].words] for words_list in df['tokenized']]

2024-11-16 11:19:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-16 11:19:27 INFO: Downloaded file to C:\Users\marco\stanza_resources\resources.json
2024-11-16 11:19:27 INFO: Loading these models for language: it (Italian):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-16 11:19:27 INFO: Using device: cpu
2024-11-16 11:19:27 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-16 11:19:27 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-16 11:19:27 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-16 11:19:27 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-16 11:19:27 INFO: Done 

In [None]:
results_dir = '../results'

df.to_csv(os.path.join(results_dir, 'text_dataset_cleaned.csv'), index=False)

In [134]:
'''
# The existing models each support negative, neutral, and positive, represented by 0, 1, 2 respectively.
sentiment = []
for element in df['text']:
    element = nlp(element)
    sentiment.append(element.sentiment)
df['sentiment'] = sentiment

# useless because it assign 0 to all the records, so it is not useful in classification terms
df.drop('sentiment', axis = 1, inplace= True)
'''

In [204]:
# Pos tagging
pos_num = []
pos_str = []
for element in df['text']:
    element = nlp(element)
    pos_inner_num = []
    pos_inner_str = []
    for token in element:
        pos_inner_num.append(token.pos)
        pos_inner_str.append(token.pos_)
    pos_num.append(pos_inner_num)
    pos_str.append(pos_inner_str)
#df['pos_num'] = pos_num
#df['pos_str'] = pos_str

In [205]:
final = []
for l in pos_str:
    dic = {}
    for element in l:
        dic[element] = dic.get(element, 0)+1
    final.append(dic)

In [206]:
df_pos = pd.DataFrame(final)
df_pos = df_pos.fillna(0).astype(int)

In [219]:
df = pd.concat([df,df_pos], axis = 1)

In [136]:
## I took the next pieces of code from the notebook 6.5

In [None]:
# list of 1 & 2) tokenized words without common words
sentences = []
for l in df['lemmatized_txt']:
    sentences.append(l)
# remove words that appear only 1
frequency = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        frequency[token] += 1
texts = [[token for token in sentence if frequency[token] > 1]for sentence in sentences]

In [None]:
COUNTER_no_rare_words, COUNTER_with_rare_words= 0, 0
for l in texts:
    for word in l:
        COUNTER_no_rare_words +=1

for l in sentences:
    for word in l:
        COUNTER_with_rare_words +=1

print('Without rare word:', COUNTER_no_rare_words,'\n' 'With rare words:', COUNTER_with_rare_words)

Without rare word: 82368 
With rare words: 89411


In [334]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = gensim.models.TfidfModel(corpus, smartirs='ntc')
word_doc_matrix_tfidf = gensim.matutils.corpus2dense(tfidf[corpus],num_terms = len(dictionary))

#### Creating the dataset with token (as features) and score (are records)

In [336]:
df1 = pd.DataFrame(word_doc_matrix_tfidf, columns = [s for s in df['text']], index = [k for k in dictionary.token2id.keys()])
df1 = df1.T

In [342]:
df1_reset = df1.reset_index(drop=True)
df = pd.concat([df, df1_reset], axis=1)

# ML

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC # Linear Support Vector Classifier
from sklearn.compose import ColumnTransformer

In [407]:
dfml = df.drop(['id', 'text', 'tokenized', 'text_processed','sentence_token', 'lemmatized_txt'], axis = 1)

In [409]:
X = dfml.drop('hs', axis=1).values
y = dfml['hs'].values

In [None]:
numeric_columns = ['n_token', 'n_sentence', 'badword']
df_text = dfml.drop(columns = ['n_token', 'n_sentence', 'badword', 'hs'])
text_columns = df_text.columns

preprocessor = ColumnTransformer(
    transformers=[('text', Pipeline([
            ('tfidf', TfidfTransformer()),  # Apply TF-IDF to text columns
            ('sel', SelectKBest(chi2, k=5000))  # Feature selection after TF-IDF
        ]), text_columns),  ('num', StandardScaler(), numeric_columns)])

bin_pipeline = Pipeline([('preprocessor', preprocessor),
                         ('learner', LogisticRegression(random_state=42, max_iter=500, verbose = 1, solver = 'saga',penalty = 'l1'))])

X = dfml.drop(columns=['hs'])  # Adjust with your target column name
y = dfml['hs']  # Adjust with your target column name

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42, stratify=y)
bin_pipeline.fit(X_train, y_train)
bin_predictions = bin_pipeline.predict(X_test)

max_iter reached after 3 seconds




##### Classification with StandardScaler worsens
##### This is the result of a classification with

* punctuation
* bad_words count
* n_of token and sentences
* Count of pos --> without this the classification worsen (Macro AVG F1 0.68)

In [463]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred,zero_division=0))

              precision    recall  f1-score   support

           0       0.76      0.73      0.74       204
           1       0.62      0.66      0.64       138

    accuracy                           0.70       342
   macro avg       0.69      0.69      0.69       342
weighted avg       0.70      0.70      0.70       342



In [465]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, bin_predictions,zero_division=0))

              precision    recall  f1-score   support

           0       0.76      0.91      0.83        82
           1       0.82      0.56      0.67        55

    accuracy                           0.77       137
   macro avg       0.79      0.74      0.75       137
weighted avg       0.78      0.77      0.76       137



