# MLP model


In [2]:
import pandas as pd

### Puni dataset

In [3]:
data = pd.read_csv('data/dataset.csv')
data

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10
...,...,...,...,...,...,...
7995,7996,Lack of awareness of the pervasiveness of raci...,0,,,0.25
7996,7997,Why are aspirins white? Because they work sorry,1,1.33,0.0,3.85
7997,7998,"Today, we Americans celebrate our independence...",1,2.55,0.0,0.00
7998,7999,How to keep the flies off the bride at an Ital...,1,1.00,0.0,3.00


Značenje stupaca:

    - id - Ovo je identifikacijski broj za svaku recenicu. Može se koristiti za jedinstveno identificiranje svake stavke u skupu podataka.
    - text - Ovaj stupac sadrži rečenice koje je potrebno analizirati.
    - is_humor - inarna oznaka (0 ili 1) koja označava ima li rečenica humor ili ne. Ako je vrijednost 1, rečenica je označena kao humoristična, ako je 0, rečenica nije .
    - humor_rating - Numerička ocjena (1-5) koja predstavlja subjektivnu percepciju anotatora o tome koliko je rečenica smiješna. Anotatori su ocijenili smiješnost rečenice na skali od 1 do 5.
    - humor_controversy - Binarna oznaka (0 ili 1) koja označava ima li kontroverzu humora u rečenici. Ako je vrijednost 1, to znači da je ocjena humora za tu rečenicu kontroverzna.
    - offense_rating - Numerička ocjena (1-5) koja predstavlja subjektivnu percepciju anotatora o tome koliko je rečenica uvredljiva. Anotatori su ocijenili razinu uvredljivosti rečenice na skali od 1 do 5. Ovdje se također razmatra da nedavanje ocjene jednako 0.

In [4]:
print(data.describe())
print()
print()
print(f"Broj humoristicnih tekstova: {len(data[data['is_humor'] == 1])}")
print(f"Broj ne humoristicnih: {len(data[data['is_humor'] == 0])}")
print(f"Broj NaN zapisa: {len(data[data['is_humor'].isna()])}")
print(f"Broj NaN zapisa: {len(data[data['humor_rating'].isna()])}")
print(f"Broj NaN zapisa: {len(data[data['humor_controversy'].isna()])}")
print(f"Broj NaN zapisa: {len(data[data['offense_rating'].isna()])}")

               id     is_humor  humor_rating  humor_controversy  \
count  8000.00000  8000.000000   4932.000000        4932.000000   
mean   4000.50000     0.616500      2.260525           0.499797   
std    2309.54541     0.486269      0.566974           0.500051   
min       1.00000     0.000000      0.100000           0.000000   
25%    2000.75000     0.000000      1.890000           0.000000   
50%    4000.50000     1.000000      2.280000           0.000000   
75%    6000.25000     1.000000      2.650000           1.000000   
max    8000.00000     1.000000      4.000000           1.000000   

       offense_rating  
count     8000.000000  
mean         0.585325  
std          0.979955  
min          0.000000  
25%          0.000000  
50%          0.100000  
75%          0.700000  
max          4.850000  


Broj humoristicnih tekstova: 4932
Broj ne humoristicnih: 3068
Broj NaN zapisa: 0
Broj NaN zapisa: 3068
Broj NaN zapisa: 3068
Broj NaN zapisa: 0


In [5]:
# Provjerava ima li unos u svakom redu za 'text' stupac
text_column_not_null = data['text'].dropna()

# Ispisuje duljinu rezultirajućeg DataFrame-a
print(f"Broj redaka bez NaN vrijednosti u 'text' stupcu: {len(text_column_not_null)}")

Broj redaka bez NaN vrijednosti u 'text' stupcu: 8000


In [6]:
# Udio kontroverznosti humora
controversial_count = data['humor_controversy'].sum()
total_samples = len(data)

print(f"Udio kontroverznosti humora: {controversial_count / total_samples * 100:.2f}%")

Udio kontroverznosti humora: 30.81%


In [7]:
# Analiza duljine rečenica
data['sentence_length'] = data['text'].apply(lambda x: len(x.split()))
print(data[['text', 'sentence_length']].head())
print(data[['sentence_length']].mean())
print(data.groupby('is_humor')['sentence_length'].mean())

                                                text  sentence_length
0  TENNESSEE: We're the best state. Nobody even c...               17
1  A man inserted an advertisement in the classif...               32
2  How many men does it take to open a can of bee...               26
3  Told my mom I hit 1200 Twitter followers. She ...               26
4  Roses are dead. Love is fake. Weddings are bas...               12
sentence_length    20.889375
dtype: float64
is_humor
0    21.932855
1    20.240268
Name: sentence_length, dtype: float64


Training set ne sadrži neispravne primjere. Gdje su vrijednosti is_humor == 0, tj. za tekstove koji nisu humoristični nema vrijednosti humor_rating	i humor_controversy jer to za njih niti nije moguće izračunati.

### Podjela dataset-a

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

# Assuming you have a dataframe 'data' with features and labels
# X contains your features, y contains your labels

# Create an instance of StratifiedShuffleSplit for splitting into train and temp sets
stratified_splitter_train_temp = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Use the splitter to generate indices for train and temp sets
for train_index, temp_index in stratified_splitter_train_temp.split(data, data['is_humor']):
    train_data, temp_data = data.iloc[train_index], data.iloc[temp_index]

# Create an instance of StratifiedShuffleSplit for further splitting temp into dev and test sets
stratified_splitter_temp_dev_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# Use the splitter to generate indices for dev and test sets
for dev_index, test_index in stratified_splitter_temp_dev_test.split(temp_data, temp_data['is_humor']):
    dev_data, test_data = temp_data.iloc[dev_index], temp_data.iloc[test_index]

# Print the sizes of the obtained sets
print(f"Size of train set: {len(train_data)}")
print(f"Size of dev set: {len(dev_data)}")
print(f"Size of test set: {len(test_data)}")


Size of train set: 6400
Size of dev set: 800
Size of test set: 800


In [9]:
# Spremi train set u CSV file
train_data.to_csv('data/train.csv', index=False)

# Spremi dev set u CSV file
dev_data.to_csv('data/dev.csv', index=False)

# Spremi test set u CSV file
test_data.to_csv('data/test.csv', index=False)

### Odnos humorističnih i nehumorističnih tekstova u train i dev setu

In [10]:
train_data


Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,sentence_length
5739,5740,I left my wife because she was obsessed with c...,1,1.95,1.0,0.00,17
5864,5865,"""Procrastinate"" comes from a Latin word meanin...",0,,,0.00,26
5753,5754,if school taught me to say no to fast food ins...,1,2.40,0.0,0.15,25
4321,4322,I just realized my countertop is made of marbl...,1,2.79,1.0,0.00,17
786,787,What are your best resources or most recommend...,0,,,0.00,18
...,...,...,...,...,...,...,...
1104,1105,Girls who talks about girls' problems are grea...,1,2.29,1.0,0.70,17
6059,6060,I'm trying to introduce my wife to my Scooby D...,1,2.25,1.0,0.35,36
7878,7879,I created a show about an airplane hijacking. ...,1,2.74,0.0,1.20,13
7598,7599,I find sex is just like peeling a potato reall...,1,1.85,1.0,1.80,17


In [11]:
# Broj humorističnih tekstova u train setu
humor_percent = len(train_data[train_data['is_humor'] == 1]) / len(train_data) * 100

# Broj nehumorističnih tekstova u train setu
non_humor_percent = len(train_data[train_data['is_humor'] == 0]) / len(train_data) * 100

# Ispis rezultata s dvije decimale
print(f"Postotak humorističnih tekstova u train setu: {humor_percent:.2f}%")
print(f"Postotak nehumorističnih tekstova u train setu: {non_humor_percent:.2f}%")



Postotak humorističnih tekstova u train setu: 61.66%
Postotak nehumorističnih tekstova u train setu: 38.34%


In [12]:
dev_data

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,sentence_length
1864,1865,Me: What are my chances doc? Doctor: The surge...,1,2.40,1.0,0.00,32
7235,7236,Why do fish live in salt water? Because pepper...,1,2.60,1.0,0.00,13
2687,2688,"Family, we appreciate your patience. Due to fu...",0,,,0.00,46
1454,1455,John F. Kennedy's brain has been missing for 5...,1,1.27,0.0,1.55,10
7830,7831,"""Blueberry juice boosts memory""",0,,,0.05,4
...,...,...,...,...,...,...,...
1803,1804,On a daily basis some young gay guys get HIV t...,0,,,0.00,51
4749,4750,"just had a redbull, feelin' good, energetic, m...",0,,,0.00,22
2140,2141,We would like to remind you that registration ...,0,,,0.20,22
4140,4141,I'm a big fan of people being exactly who they...,0,,,0.00,16


In [13]:
# Broj humorističnih tekstova u dev setu
humor_percent = len(dev_data[dev_data['is_humor'] == 1]) / len(dev_data) * 100

# Broj nehumorističnih tekstova u dev setu
non_humor_percent = len(dev_data[dev_data['is_humor'] == 0]) / len(dev_data) * 100

# Ispis rezultata s dvije decimale
print(f"Postotak humorističnih tekstova u dev setu: {humor_percent:.2f}%")
print(f"Postotak nehumorističnih tekstova u dev setu: {non_humor_percent:.2f}%")


Postotak humorističnih tekstova u dev setu: 61.62%
Postotak nehumorističnih tekstova u dev setu: 38.38%


## Baseline model

In [14]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer

from gensim.models import Word2Vec

from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.metrics import f1_score, accuracy_score
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josip\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\josip\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\josip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Pre-process the data

In order to use Word2Vec, you need to pre-process the data. It's very simple: you just need to split sentences to words (tokenization), bring the words to their basic form (lemmatization), and remove some very common words like articles or prepositions (stop-word removal). I'm using RegexpTokenizer, WordNetLemmatizer and NLTK stop word list

In [15]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [16]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.lower() not in stop_words]
    return ' '.join(tokens)

In [17]:
data['processed_text'] = data['text'].apply(preprocess_text)

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'], 
    data['is_humor'], 
    test_size=0.2, 
    random_state=42
)

In [19]:
# Word2Vec model training
word2vec_model = Word2Vec(sentences=X_train.apply(word_tokenize), vector_size=100, window=5, min_count=1, workers=4)

In [20]:
# Function to average word vectors for a sentence
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

In [21]:
# Transform text data to Word2Vec features
def word2vec_features(data, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    return np.vstack([average_word_vectors(tokens, model, vocabulary, num_features) for tokens in data.apply(word_tokenize)])

In [23]:
# MLP model ako zelimo manualno napraviti parametre
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

mlp_model=MLPClassifier(hidden_layer_sizes=(100,), max_iter=200)

In [24]:
# Create a pipeline with Word2Vec and SVC
model_pipeline = Pipeline([
    ('word2vec', FunctionTransformer(lambda x: word2vec_features(x, word2vec_model, 100))),
    ('classifier', mlp_model)
])

In [26]:
# Train the model
model_pipeline.fit(X_train, y_train)

In [27]:
# Evaluate the model
predictions = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
class_report = classification_report(y_test, predictions)
print(f"Classification report: {class_report}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

Classification report:               precision    recall  f1-score   support

           0       0.64      0.44      0.52       616
           1       0.71      0.84      0.77       984

    accuracy                           0.69      1600
   macro avg       0.67      0.64      0.65      1600
weighted avg       0.68      0.69      0.67      1600

Accuracy: 0.69
F1 Score: 0.77


In [28]:
y_test

2215    1
2582    1
1662    1
3027    0
4343    1
       ..
1079    0
7979    1
1115    0
6093    1
6832    1
Name: is_humor, Length: 1600, dtype: int64

In [29]:
count_zeros = (predictions == 0).sum()
count_ones = (predictions == 1).sum()

print(f"Number of zeros: {count_zeros}")
print(f"Number of ones: {count_ones}")

print(f"percentage of humoruous texts: {count_ones*100 / len(y_test)}%")
print(f"the difference between labeled and predicted humorous: {(count_ones*100 / len(y_test))-len(train_data[train_data['is_humor'] == 1]) / len(train_data) * 100}%")

Number of zeros: 427
Number of ones: 1173
percentage of humoruous texts: 73.3125%
the difference between labeled and predicted humorous: 11.65625%


U train setu je cca 61% humoristicnih tekstova, ovdje je prema predikciji 58.31%...

In [30]:
from sklearn.metrics import confusion_matrix

X_dev = dev_data['text']
y_dev = dev_data['is_humor']

y_pred = model_pipeline.predict(X_dev)

# Evaluate performance
accuracy = accuracy_score(y_dev, y_pred)
report = classification_report(y_dev, y_pred)
matrix = confusion_matrix(y_dev, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.72375
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.52      0.59       307
           1       0.74      0.85      0.79       493

    accuracy                           0.72       800
   macro avg       0.71      0.69      0.69       800
weighted avg       0.72      0.72      0.71       800

Confusion Matrix:
 [[160 147]
 [ 74 419]]


In [31]:
# Ulazni tekst
input_text = "I am so funny. Am I?"

In [32]:
# Primijeniti istu predobradu teksta
processed_input = preprocess_text(input_text)

In [33]:
# Pretvoriti tekst u vektor
input_vector = average_word_vectors(processed_input, word2vec_model, set(word2vec_model.wv.index_to_key), 100)

In [34]:
# Provjeriti oblik vektora (provjeriti dimenzionalnost)
print("Shape of input vector:", input_vector.shape)

Shape of input vector: (100,)


In [35]:
# Naparviti reshape sa 1D u 2D
input_vector_2d = input_vector.reshape(1, -1)
print("Shape of input vector (2D):", input_vector_2d.shape)
input_vector_2d

Shape of input vector (2D): (1, 100)


array([[-0.54425853,  0.5649667 , -0.03600514,  0.15677033,  0.18047075,
        -0.8998527 ,  0.16338056,  1.3484894 , -0.44480798, -0.72868586,
        -0.01061219, -0.7751362 , -0.32850087,  0.5042129 ,  0.2550669 ,
        -0.30292514,  0.4327028 , -0.7238214 , -0.26012614, -1.3500358 ,
         0.31897303,  0.03051787,  0.5980095 , -0.42596254, -0.2613593 ,
         0.00654607, -0.6209033 , -0.04399197, -0.52067703,  0.20202827,
         0.78006434, -0.1190227 ,  0.03616504, -0.76366144,  0.036917  ,
         0.3429152 ,  0.37522554, -0.33902064, -0.13814048, -0.8634219 ,
         0.31757033, -0.61889595, -0.45885527, -0.1617627 ,  0.5265503 ,
        -0.12609689, -0.5200675 , -0.30502206,  0.31923914,  0.43227735,
         0.16344231, -0.6593407 , -0.22703467,  0.11757258, -0.32601833,
         0.2219493 ,  0.5380323 ,  0.04997512, -0.52897257,  0.24500565,
         0.29103133,  0.1963741 ,  0.10311357, -0.21566875, -0.48286197,
         0.7683609 ,  0.36291817,  0.40081844, -0.6

In [36]:
# Ovdje dolazi do greske, treba ispraviti model i vidjeti sto je tocno krivo
# prediction = model_pipeline.predict(input_vector_2d)

# print("Predicted class:", prediction)