In [None]:
# Incercare data processing

# Random forest

# Pytorch model 

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import re
import csv
import os
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from nltk.stem.snowball import SnowballStemmer

# Shared objects created once and reused by the cells below.
# NOTE(review): the stemmer is configured for Romanian while the WordNet
# lemmatizer is built around an English lexicon - confirm the lemmatizer is the
# right tool for this corpus.
stemmer = SnowballStemmer("romanian")    # used by stem_text
lemmatizer = WordNetLemmatizer()         # used by lemmatize_text
cv = CountVectorizer()                   # token-count vectorizer, default settings
lr = LogisticRegression(max_iter=2000)   # logistic-regression baseline


Preprocesarea textului:
- eliminarea caracterelor care nu sunt litere, cifre sau spații (re)
- lematizare (lemmatizer)

In [38]:
# Maps the legacy cedilla letters (U+015F, U+0163) onto the standard Romanian
# comma-below letters (U+0219, U+021B). Both spellings occur in real-world
# Romanian text; without this step the cedilla variants are not part of the
# keep-set below and would be silently deleted from the middle of words.
_CEDILLA_TO_COMMA = str.maketrans({'\u015f': '\u0219', '\u0163': '\u021b'})

# Characters to delete: anything except ASCII letters, digits, whitespace and
# the Romanian letters a-breve, a-circumflex, i-circumflex, s-comma, t-comma.
# Only lowercase forms are listed because the text is lower-cased first.
_NON_TEXT_CHARS = re.compile(r'[^a-z0-9\u0103\u00e2\u00ee\u0219\u021b\s]')


# Text preprocessing
def process_text(text):
    """Normalize a raw document into a space-separated string of lemmatized tokens.

    Steps: lower-case the text, unify the two spellings of the Romanian
    s/t diacritics, delete every character outside the keep-set, tokenize with
    ``word_tokenize`` and lemmatize each token with ``lemmatize_text``.
    (``stem_text`` can be used in place of ``lemmatize_text`` for stemming.)

    Args:
        text: The raw document as a ``str``.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    text = text.lower().translate(_CEDILLA_TO_COMMA)
    text = _NON_TEXT_CHARS.sub('', text)

    words = word_tokenize(text)
    return ' '.join(lemmatize_text(words))

# Lemmatize every token of a tokenized text
def lemmatize_text(l_words):
    """Return a new list holding ``lemmatizer.lemmatize(token)`` for each token in ``l_words``."""
    return list(map(lemmatizer.lemmatize, l_words))

def stem_text(l_words):
    """Return a new list holding ``stemmer.stem(token)`` for each token in ``l_words``."""
    return list(map(stemmer.stem, l_words))
    


Loading data

In [3]:
train = pd.read_csv('data/train.csv')

# First 54000 rows are used for training, the remainder for validation.
# .copy() turns each split into an independent frame: the cells below assign
# new columns into these frames, and doing that on a plain slice of `train`
# raises pandas' SettingWithCopyWarning.
train_data = train.iloc[:54000].copy()
validation_data = train.iloc[54000:].copy()

test_data = pd.read_csv('data/test.csv')


Data Visualization

In [None]:
# Share of each class in the full training file, drawn as a bar chart
class_share = train['class'].value_counts(normalize=True)
class_share.plot.bar()

In [None]:
# Preview the first rows of the training split
train_data.head()

In [None]:
# Preview the first rows of the test set
test_data.head()


Manage missing data:
- replace missing content with the title
- replace missing titles with a sentence taken from the content
- save the updated data in new .csv files for later use

In [4]:
def fill_content(data):
    """Return the ``content`` column as a list with non-string entries replaced by the title.

    Args:
        data: DataFrame with ``content`` and ``title`` columns.

    Returns:
        list: One value per row, in row order. A row keeps its content when the
        content is a ``str``; otherwise the row's title is used instead.
    """
    return [
        content if isinstance(content, str) else title
        for content, title in zip(data['content'], data['title'])
    ]

In [None]:
# Rebuild each frame with the repaired `content` column instead of writing
# through .loc: assigning into a frame that was sliced from another frame can
# raise pandas' SettingWithCopyWarning, whereas assign() always returns a new,
# independent frame.
train_data = train_data.assign(content=fill_content(train_data))
validation_data = validation_data.assign(content=fill_content(validation_data))
test_data = test_data.assign(content=fill_content(test_data))

In [None]:
def fill_title(data):
    """Return the ``title`` column as a list, deriving a title where it is missing.

    A title counts as missing when it is not a ``str``. Its replacement is the
    second '.'-separated segment of the row's content. When the content
    contains no '.', the whole content is used; when the content is not a
    string either, an empty string is used.

    Args:
        data: DataFrame with ``content`` and ``title`` columns.

    Returns:
        list: One title per row, in row order.
    """
    titles = []
    for content, title in zip(data['content'], data['title']):
        if isinstance(title, str):
            titles.append(title)
        elif isinstance(content, str):
            segments = content.split(".")
            # Segment 1 is kept as the preferred choice; a content without any
            # '.' has only segment 0 and used to raise IndexError here.
            titles.append(segments[1] if len(segments) > 1 else segments[0])
        else:
            # Neither title nor content is usable text.
            titles.append('')

    return titles

In [None]:
# Rebuild each frame with the repaired `title` column instead of writing
# through .loc: assigning into a frame that was sliced from another frame can
# raise pandas' SettingWithCopyWarning, whereas assign() always returns a new,
# independent frame.
train_data = train_data.assign(title=fill_title(train_data))
validation_data = validation_data.assign(title=fill_title(validation_data))
test_data = test_data.assign(title=fill_title(test_data))


Vectorize

In [None]:
# The vocabulary is learned from the training split only; the validation and
# test texts are then encoded with that same fitted vectorizer.
# NOTE(review): process_text is defined earlier in this notebook but is not
# applied to the text here - confirm that is intentional.
train_content = cv.fit_transform(train_data['content'])
validation_content = cv.transform(validation_data['content'])
test_content = cv.transform(test_data['content'])

In [None]:
# Target column for the training and validation splits
train_label = train_data['class']
validation_label = validation_data['class']


MODEL 1:  Random forest 

In [None]:
# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible across kernel restarts, so the reported metrics
# and the submission file can be regenerated exactly.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(train_content, train_label)

In [None]:
# Random-forest predictions on the validation split (used for the metrics below)
predict_rf_validation = model_rf.predict(validation_content)

In [None]:
# Random-forest predictions on the test set (used for the submission file)
predict_rf_test = model_rf.predict(test_content)


Accuracy

In [None]:
def display_accuracy(predict_label, actual_label):
    """Print the F1 score, balanced accuracy and classification report.

    Args:
        predict_label: Predicted labels; either 1-D or a single-column 2-D
            array (as produced by thresholding a Keras sigmoid output).
        actual_label: Ground-truth labels, one per prediction.
    """
    # scikit-learn metrics expect (y_true, y_pred) in that order. Passing them
    # swapped changes balanced accuracy (it averages per-class recall) and
    # exchanges the precision and recall columns of the classification report.
    # ravel() flattens (n, 1) prediction arrays to the 1-D shape they expect.
    y_true = np.ravel(actual_label)
    y_pred = np.ravel(predict_label)

    accuracy = f1_score(y_true, y_pred)
    print(f'Acuratete f1: {accuracy}')

    acc = balanced_accuracy_score(y_true, y_pred)
    print('Acuratete balanced: ', acc)

    print('Raport de clasificare:')
    print(classification_report(y_true, y_pred))

In [None]:
# Validation metrics for the random forest
display_accuracy(predict_rf_validation, validation_label)


Test submission file

In [None]:
def write_csv_file(prediction_label, test_content, file_name):
    """Write predictions to a CSV file with the columns ``id`` and ``class``.

    Args:
        prediction_label: One prediction per row of ``test_content``; may be a
            list, a 1-D array/Series or a single-column 2-D array. Truthy
            values are written as 1, falsy values as 0.
        test_content: Object whose ``shape[0]`` is the number of test rows.
        file_name: Output path. Missing parent directories are created.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``test_content``.
    """
    # Flatten so that (n, 1) network outputs and label-indexed Series are both
    # read by position.
    labels = np.ravel(prediction_label)
    n_rows = test_content.shape[0]
    if len(labels) != n_rows:
        raise ValueError(
            f'Expected {n_rows} predictions, got {len(labels)}'
        )

    # open() does not create directories; without this the first run fails
    # when e.g. 'submissions/' does not exist yet.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'class'])
        # The id written for each row is its position in the test set.
        writer.writerows((i, 1 if label else 0) for i, label in enumerate(labels))

In [None]:
# Submission file for the random forest
write_csv_file(predict_rf_test, test_content, 'submissions/submission_rf.csv')


MODEL LOGISTIC REGRESSION

In [None]:
# Fit the logistic-regression baseline on the training split, then predict
# both held-out sets with the fitted model.
lr.fit(train_content, train_label)
predictions_lr_validation = lr.predict(validation_content)
predictions_lr_test = lr.predict(test_content)

In [None]:
# Validation metrics for logistic regression
display_accuracy(predictions_lr_validation, validation_label)

In [None]:
# Confusion matrix on the validation split (true labels first, predictions second)
cm_lr = confusion_matrix(validation_label,predictions_lr_validation)

In [None]:
# Plot the logistic-regression confusion matrix
disp_lr = ConfusionMatrixDisplay(confusion_matrix=cm_lr)
disp_lr.plot()



MODEL 2: Fully Connected Network

In [None]:
# Fully connected binary classifier: two ReLU hidden layers (128 and 64 units)
# followed by a single sigmoid output unit. The input width equals the number
# of columns produced by the vectorizer.
n_features = train_content.shape[1]

model_retea = tf.keras.Sequential()
model_retea.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
model_retea.add(tf.keras.layers.Dense(64, activation='relu'))
model_retea.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# during training in addition to the loss.
model_retea.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
# NOTE(review): train_content is passed exactly as returned by the vectorizer -
# confirm the installed Keras version accepts that input type directly.
model_retea.fit(train_content, train_label, epochs=10, batch_size=32, validation_data=(validation_content, validation_label))

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels for the test set
predict_retea_test = (model_retea.predict(test_content) > 0.5).astype(int)

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels for the validation split
predict_retea_validation = (model_retea.predict(validation_content) > 0.5).astype(int)


Display accuracy

In [None]:
# Validation metrics for the first network
display_accuracy(predict_retea_validation, validation_label)

Write submission

In [None]:
# Submission file for the first network
write_csv_file(predict_retea_test,test_content, 'submissions/submission_retea.csv' )



MODEL 3: retea 2

In [None]:
# Same 128 -> 64 -> 1 layout as the first network, regularized with dropout
# after each hidden layer and an L2 penalty on the second hidden layer's kernel.
n_features = train_content.shape[1]

model_retea_2 = tf.keras.Sequential()
model_retea_2.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
model_retea_2.add(tf.keras.layers.Dropout(0.5))
model_retea_2.add(
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))
)
model_retea_2.add(tf.keras.layers.Dropout(0.3))
model_retea_2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Adam with an explicit learning rate of 0.001; binary cross-entropy matches
# the single sigmoid output.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model_retea_2.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

In [None]:
# Train for 20 epochs, evaluating on the validation split after each epoch
model_retea_2.fit(train_content, train_label, epochs=20, batch_size=32, validation_data=(validation_content, validation_label))

In [None]:
# NOTE(review): this cell repeats the previous fit call verbatim. fit()
# continues from the model's current weights, so running both cells trains for
# 40 epochs in total - confirm this is intended and not a leftover duplicate.
model_retea_2.fit(train_content, train_label, epochs=20, batch_size=32, validation_data=(validation_content, validation_label))

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels for the test set
predict_retea_2_test = (model_retea_2.predict(test_content) > 0.5).astype(int)

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain 0/1 labels for the validation split
predict_retea_2_validation = (model_retea_2.predict(validation_content) > 0.5).astype(int)


Accuracy

In [None]:
# Validation metrics for the second network
display_accuracy(predict_retea_2_validation, validation_label)


Submission file

In [None]:
# Submission file for the second network. This must use model_retea_2's test
# predictions; the first network's predictions (predict_retea_test) were
# previously written here by mistake.
write_csv_file(predict_retea_2_test, test_content, 'submissions/submission_retea_2.csv')


Save trained models in folder 'models'

In [None]:
# Directory that receives the pickled models. The name is defined once and
# reused by save_model/load_model; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate exists() test.
path = 'models'
os.makedirs(path, exist_ok=True)

In [None]:
def save_model(file_name, model):
    """Pickle ``model`` into the models directory and print a confirmation.

    Args:
        file_name: Name of the file to create inside the directory named by
            the module-level ``path``.
        model: The object to serialize with ``pickle``.
    """
    target = '/'.join((path, file_name))
    with open(target, 'wb') as out_file:
        out_file.write(pickle.dumps(model))

    print(f"Modelul a fost salvat în fișierul: '{target}'")
    
def load_model(file_name):
    """Load and return the object pickled in ``<path>/<file_name>``.

    Args:
        file_name: Name of a file inside the directory named by the
            module-level ``path``.

    Returns:
        The unpickled object.
    """
    complete_path = path + '/' + file_name
    # NOTE: pickle.load can execute arbitrary code while deserializing; only
    # load files that were written by this notebook.
    # The context manager closes the file handle, which was previously leaked.
    with open(complete_path, 'rb') as file:
        return pickle.load(file)

In [None]:
# Persist the trained random forest
save_model('random_forest_model.pkl', model_rf)

In [None]:
# Persist the first network.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras/TensorFlow version; model.save() is the documented route - confirm.
save_model('retea_model.pkl', model_retea)

In [None]:
# Persist the second network.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras/TensorFlow version; model.save() is the documented route - confirm.
save_model('retea_2_model.pkl', model_retea_2)

In [None]:
# Persist the fitted logistic-regression model
save_model('logistic_regression_model.pkl', lr)

In [None]:
# Round-trip check: reload the pickled forest and predict the validation split
# again. This overwrites the predict_rf_validation computed earlier.
rf_loaded = load_model('random_forest_model.pkl')
predict_rf_validation = rf_loaded.predict(validation_content)

In [None]:
# Validation metrics for the reloaded random forest
display_accuracy(predict_rf_validation, validation_label)

In [None]:
# Confusion matrix of the reloaded random forest on the validation split
# (true labels first, predictions second), followed by its plot.
cm_rf = confusion_matrix(validation_label, predict_rf_validation)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
disp_rf.plot()