# Data Exploration

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the train / validation / test splits of the competition data set,
# in that order, into three DataFrames.
train_data, valid_data, test_data = map(
    pd.read_csv,
    (
        "/kaggle/input/ys19-2023-assignment-1/train_set.csv",
        "/kaggle/input/ys19-2023-assignment-1/valid_set.csv",
        "/kaggle/input/ys19-2023-assignment-1/test_set.csv",
    ),
)

# Plain-text dump of the training frame.
print(train_data)

# Summary statistics; as the last expression this is the cell's displayed output.
train_data.describe()

In [None]:
# Distinct values found in the 'Party' column of the test split
# (displayed as the cell's output).
test_data['Party'].unique()

# Data Preprocessing

Cleaning, tokenization and lowercasing

In [None]:
import re
import unicodedata
from nltk.tokenize import word_tokenize

def strip_accents(text):
    """Return ``text`` without its accent marks.

    The string is decomposed with Unicode NFD normalization, and every
    resulting character whose Unicode category is 'Mn' is dropped. All other
    characters are kept in their original order.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            # decomposed accent mark: drop it and keep only the base letter
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

# text cleaning and tokenization
def preprocess_text(text):
    """Clean one raw text and return it as a lowercase, space-separated string.

    Steps, in order:
      1. replace links (``http...``, ``www...``) and ``@`` tags with a space;
      2. strip accent marks via ``strip_accents``;
      3. replace every run of characters other than Latin letters, digits and
         the Greek ranges Α-Ω / α-ω with a single space;
      4. tokenize with ``word_tokenize`` and lowercase every token.
    """
    without_links = re.sub(r"http\S+|www\S+|@[^\s]+", ' ', text)
    unaccented = strip_accents(without_links)
    alnum_only = re.sub(r"[^A-Za-z0-9Α-Ωα-ω]+", ' ', unaccented)

    # tokenize, then lowercase each token before re-joining
    return ' '.join(map(str.lower, word_tokenize(alnum_only)))

# Give every split a cleaned 'Processed_Text' column derived from its raw 'Text' column.
for split_frame in (train_data, valid_data, test_data):
    split_frame['Processed_Text'] = split_frame['Text'].apply(preprocess_text)

print(train_data)

Lemmatization

In [None]:
!python -m spacy download el_core_news_sm

In [None]:
import spacy

# Load the small Greek spaCy pipeline; it is the model used to lemmatize the text.
# Larger variants noted as worth trying: el_core_news_md, el_core_news_lg.
nlp = spacy.load("el_core_news_sm")

# apply lemmatization
def lemmatize_greek_text(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the module-level ``nlp`` spaCy pipeline and the
    lemmas of the resulting tokens are joined with single spaces.
    """
    return ' '.join(parsed_token.lemma_ for parsed_token in nlp(text))

# Replace each split's 'Processed_Text' column with its lemmatized version.
for split_frame in (train_data, valid_data, test_data):
    split_frame['Processed_Text'] = split_frame['Processed_Text'].apply(lemmatize_greek_text)

print(train_data)

# Analysis

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# generate word clouds graph
def generate_word_cloud(text, title):
    """Draw a word cloud of ``text`` on a white background and show it.

    The image is rendered in a 10x5 figure with the axes hidden and ``title``
    as the figure title.
    """
    cloud_image = WordCloud(background_color='white').generate(text)

    _, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud_image, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title)
    plt.show()

# Word cloud over the whole training split: all processed texts are joined
# into a single space-separated string first.
generate_word_cloud(' '.join(train_data['Processed_Text']), 'Word Cloud - Training Data')

# Vectorization

Vectorization using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training text only, which is vectorized in the same step.
tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(train_data['Processed_Text'])

# Validation and test text are only transformed with the already-fitted vectorizer.
valid_tfidf, test_tfidf = (
    tfidf.transform(split_frame['Processed_Text'])
    for split_frame in (valid_data, test_data)
)

# Logistic regression model implementation 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: logistic regression (max_iter=1000) trained on the TF-IDF
# features of the training split.
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(train_tfidf, train_data['Sentiment'])

# Score the baseline on the validation split.
valid_labels = valid_data['Sentiment']
valid_predicted = logistic_regression.predict(valid_tfidf)

accuracy = accuracy_score(valid_labels, valid_predicted)
report = classification_report(valid_labels, valid_predicted)

print(f"Accuracy on the validation set: {accuracy:.2f}")
print("\nClassification Report:\n", report)

Optimizing the hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# The hyperparameters that will be tested:
# 3 max_iter values x 5 C values x 5 solvers = 75 candidate settings.
param_grid = {
    'max_iter': [1000, 2000, 5000],
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['newton-cg', 'lbfgs', 'liblinear','sag','saga'],
    # The penalty is not searched.
    # NOTE(review): presumably disabled because the solvers above do not all
    # support every penalty -- confirm before re-enabling.
#     'penalty': ['l1', 'l2', 'elasticnet'],
}

# Exhaustive search with 5-fold cross-validation, scored by accuracy;
# the current `logistic_regression` object serves as the base estimator.
# NOTE(review): train_tfidf was produced by a vectorizer fitted on the whole
# training split, so the held-out part of each fold also contributed to the
# IDF weights. Wrap the vectorizer and classifier in a Pipeline if strict
# fold isolation is required.
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')

grid_search.fit(train_tfidf, train_data['Sentiment'])

# Best setting found and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy Score:", best_score)

# Evaluation

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
# import matplotlib.pyplot as plt

# Learning curve: train the tuned model on growing fractions of the training
# split (10%, 20%, ..., 100%) and compare the weighted F1 on the data it was
# trained on against the weighted F1 on the validation split.
list_f1 = []            # validation F1, one entry per step
list_f1_train = []      # training F1, one entry per step
list_sample_size = []   # fraction of the training split used, one entry per step

n_steps = 10
for times in range(n_steps):
    # One value drives both the split and the x-axis, so the plotted sample
    # size is always the fraction that was really used for training.
    train_fraction = (times + 1) / n_steps

    # training
    if train_fraction < 1:
        # Fixed seed so the sub-samples, and therefore the curve, are reproducible.
        X, X_unused, y, y_unused = train_test_split(
            train_tfidf, train_data['Sentiment'],
            train_size=train_fraction, random_state=42)
    else:
        # Last step: nothing is held out, so train on the full training split
        # directly (train_test_split cannot produce an empty remainder).
        X, y = train_tfidf, train_data['Sentiment']

    logistic_regression = LogisticRegression(max_iter=5000, C=0.01, solver='saga')
    logistic_regression.fit(X, np.ravel(y))

    results_train = logistic_regression.predict(X)

    # validation
    results = logistic_regression.predict(valid_tfidf)

    # score
    f1_train = f1_score(y, results_train, average='weighted')
    print("F1 Score Train: " + str(f1_train))

    f1 = f1_score(valid_data['Sentiment'], results, average='weighted')
    print("F1 Score Validation: " + str(f1))

    list_f1.append(f1)
    list_f1_train.append(f1_train)
    list_sample_size.append(train_fraction)

plt.plot(list_sample_size, list_f1)
plt.plot(list_sample_size, list_f1_train)

plt.xlabel("Fraction of the training set used")
plt.ylabel("Weighted F1 score")
plt.ylim(ymin=0)
plt.legend(["Validation", "Training"])
plt.show()

Retrain the model with the best hyperparameters and re-evaluate it on the validation set

In [None]:
# Final model: logistic regression with the selected hyperparameters,
# trained on the complete training split.
logistic_regression = LogisticRegression(max_iter=5000, C=0.01, solver='saga')
logistic_regression.fit(train_tfidf, train_data['Sentiment'])

# Re-evaluate on the validation split.
valid_labels = valid_data['Sentiment']
valid_predicted = logistic_regression.predict(valid_tfidf)

accuracy = accuracy_score(valid_labels, valid_predicted)
report = classification_report(valid_labels, valid_predicted)

print(f"Accuracy on the validation set: {accuracy:.2f}")
print("\nClassification Report:\n", report)

# Output prediction

In [None]:
# Predict the sentiment of every test example with the current model.
test_prediction = logistic_regression.predict(test_tfidf)
# Two-column submission table: the example's 'New_ID' and its predicted label.
submission = pd.DataFrame({'Id': test_data['New_ID'], 'Predicted': test_prediction})
# index=False keeps the DataFrame's row index out of the CSV file.
submission.to_csv('submission.csv', index=False)