In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)
import re
import string

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import spacy
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score

!python -m spacy download el_core_news_sm
!pip install contractions
import contractions
# Load the spaCy pipeline downloaded above; `nlp` is what lemmatization() calls.
nlp = spacy.load("el_core_news_sm")
# Fetch the NLTK resources by name. 'stopwords' backs stopwords.words('greek') in
# remove_stopwords(); 'punkt' is presumably for word_tokenize, which is imported
# but not called in the visible cells.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Read the train, test and validation CSVs (in that order) into df, test and valid.
df, test, valid = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        '/kaggle/input/ys19-2023-assignment-1/train_set.csv',
        '/kaggle/input/ys19-2023-assignment-1/test_set.csv',
        '/kaggle/input/ys19-2023-assignment-1/valid_set.csv',
    )
)



In [None]:
def expand_contractions(tweet):
    """Return ``tweet`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(tweet)
    return expanded

In [None]:
# For each split (train, test, validation): print the column summary, then the first rows.
for frame in (df, test, valid):
    frame.info()
    print(frame.head())

In [None]:
def lemmatization(tweet):
    """Return ``tweet`` with every spaCy token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline and the
    resulting lemmas are joined with single spaces.
    """
    lemmas = []
    for token in nlp(tweet):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)



In [None]:
def text_preprocessing(tweet):
    """Strip noise from a tweet and return it lower-cased with single spaces.

    Each pattern below is replaced by a space, one after another in the listed
    order; runs of spaces are then collapsed, both ends trimmed, and the result
    lower-cased.
    """
    noise_patterns = (
        r'@\s?\w+',        # mentions, with an optional space after the @
        r'https?://\S+',   # http/https URLs
        r'[^’\'\s\w]',     # anything that is not a word char, whitespace, ' or ’
        r'_',              # underscores (they count as \w, so the rule above keeps them)
        r'[\r\n]',         # line breaks
        r'\b[a-zA-Z]\b',   # stand-alone single ASCII letters
        r'\d+',            # runs of digits
    )

    cleaned = tweet
    for noise in noise_patterns:
        cleaned = re.sub(noise, ' ', cleaned)

    # Squeeze repeated spaces, trim both ends, then lower-case.
    return re.sub(r' +', ' ', cleaned).strip().lower()

In [None]:
# Filled on first use so the NLTK corpus is read once, not once per token.
_GREEK_STOPWORDS_CACHE = None


def _greek_stopwords():
    """Return the NLTK Greek stop-word list as a frozenset, loading it only once."""
    global _GREEK_STOPWORDS_CACHE
    if _GREEK_STOPWORDS_CACHE is None:
        _GREEK_STOPWORDS_CACHE = frozenset(stopwords.words('greek'))
    return _GREEK_STOPWORDS_CACHE


def remove_stopwords(tweet, stop_words=None):
    """Drop stop words from a space-separated tweet.

    Args:
        tweet: Text whose tokens are separated by single spaces.
        stop_words: Optional collection of tokens to remove. When omitted,
            the NLTK Greek stop-word list is used.

    Returns:
        The remaining tokens joined with single spaces. Splitting is done on
        ' ' exactly, so empty tokens produced by consecutive spaces are kept.
    """
    if stop_words is None:
        stop_words = _greek_stopwords()
    return ' '.join(w for w in tweet.split(' ') if w not in stop_words)

In [None]:
# Clean the training text: lemmatize, normalise, then drop stop words.
df['Text'] = (
    df['Text']
    .apply(lemmatization)
    .apply(text_preprocessing)
    .apply(remove_stopwords)
)
print(df)

In [None]:
# Clean the test text with the same three steps used for the training set.
test['Text'] = (
    test['Text']
    .apply(lemmatization)
    .apply(text_preprocessing)
    .apply(remove_stopwords)
)
print(test)

In [None]:
# Clean the validation text with the same three steps used for the training set.
valid['Text'] = (
    valid['Text']
    .apply(lemmatization)
    .apply(text_preprocessing)
    .apply(remove_stopwords)
)
print(valid)

In [None]:
# Fit TF-IDF on the training text only, then vectorise that same text.
# min_df=4 drops terms found in fewer than 4 documents; max_df=0.3 drops terms
# found in more than 30% of documents.
# Alternative kept for reference: CountVectorizer().fit(df['Text'].values)
train_texts = df['Text'].values
vectorizer = TfidfVectorizer(min_df=4, max_df=0.3)
vectorizer.fit(train_texts)
X_train = vectorizer.transform(train_texts)
X_train
     

In [None]:
# Training labels: the 'Sentiment' column of the training frame as an array.
y_train = df['Sentiment'].values
# Bare expression so the notebook displays the array.
y_train

In [None]:
# Vectorise the validation text with the already-fitted vectorizer (transform only, no refit).
X_val = vectorizer.transform(valid['Text'].values)
# Bare expression so the notebook displays the matrix summary.
X_val

In [None]:
# Validation labels: the 'Sentiment' column of the validation frame as an array.
y_val = valid['Sentiment'].values
# Bare expression so the notebook displays the array.
y_val

In [None]:
# Vectorise the test text with the already-fitted vectorizer (transform only, no refit).
X_test = vectorizer.transform(test['Text'].values)
# Bare expression so the notebook displays the matrix summary.
X_test

In [None]:
# NOTE(review): this reads the labels from `valid`, not `test`, so y_test holds the
# same values as y_val here. The name is reassigned to the model's predictions in
# the submission cell and is not read in between — confirm whether this line is
# intentional or a copy-paste leftover.
y_test = valid['Sentiment'].values


In [None]:
# Learning curve: refit the classifier on growing prefixes of the training set and
# record the weighted F1 score on that prefix and on the full validation set.
# NOTE(review): `multi_class` is deprecated in newer scikit-learn releases (1.5+);
# confirm against the installed version before changing it.
clf = LogisticRegression(max_iter=5000, tol=1e-8, multi_class='multinomial')

# 15 evenly spaced prefix sizes from 20 rows up to the whole training set. The last
# size equals X_train.shape[0], so `clf` leaves the loop fitted on all training rows.
train_sizes = np.linspace(20, X_train.shape[0], 15, dtype=int)

f1_train = []
f1_valid = []
for train_size in train_sizes:
    X_subset = X_train[:train_size]
    y_subset = y_train[:train_size]
    clf.fit(X_subset, y_subset)

    # Score once per split and reuse the value for both the list and the printout.
    y_true_pred = clf.predict(X_subset)
    train_f1 = f1_score(y_subset, y_true_pred, average='weighted')
    f1_train.append(train_f1)
    print("F1 Score Train:", train_f1)

    y_pred = clf.predict(X_val)
    valid_f1 = f1_score(y_val, y_pred, average='weighted')
    f1_valid.append(valid_f1)
    print("F1 Score Validation:", valid_f1)

plt.plot(train_sizes, f1_train, "o-", label="Train")
plt.plot(train_sizes, f1_valid, "o-", label="Validation")
plt.xlabel("Training set size")
plt.ylabel("F1 Score")
plt.legend()
plt.show()

In [None]:
# Predict on the test features and keep the predictions next to the test rows.
y_test = clf.predict(X_test)
test['Sentiment'] = y_test

# Two-column submission: the test row identifier and the predicted label.
submission_df = pd.DataFrame({'Id': test['New_ID']})
submission_df['Prediction'] = y_test

submission_df.to_csv('submission.csv', index=False)