In [None]:
# This is the notebook that preprocesses the 3 initial lexicons

In [15]:
# import the libraries

import pandas as pd
from greek_stemmer import GreekStemmer
#!pip3 install greek_stemmer
from collections import Counter

# print non truncated column info in pandas dataframe
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)

In [9]:
def delete_tonous(df, column_to_process, processed_column='Text_only'):
    """Replaces greek hyphend letters
    Args:
        df: pandas dataFrame.
        column_to_process: Column to be processed.
        processed_column: New column to be created, of processed column_to_process.
    Returns:
        It returns a pandas dataFrame with a new column or overwrites the existing one.
    Example:
        replace_df_GR2EN(df=df_spiti, column_to_process='itemName2', processed_column='itemNamenew')
    Raises:
        KeyError: Does not raise any exceptions
    """

    if (processed_column != column_to_process):
        df[processed_column] = df[column_to_process]  # create new column

    # replace greek hyphend letters
    replacements = {processed_column: {'ά': 'α', 'έ': 'ε', 'ή': 'η', 'ί': 'ι', 'ό': 'ο', 'ύ': 'υ', 'ώ': 'ω'}}
    df.replace(replacements, regex=True, inplace=True)
    
    return (df)

## Fix GrAFS_expanded.csv lexicon

In [4]:
# read the 'GrAFS_expanded.csv' lexicon
df_GrAFS = pd.read_csv('GrAFS_expanded.csv')

In [19]:
df_GrAFS.shape

(32884, 13)

In [10]:
# create a new column with the keyword without tonous
df_GrAFS = delete_tonous(df=df_GrAFS, column_to_process='keyword', processed_column='keyword2')

In [11]:
# create a new column with the keyword in capital
df_GrAFS['keyword_CAP'] = df_GrAFS['keyword2'].str.upper()

In [16]:
# stemmer
stemmer = GreekStemmer()
df_GrAFS['keyword_STEM'] = df_GrAFS['keyword_CAP'].apply(lambda x: stemmer.stem(x))

In [18]:
df_GrAFS.head()

Unnamed: 0,keyword,subj,positive,negative,anger,disgust,fear,happy,sad,surprise,keyword_CAP,keyword2,keyword_STEM
0,εκατοχρονίτικούς,0.375,0.0,0.75,0.125,0.3125,0.0,0.0,0.0,0.3125,ΕΚΑΤΟΧΡΟΝΙΤΙΚΟΥΣ,εκατοχρονιτικους,ΕΚΑΤΟΧΡΟΝΙΤ
1,επιδρομέων,0.25,0.0,0.5,0.1875,0.0625,0.25,0.0,0.0,0.25,ΕΠΙΔΡΟΜΕΩΝ,επιδρομεων,ΕΠΙΔΡΟΜ
2,άσεμνου,0.5,0.0,0.75,0.1875,0.375,0.0,0.0,0.0,0.0,ΑΣΕΜΝΟΥ,ασεμνου,ΑΣΕΜΝ
3,πυρπολούσε,0.375,0.0,0.5,0.25,0.125,0.125,0.0,0.0,0.25,ΠΥΡΠΟΛΟΥΣΕ,πυρπολουσε,ΠΥΡΠΟΛ
4,τζόβενο,0.625,0.0,0.75,0.1875,0.3125,0.0,0.0,0.0,0.25,ΤΖΟΒΕΝΟ,τζοβενο,ΤΖΟΒΕΝ


In [22]:
print('Unique keywords :', df_GrAFS.keyword.nunique())
print('Unique keyword_CAP :', df_GrAFS.keyword_CAP.nunique())
print('Unique keyword2 :', df_GrAFS.keyword2.nunique())
print('Unique keyword_STEM :', df_GrAFS.keyword_STEM.nunique())

Unique keywords : 32884
Unique keyword_CAP : 28522
Unique keyword2 : 28530
Unique keyword_STEM : 5046


In [24]:
print('Max - Min subj :', df_GrAFS.subj.min(), ' - ', df_GrAFS.subj.max())
print('Max - Min positive :', df_GrAFS.positive.min(), ' - ', df_GrAFS.positive.max())
print('Max - Min negative :', df_GrAFS.negative.min(), ' - ', df_GrAFS.negative.max())

Max - Min subj : 0.0  -  1.0
Max - Min positive : 0.0  -  1.0
Max - Min negative : 0.0  -  1.0


In [40]:
# check most positive words
df_GrAFS[df_GrAFS.positive==1].head()

Unnamed: 0,keyword,subj,positive,negative,anger,disgust,fear,happy,sad,surprise,keyword_CAP,keyword2,keyword_STEM
30,σκλαβωμένοι,0.75,1.0,1.0,0.25,0.375,0.25,0.375,0.0625,0.3125,ΣΚΛΑΒΩΜΕΝΟΙ,σκλαβωμενοι,ΣΚΛΑΒΩΜΕΝ
35,έρωτας,0.75,1.0,0.0,0.0,0.0,0.0,0.8125,0.0,0.3125,ΕΡΩΤΑΣ,ερωτας,ΕΡΩΤ
39,εγκάρδιοί,1.0,1.0,0.0,0.0,0.0,0.0,0.75,0.0,0.4375,ΕΓΚΑΡΔΙΟΙ,εγκαρδιοι,ΕΓΚΑΡΔ
54,ευεξίας,0.75,1.0,0.0,0.0,0.0,0.0,0.8125,0.0,0.4375,ΕΥΕΞΙΑΣ,ευεξιας,ΕΥΕΞ
77,συνεπαίρνουν,1.0,1.0,0.0,0.0,0.0,0.0,0.75,0.0,0.5,ΣΥΝΕΠΑΙΡΝΟΥΝ,συνεπαιρνουν,ΣΥΝΕΠΑΙΡΝ


In [41]:
# check most negative words
df_GrAFS[df_GrAFS.negative==1].head()

Unnamed: 0,keyword,subj,positive,negative,anger,disgust,fear,happy,sad,surprise,keyword_CAP,keyword2,keyword_STEM
10,πονοκεφαλιάζω,0.75,0.0,1.0,0.0625,0.4375,0.0,0.0,0.1875,0.375,ΠΟΝΟΚΕΦΑΛΙΑΖΩ,πονοκεφαλιαζω,ΠΟΝΟΚΕΦΑΛΙΑΖ
15,ταβερνόβιές,0.625,0.25,1.0,0.125,0.3125,0.0,0.0,0.0,0.125,ΤΑΒΕΡΝΟΒΙΕΣ,ταβερνοβιες,ΤΑΒΕΡΝΟΒ
22,κατατρομαγμένο,0.75,0.0,1.0,0.25,0.0625,0.9375,0.0,0.0,0.6875,ΚΑΤΑΤΡΟΜΑΓΜΕΝΟ,κατατρομαγμενο,ΚΑΤΑΤΡΟΜΑΓΜΕΝ
30,σκλαβωμένοι,0.75,1.0,1.0,0.25,0.375,0.25,0.375,0.0625,0.3125,ΣΚΛΑΒΩΜΕΝΟΙ,σκλαβωμενοι,ΣΚΛΑΒΩΜΕΝ
32,εξυπνακίστικούς,0.875,0.0,1.0,0.3125,0.5,0.0,0.0,0.0,0.25,ΕΞΥΠΝΑΚΙΣΤΙΚΟΥΣ,εξυπνακιστικους,ΕΞΥΠΝΑΚΙΣΤ


In [42]:
print('There are terms that are both highly positive & negative. e.g. σκλαβωμένοι')

There are terms that are both highly positive & negative. e.g. σκλαβωμένοι


In [38]:
# check if all same stemes has close positive/negative
df_GrAFS.sort_values(by=['keyword_STEM']).head()

Unnamed: 0,keyword,subj,positive,negative,anger,disgust,fear,happy,sad,surprise,keyword_CAP,keyword2,keyword_STEM
12247,αηδές,1.0,0.0,1.0,0.4375,1.0,0.0625,0.0,0.1875,0.3125,ΑΗΔΕΣ,αηδες,Α
25861,αηδών,1.0,0.0,1.0,0.4375,1.0,0.0625,0.0,0.1875,0.3125,ΑΗΔΩΝ,αηδων,Α
23521,α,0.875,1.0,1.0,0.6875,0.75,0.75,0.75,0.6875,0.9375,Α,α,Α
17202,αβάπτιστούς,0.25,0.5,0.5,0.375,0.4375,0.0,0.0,0.1875,0.4375,ΑΒΑΠΤΙΣΤΟΥΣ,αβαπτιστους,ΑΒΑΠΤΙΣΤ
5287,αβάπτιστε,0.25,0.5,0.5,0.375,0.4375,0.0,0.0,0.1875,0.4375,ΑΒΑΠΤΙΣΤΕ,αβαπτιστε,ΑΒΑΠΤΙΣΤ


In [141]:
df_GrAFS.shape

(32884, 13)

In [149]:
df_GrAFS.subj.max()

1.0

In [39]:
print('IMPORTANT: It seems that the keywords with the same keyword_STEM, have the same scores for subj,positive,negative ')

IMPORTANT: It seems that the keywords with the same keyword_STEM, have the same scores for subj,positive,negative 


In [44]:
# save as pickle
df_GrAFS.to_pickle('fixed_lexicons/df_GrAFS.pkl')

## Fix KBL lexicon - nGrams

In [47]:
# read the 'KBL.csv' lexicon
df_KBL = pd.read_csv('KBL.tsv', sep='\t')

In [61]:
df_KBL.shape

(190667, 4)

In [49]:
del df_KBL['Unnamed: 0']

In [51]:
# create a new column with the ngram without tonous
df_KBL = delete_tonous(df=df_KBL, column_to_process='ngram', processed_column='ngram2')

In [58]:
# create a new column with the ngram in capital
df_KBL['ngram_CAP'] = df_KBL['ngram2'].str.upper()

In [73]:
# create a column with the length of the ngram - 1:Unigram, 2:Bigram
df_KBL['length'] = df_KBL['ngram'].apply(lambda x: len(x.split(' ')))

In [74]:
df_KBL.head()

Unnamed: 0,ngram,score,ngram2,ngram_CAP,length
0,ντροπή,-14.280374,ντροπη,ΝΤΡΟΠΗ,1
1,ψώνια,-13.504118,ψωνια,ΨΩΝΙΑ,1
2,μωρή,-13.42377,μωρη,ΜΩΡΗ,1
3,φοβάμαι,-13.406769,φοβαμαι,ΦΟΒΑΜΑΙ,1
4,πόνο,-13.34117,πονο,ΠΟΝΟ,1


In [65]:
print('Score Max - Min: ', df_KBL.score.max(), ' - ', df_KBL.score.min())

Score Max - Min:  15.565456192000001  -  -14.2803744446


In [71]:
# check the most positive ngrams
df_KBL[df_KBL.score>13].head(10)

Unnamed: 0,ngram,score,ngram2,ngram_CAP
190656,φίλοι,13.007544,φιλοι,ΦΙΛΟΙ
190657,φίλη,13.173239,φιλη,ΦΙΛΗ
190658,αρέσει,13.392583,αρεσει,ΑΡΕΣΕΙ
190659,χαρά,13.449014,χαρα,ΧΑΡΑ
190660,φίλε,13.612293,φιλε,ΦΙΛΕ
190661,ωραία,13.761981,ωραια,ΩΡΑΙΑ
190662,αγάπη,13.965036,αγαπη,ΑΓΑΠΗ
190663,άρεσε ένα,14.227457,αρεσε ενα,ΑΡΕΣΕ ΕΝΑ
190664,μου άρεσε,14.292916,μου αρεσε,ΜΟΥ ΑΡΕΣΕ
190665,άρεσε,14.408254,αρεσε,ΑΡΕΣΕ


In [72]:
# check the most negative ngrams
df_KBL[df_KBL.score<-13].head(10)

Unnamed: 0,ngram,score,ngram2,ngram_CAP
0,ντροπή,-14.280374,ντροπη,ΝΤΡΟΠΗ
1,ψώνια,-13.504118,ψωνια,ΨΩΝΙΑ
2,μωρή,-13.42377,μωρη,ΜΩΡΗ
3,φοβάμαι,-13.406769,φοβαμαι,ΦΟΒΑΜΑΙ
4,πόνο,-13.34117,πονο,ΠΟΝΟ
5,άγχος,-13.268268,αγχος,ΑΓΧΟΣ
6,σκατά,-13.180413,σκατα,ΣΚΑΤΑ
7,για ψώνια,-13.121716,για ψωνια,ΓΙΑ ΨΩΝΙΑ
8,χάλια,-13.020112,χαλια,ΧΑΛΙΑ


In [142]:
df_KBL.shape

(190667, 5)

In [147]:
df_KBL[df_KBL.length==2].shape

(138090, 5)

In [78]:
# save as a pickle file
df_KBL.to_pickle('fixed_lexicons/df_KBL.pkl')

## Fix greek_sentiment_lexicon.tsv

In [120]:
# read the 'greek_sentiment_lexicon_2.tsv' lexicon
df_lex = pd.read_csv('greek_sentiment_lexicon_2.tsv', sep='\t')

In [121]:
df_lex.shape

(2315, 46)

In [122]:
# drop duplicate 'Terms'
df_lex = df_lex.drop_duplicates(subset=['Term'], keep='first')

In [123]:
df_lex.shape

(2256, 46)

In [124]:
# remove the last 9 columns that are not important
df_lex = df_lex[df_lex.columns[0:-9]]

In [125]:
df_lex.head()

Unnamed: 0,Term,POS1,POS2,POS3,POS4,Subjectivity1,Subjectivity2,Subjectivity3,Subjectivity4,Polarity1,Polarity2,Polarity3,Polarity4,Anger1,Anger2,Anger3,Anger4,Disgust1,Disgust2,Disgust3,Disgust4,Fear1,Fear2,Fear3,Fear4,Happiness1,Happiness2,Happiness3,Happiness4,Sadness1,Sadness2,Sadness3,Sadness4,Surprise1,Surprise2,Surprise3,Surprise4
0,αβάφτιστος,ADJ,ADJ,ADJ,ADJ,SUBJ-,OBJ,SUBJ-,OBJ,BOTH,,BOTH,,3.0,,5.0,,4.0,,5.0,,1.0,,1.0,,1.0,,1.0,,4.0,,1.0,,4.0,,5.0,
1,Χριστός,NOUN,NOUN,NOUN,,SUBJ+,SUBJ-,SUBJ+,SUBJ-,BOTH,BOTH,BOTH,NEG,5.0,5.0,5.0,3.0,4.0,5.0,5.0,1.0,5.0,5.0,5.0,2.0,5.0,5.0,5.0,1.0,5.0,5.0,5.0,1.0,5.0,5.0,5.0,3.0
2,α,INTJ,INTJ,INTJ,INTJ,SUBJ+,SUBJ+,SUBJ+,SUBJ-,BOTH,BOTH,BOTH,BOTH,4.0,5.0,5.0,1.0,5.0,5.0,5.0,1.0,5.0,5.0,5.0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,5.0,5.0,5.0,4.0
3,αβάπτιστος,ADJ,ADJ,ADJ,ADJ,SUBJ-,OBJ,SUBJ-,OBJ,BOTH,,BOTH,,3.0,,5.0,,4.0,,5.0,,1.0,,1.0,,1.0,,1.0,,4.0,,1.0,,4.0,,5.0,
4,αβεβαιότητα,NOUN,NOUN,NOUN,NOUN,OBJ,SUBJ+,OBJ,SUBJ-,,NEG,,NEG,,1.0,,1.0,,1.0,,1.0,,1.0,,4.0,,1.0,,1.0,,1.0,,2.0,,1.0,,1.0


In [126]:
# create a new column with the word without tonous
df_lex = delete_tonous(df=df_lex, column_to_process='Term', processed_column='Term2')

In [127]:
# create a new column with the word in capital
df_lex['Term_CAP'] = df_lex['Term2'].str.upper()

In [128]:
# stemmer
stemmer = GreekStemmer()
df_lex['Term_STEM'] = df_lex['Term_CAP'].apply(lambda x: stemmer.stem(x))

In [129]:
# create a column 'POS_list' with the POS of the 4 rater
df_lex['POS_list'] = df_lex.apply(lambda x: [x['POS1'], x['POS2'],x['POS3'],x['POS4']]
                                 , axis=1)
# take as final POS the most common POS of the 4 raters
df_lex['POS'] = df_lex['POS_list'].apply(lambda x: Counter(x).most_common(1)[0][0])

In [130]:
# create a column 'Sub_list' with the Subjectivity of the 4 rater
df_lex['Sub_list'] = df_lex.apply(lambda x: [x['Subjectivity1'], x['Subjectivity2'],x['Subjectivity3'],x['Subjectivity4']]
                                 , axis=1)
# take as final Subjectivity the most common Subjectivity of the 4 raters
df_lex['Subjectivity'] = df_lex['Sub_list'].apply(lambda x: Counter(x).most_common(1)[0][0])

In [131]:
# create a column 'Polarity_list' with the Polarity of the 4 rater
df_lex['Polarity_list'] = df_lex.apply(lambda x: [x['Polarity1'], x['Polarity2'],x['Polarity3'],x['Polarity4']]
                                 , axis=1)
# take as final POS the most common POS of the 4 raters
df_lex['Polarity'] = df_lex['Polarity_list'].apply(lambda x: Counter(x).most_common(1)[0][0])

In [132]:
# keep only those columns
df_lex2 = df_lex[['Term', 'Term2', 'Term_CAP', 'Term_STEM', 'POS', 'Subjectivity', 'Polarity']]

In [133]:
df_lex2.head()

Unnamed: 0,Term,Term2,Term_CAP,Term_STEM,POS,Subjectivity,Polarity
0,αβάφτιστος,αβαφτιστος,ΑΒΑΦΤΙΣΤΟΣ,ΑΒΑΦΤΙΣΤ,ADJ,SUBJ-,BOTH
1,Χριστός,Χριστος,ΧΡΙΣΤΟΣ,ΧΡΙΣΤ,NOUN,SUBJ+,BOTH
2,α,α,Α,Α,INTJ,SUBJ+,BOTH
3,αβάπτιστος,αβαπτιστος,ΑΒΑΠΤΙΣΤΟΣ,ΑΒΑΠΤΙΣΤ,ADJ,SUBJ-,BOTH
4,αβεβαιότητα,αβεβαιοτητα,ΑΒΕΒΑΙΟΤΗΤΑ,ΑΒΕΒΑΙΟΤΗΤ,NOUN,OBJ,


In [134]:
df_lex2.groupby('POS').count()

Unnamed: 0_level_0,Term,Term2,Term_CAP,Term_STEM,Subjectivity,Polarity
POS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ADJ,414,414,414,414,412,381
ADP,1,1,1,1,1,0
ADV,20,20,20,20,20,13
CONJ,5,5,5,5,5,0
INTJ,18,18,18,18,18,18
NOUN,1284,1284,1284,1284,1281,1162
OTHER,10,10,10,10,10,10
PART,25,25,25,25,25,24
PRON,8,8,8,8,8,2
VERB,458,458,458,458,458,378


In [135]:
df_lex2.groupby('Subjectivity').count()

Unnamed: 0_level_0,Term,Term2,Term_CAP,Term_STEM,POS,Polarity
Subjectivity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
OBJ,268,268,268,268,265,57
SUBJ+,575,575,575,575,571,573
SUBJ-,1407,1407,1407,1407,1403,1364


In [136]:
df_lex2.groupby('Polarity').count()

Unnamed: 0_level_0,Term,Term2,Term_CAP,Term_STEM,POS,Subjectivity
Polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BOTH,101,101,101,101,101,101
NEG,1477,1477,1477,1477,1470,1475
POS,418,418,418,418,417,418


In [137]:
df_lex2.shape

(2256, 7)

In [138]:
df_lex2.Term.nunique()

2256

In [139]:
df_lex2.Term_STEM.nunique()

2123

In [140]:
# save as a pickle file
df_lex2.to_pickle('fixed_lexicons/greek_sentiment_lexicon.pkl')