In [1]:
import pandas as pandas
import numpy as numpy
from pathlib import Path
from collections import Counter 
import string

In [2]:
# Load every lemma/negation chunk that exists on disk (indices 0..100) and
# stack them into a single frame. The chunks are collected in a list and
# concatenated once, instead of re-copying the growing frame on every pass.
lemma_dir = Path("LemmaNegation")
chunk_frames = []
for i in range(0, 101):
    # Joining path parts with Path keeps the notebook runnable on non-Windows
    # systems; the original literal embedded a backslash separator.
    chunk_path = lemma_dir / ("LemmaTweetsData" + str(i) + ".pkl")
    if chunk_path.exists():
        # NOTE(review): read_pickle can execute arbitrary code while loading;
        # only point this at chunk files you produced yourself.
        chunk_frames.append(pandas.read_pickle(chunk_path))
# pandas.concat raises on an empty list, so fall back to an empty frame when
# no chunk file was found (same result the original loop produced).
tweetsDF = pandas.concat(chunk_frames) if chunk_frames else pandas.DataFrame()
tweetsDF.to_excel('LemmaDataSet.xlsx')

In [3]:
# The 'Text' column is not used by the rest of the notebook; drop it.
tweetsDF = tweetsDF.drop(columns=['Text'])

## Handle negation

In [4]:
def handle_negation(text, window=3):
    """Fold ``NEG`` markers into the tokens that follow them.

    The text is split on whitespace. Each ``NEG`` token is removed and opens a
    negation scope: the next ``window`` non-punctuation tokens get a ``_NEG``
    suffix. A punctuation token closes the scope early and is kept unchanged.
    A ``NEG`` seen inside an open scope restarts the count.

    Parameters
    ----------
    text : str
        Whitespace-tokenised text that may contain ``NEG`` marker tokens.
    window : int, optional
        Number of tokens to tag after each marker (default 3, the value that
        was previously hard-coded).

    Returns
    -------
    str
        The rewritten tokens joined by single spaces.
    """
    # Compare character by character: `token in string.punctuation` is a
    # substring test, so tokens such as "..." or "!!" were not recognised as
    # punctuation while accidental runs such as "()" were.
    punctuation_chars = set(string.punctuation)
    res = []
    remaining = 0
    for token in text.split():
        if token == 'NEG':
            remaining = window
        elif all(ch in punctuation_chars for ch in token):
            remaining = 0
            res.append(token)
        elif remaining > 0:
            res.append(token + '_NEG')
            remaining -= 1
        else:
            res.append(token)
    return ' '.join(res)


In [5]:
# Only the FilteredText column is needed, so map over that Series directly
# instead of materialising every row with DataFrame.apply(axis=1).
tweetsDF['FilteredText'] = tweetsDF['FilteredText'].apply(handle_negation)

In [6]:
# Split the tweets into one frame per sentiment label.
tweet_labels = tweetsDF['Label']
positive_dataframe = tweetsDF[tweet_labels == 'Positive']
negative_dataframe = tweetsDF[tweet_labels == 'Negative']
neutral_dataframe = tweetsDF[tweet_labels == 'Neutral']

In [7]:
def _token_frequencies(texts):
    """Count whitespace-separated tokens across all strings in ``texts``."""
    return Counter(" ".join(texts).split())


# Token frequencies per sentiment label, plus the corpus-wide frequencies.
positive_counter = _token_frequencies(positive_dataframe["FilteredText"])
negative_counter = _token_frequencies(negative_dataframe["FilteredText"])
neutral_counter = _token_frequencies(neutral_dataframe["FilteredText"])
total_counter = _token_frequencies(tweetsDF['FilteredText'])

In [8]:
# One row per distinct token (the token is the index), seeded with its
# frequency over the whole corpus.
words_summary_dataframe = pandas.DataFrame.from_dict(
    total_counter,
    orient='index',
    columns=['total_count'],
)

In [9]:
def find_word_count(dictionary, key):
    """Return how often ``key`` occurs according to ``dictionary``.

    Parameters
    ----------
    dictionary : Mapping
        Token -> count mapping (a ``Counter`` or a plain ``dict``).
    key : hashable
        Token to look up.

    Returns
    -------
    The stored count, or 0 when ``key`` is absent.
    """
    # .get keeps the "missing means zero" behaviour even for a plain dict,
    # where indexing would raise KeyError.
    return dictionary.get(key, 0)

In [10]:
# Per-label absolute frequency of every token in the summary index.
# The lookup only needs the index labels, so iterate over the index rather
# than building a full row Series for each token with apply(axis=1).
for count_column, label_counter in (
    ('positive_count', positive_counter),
    ('negative_count', negative_counter),
    ('neutral_count', neutral_counter),
):
    words_summary_dataframe[count_column] = [
        find_word_count(label_counter, word)
        for word in words_summary_dataframe.index
    ]

In [11]:
def find_relative_word_count(dictionary, key, total=None):
    """Return the share of ``key`` among all counted tokens, in percent.

    Parameters
    ----------
    dictionary : Mapping
        Token -> count mapping (a ``Counter`` or a plain ``dict``).
    key : hashable
        Token to look up; an absent token counts as 0.
    total : number, optional
        Precomputed sum of all counts. When omitted it is computed from
        ``dictionary``; pass it when calling in a loop so the sum is not
        recomputed for every token.

    Returns
    -------
    float
        ``count * 100 / total``, or 0.0 when the total is zero.
    """
    if total is None:
        total = sum(dictionary.values())
    if not total:
        # An empty mapping previously caused ZeroDivisionError.
        return 0.0
    return dictionary.get(key, 0) * 100 / total

In [12]:
# Per-label relative frequency (percent of all tokens carrying that label).
# The label total is summed once per label; recomputing it for every token
# made this cell quadratic in the vocabulary size.
for rel_column, label_counter in (
    ('rel_positive_count', positive_counter),
    ('rel_negative_count', negative_counter),
    ('rel_neutral_count', neutral_counter),
):
    label_total = sum(label_counter.values())
    words_summary_dataframe[rel_column] = [
        # A label without any tokens yields 0.0 instead of dividing by zero.
        label_counter[word] * 100 / label_total if label_total else 0.0
        for word in words_summary_dataframe.index
    ]

In [44]:
def find_stop_words(margine, pos, neg, neut):
    """Tell whether the three values all lie close to their mean.

    The mean of ``pos``, ``neg`` and ``neut`` is scaled by ``1 - margine``
    and ``1 + margine`` to form an inclusive band. Returns True only when
    every one of the three values falls inside that band, False otherwise.
    """
    mean_share = (pos + neg + neut) / 3
    upper = (1 + margine) * mean_share
    lower = (1 - margine) * mean_share
    return all(lower <= share <= upper for share in (pos, neg, neut))

In [56]:
# Relative tolerance around the mean share handed to find_stop_words.
STOP_WORD_MARGIN = 0.25


def _is_stop_word(row):
    """Apply find_stop_words to the three relative shares of one summary row."""
    return find_stop_words(
        STOP_WORD_MARGIN,
        row['rel_positive_count'],
        row['rel_negative_count'],
        row['rel_neutral_count'],
    )


words_summary_dataframe['stop_words'] = words_summary_dataframe.apply(_is_stop_word, axis=1)

In [57]:
# Keep only the rows flagged as stop words, highest positive share first.
stop_word_rows = words_summary_dataframe[words_summary_dataframe['stop_words']]
stop_words_df = stop_word_rows.sort_values('rel_positive_count', ascending=False)

In [58]:
# Show the stop-word candidates (sorted by descending rel_positive_count).
# NOTE(review): the saved output below lists infrequent_words, positive_words
# and negative_words, columns that are only created by later cells, so it
# was captured after those cells had already run (out-of-order execution).
stop_words_df

Unnamed: 0,total_count,positive_count,negative_count,neutral_count,rel_positive_count,rel_negative_count,rel_neutral_count,stop_words,infrequent_words,positive_words,negative_words
biti,13805,2868,5314,5623,4.990256,4.795639,4.712459,True,False,False,False
.,13959,2827,5267,5865,4.918917,4.753224,4.915271,True,False,False,False
",",11554,2327,4512,4715,4.048928,4.071871,3.951493,True,False,False,False
da,9882,1767,3872,4243,3.074541,3.494301,3.555924,True,False,False,False
i,6481,1435,2453,2593,2.496868,2.213719,2.173111,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
krzno,5,1,2,2,0.001740,0.001805,0.001676,True,False,False,False
blogomanija,6,1,2,3,0.001740,0.001805,0.002514,True,False,False,False
očigledan,5,1,2,2,0.001740,0.001805,0.001676,True,False,False,False
biologija,5,1,2,2,0.001740,0.001805,0.001676,True,False,False,False


In [59]:
def find_infrequent_words(pos, neg, neut):
    """Tell whether a word is too rare to be informative.

    Returns True when the three counts add up to less than 3, or when they
    add up to less than 4 and none of them is zero (their product is
    positive). Returns False otherwise.
    """
    total = pos + neg + neut
    if total < 3:
        return True
    return bool(total < 4 and pos * neg * neut > 0)

In [60]:
def _is_infrequent(row):
    """Apply find_infrequent_words to the three absolute counts of one summary row."""
    return find_infrequent_words(
        row['positive_count'],
        row['negative_count'],
        row['neutral_count'],
    )


words_summary_dataframe['infrequent_words'] = words_summary_dataframe.apply(_is_infrequent, axis=1)

In [61]:
# Keep only the rows flagged as infrequent, most frequent of them first.
infrequent_rows = words_summary_dataframe[words_summary_dataframe['infrequent_words']]
infrequent_words_df = infrequent_rows.sort_values('total_count', ascending=False)

In [62]:
# Show the infrequent-word candidates (sorted by descending total_count).
# NOTE(review): the saved output below includes positive_words and
# negative_words, columns that are only created by later cells, so it was
# captured after those cells had already run (out-of-order execution).
infrequent_words_df

Unnamed: 0,total_count,positive_count,negative_count,neutral_count,rel_positive_count,rel_negative_count,rel_neutral_count,stop_words,infrequent_words,positive_words,negative_words
wannabe,3,1,1,1,0.00174,0.000902,0.000838,False,True,False,False
maloletnički,3,1,1,1,0.00174,0.000902,0.000838,False,True,False,False
cimet,3,1,1,1,0.00174,0.000902,0.000838,False,True,False,False
odspavati,3,1,1,1,0.00174,0.000902,0.000838,False,True,False,False
sarin,3,1,1,1,0.00174,0.000902,0.000838,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
prepis,1,0,0,1,0.00000,0.000000,0.000838,False,True,False,False
brzo_NEG,1,0,0,1,0.00000,0.000000,0.000838,False,True,False,False
izguliti,1,0,1,0,0.00000,0.000902,0.000000,False,True,False,True
Kljajić,1,0,0,1,0.00000,0.000000,0.000838,False,True,False,False


In [63]:
# Every token flagged as infrequent or as a stop word is dropped later on.
is_dropped = words_summary_dataframe['infrequent_words'] | words_summary_dataframe['stop_words']
drop_words_array = words_summary_dataframe.loc[is_dropped].index.tolist()

In [64]:
# Save the drop list, one token per line.
with open('DropWords.txt', 'w+', encoding='utf8') as f:
    f.writelines("%s\n" % item for item in drop_words_array)

In [65]:
def remove_stop_words(stop_array, text):
    """Return ``text`` without the tokens contained in ``stop_array``.

    ``text`` is split on whitespace, every token found in ``stop_array`` is
    discarded, and the remaining tokens are joined with single spaces.
    ``stop_array`` may be any container supporting ``in``; a set gives
    constant-time lookups.
    """
    kept_tokens = [token for token in text.split() if token not in stop_array]
    return ' '.join(kept_tokens)

In [66]:
# Build the lookup once: testing membership against the list scans it for
# every word of every tweet, whereas a frozenset answers in constant time.
drop_words_lookup = frozenset(drop_words_array)
# Only FilteredText is read, so map over that Series instead of whole rows.
tweetsDF['without_sw'] = tweetsDF['FilteredText'].apply(
    lambda text: remove_stop_words(drop_words_lookup, text)
)
tweetsDF

Unnamed: 0,Label,FilteredText,without_sw
1,Negative,u ovaj bus sav nešto kašljati,
16,Negative,oskrnaviti katedrala pozirati gol ispred dubro...,pozirati gol !
22,Negative,"u počinjati sa učenje , tko ja biti zvati , sl...",počinjati učenje zvati slati poruka ili bilo s...
40,Negative,glavni problem Srbija biti taj što svaki bolid...,problem Srbija dobijati medijski genij
45,Negative,pucati sebe u glava . elektrotehnika,pucati glava
...,...,...,...
817,Neutral,posetiti kuća čaj sa,posetiti čaj
843,Positive,hello ! Kortna Koks bivši supruga željeti sav ...,hello ! bivši supruga željeti dobar
852,Positive,biti Vip student ! _emo_happy_face_or_smiley r...,Vip ! _emo_happy_face_or_smiley rok prijava le...
874,Negative,min. nakon skidanje vest gde objašnjavati svoj...,min. skidanje vest gde objašnjavati ostavka iz...


In [67]:
# Persist the tweets, including the new without_sw column, as a pickle file
# in the current working directory.
tweetsDF.to_pickle('TweetsWithoutSW.pkl')

In [68]:
def positive_words(margine, pos, neut, neg):
    """Tell whether ``pos`` clearly dominates the other two values.

    The threshold is ``margine * pos``. Returns True when ``neg`` is at most
    the threshold and ``neut`` is strictly below it, False otherwise.
    """
    ceiling = margine * pos
    return bool(neg <= ceiling and neut < ceiling)

In [69]:
def negative_words(margine, pos, neut, neg):
    """Tell whether ``neg`` clearly dominates the other two values.

    The threshold is ``margine * neg``. Returns True when ``pos`` is at most
    the threshold and ``neut`` is strictly below it, False otherwise.
    """
    ceiling = margine * neg
    return bool(pos <= ceiling and neut < ceiling)

In [70]:
# Flag tokens whose relative share is dominated by one polarity. Both
# classifiers receive margin 0.5 and the shares in the order
# (positive, neutral, negative).
for flag_column, classifier in (
    ('positive_words', positive_words),
    ('negative_words', negative_words),
):
    words_summary_dataframe[flag_column] = words_summary_dataframe.apply(
        lambda row, classify=classifier: classify(
            0.5,
            row['rel_positive_count'],
            row['rel_neutral_count'],
            row['rel_negative_count'],
        ),
        axis=1,
    )

In [74]:
# Polarity words are only trusted when the token occurs more than 3 times.
is_frequent_enough = words_summary_dataframe['total_count'] > 3
positive_array = words_summary_dataframe.loc[
    is_frequent_enough & words_summary_dataframe['positive_words']
].index.tolist()
negative_array = words_summary_dataframe.loc[
    is_frequent_enough & words_summary_dataframe['negative_words']
].index.tolist()

In [75]:
# Save the positive word list, one token per line.
with open('PositiveWordsFromDataset.txt', 'w+', encoding='utf8') as f:
    f.writelines("%s\n" % item for item in positive_array)

In [76]:
# Save the negative word list, one token per line.
with open('NegativeWordsFromDataset.txt', 'w+', encoding='utf8') as f:
    f.writelines("%s\n" % item for item in negative_array)