In [1]:
import os
import re
import string

import numpy as numpy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the preprocessed tweets, drop the 'FilteredText' column and renumber the
# rows 0..n-1. The path is built with os.path.join so it resolves on any OS
# (the literal backslash form only works on Windows).
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# produced by this project.
tweets = (
    pd.read_pickle(os.path.join('PickleDataframes', 'TweetsWithoutSW.pkl'))
    .drop('FilteredText', axis=1)
    .reset_index(drop=True)
)

In [3]:
# Binary unigram bag-of-words: one column per vocabulary term, 1 when the term
# occurs in the tweet and 0 otherwise.
countvec = CountVectorizer(ngram_range=(1,1), binary=True)
bow_sparse = countvec.fit_transform(tweets['without_sw'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name only on older releases.
if hasattr(countvec, 'get_feature_names_out'):
    feature_names = countvec.get_feature_names_out()
else:
    feature_names = countvec.get_feature_names()

# Downcast while the matrix is still sparse so the dense array is allocated
# directly as int8 instead of as a wider integer array that is then copied.
tweetsDF = pd.DataFrame(bow_sparse.astype('int8').toarray(), columns=feature_names)

In [4]:
# Summarize row/column counts, dtypes and memory footprint of the dense feature table.
tweetsDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20194 entries, 0 to 20193
Columns: 6416 entries, _emo_cat_face_with_tears_of_joy to žvaka
dtypes: int8(6416)
memory usage: 123.6 MB


In [5]:
# Attach the class label to the feature table. The assignment aligns on the
# index; tweets was re-indexed 0..n-1 right after loading.
tweetsDF['Label']=tweets['Label']

In [6]:
def create_negative_words_array():
    """Build the negative-sentiment lexicon from the two sentiment word files.

    The result contains every whitespace-separated word of
    ``TxtDocs/NegativeSentimentWords.txt`` followed by every word of
    ``TxtDocs/PositiveSentimentWords.txt`` with ``'_NEG'`` appended
    (presumably the negation marker added during preprocessing - verify).

    Paths are built with ``os.path.join`` so the files are found regardless of
    the platform's path separator.

    Returns
    -------
    list of str
        Negative words first, then the ``_NEG``-suffixed positive words, each
        group in file order. Duplicates are kept.

    Raises
    ------
    OSError
        If either word file cannot be opened.
    """
    negative_path = os.path.join('TxtDocs', 'NegativeSentimentWords.txt')
    positive_path = os.path.join('TxtDocs', 'PositiveSentimentWords.txt')

    with open(negative_path, 'r', encoding='utf8') as f:
        negative_words = [word for line in f for word in line.split()]

    with open(positive_path, 'r', encoding='utf8') as f:
        negative_words.extend(word + '_NEG' for line in f for word in line.split())

    return negative_words

In [7]:
def create_positive_words_array():
    """Build the positive-sentiment lexicon from the two sentiment word files.

    The result contains every whitespace-separated word of
    ``TxtDocs/PositiveSentimentWords.txt`` followed by every word of
    ``TxtDocs/NegativeSentimentWords.txt`` with ``'_NEG'`` appended
    (presumably the negation marker added during preprocessing - verify).

    Paths are built with ``os.path.join`` so the files are found regardless of
    the platform's path separator.

    Returns
    -------
    list of str
        Positive words first, then the ``_NEG``-suffixed negative words, each
        group in file order. Duplicates are kept.

    Raises
    ------
    OSError
        If either word file cannot be opened.
    """
    positive_path = os.path.join('TxtDocs', 'PositiveSentimentWords.txt')
    negative_path = os.path.join('TxtDocs', 'NegativeSentimentWords.txt')

    with open(positive_path, 'r', encoding='utf8') as f:
        positive_words = [word for line in f for word in line.split()]

    with open(negative_path, 'r', encoding='utf8') as f:
        positive_words.extend(word + '_NEG' for line in f for word in line.split())

    return positive_words

In [8]:
def negative_words_from_dataset():
    """Read the negative words listed in ``TxtDocs/NegativeWordsFromDataset.txt``.

    The path is built with ``os.path.join`` so the file is found regardless of
    the platform's path separator.

    Returns
    -------
    list of str
        Every whitespace-separated word of the file, in file order, duplicates kept.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    path = os.path.join('TxtDocs', 'NegativeWordsFromDataset.txt')
    with open(path, 'r', encoding='utf8') as f:
        return [word for line in f for word in line.split()]

In [9]:
def positive_words_from_dataset():
    """Read the positive words listed in ``TxtDocs/PositiveWordsFromDataset.txt``.

    The path is built with ``os.path.join`` so the file is found regardless of
    the platform's path separator.

    Returns
    -------
    list of str
        Every whitespace-separated word of the file, in file order, duplicates kept.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    path = os.path.join('TxtDocs', 'PositiveWordsFromDataset.txt')
    with open(path, 'r', encoding='utf8') as f:
        return [word for line in f for word in line.split()]

#

In [10]:
def find_words(word_array,text):
    """Count the whitespace-separated tokens of ``text`` that occur in ``word_array``.

    Parameters
    ----------
    word_array : collection of str
        Lexicon to match against. A ``set``, ``frozenset``, ``dict`` or ``str``
        is used as-is (same membership semantics as before); any other
        collection, e.g. a list, is converted to a set once per call so each
        token costs a hash lookup instead of a linear scan.
    text : str
        Text to scan.

    Returns
    -------
    int
        Number of matching tokens; a token that appears several times in
        ``text`` is counted each time.
    """
    if isinstance(word_array, (set, frozenset, dict, str)):
        lexicon = word_array
    else:
        lexicon = set(word_array)
    return sum(1 for token in text.split() if token in lexicon)

In [11]:
def count_negation(text):
    """Count the whitespace-separated tokens of ``text`` that end with ``'_NEG'``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    int
        Number of tokens whose suffix is exactly ``'_NEG'`` (case-sensitive).
    """
    return sum(1 for token in text.split() if token.endswith('_NEG'))

In [12]:
def count_emo(text):
    """Count the whitespace-separated tokens of ``text`` that start with ``'_emo_'``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    int
        Number of tokens whose prefix is exactly ``'_emo_'`` (case-sensitive).
    """
    return sum(1 for token in text.split() if token.startswith('_emo_'))


In [13]:
# Load the word lists stored in the *WordsFromDataset.txt files.
positive_from_dataset=positive_words_from_dataset()
negative_from_dataset=negative_words_from_dataset()

In [14]:
# Build the sentiment lexicons; each one also contains the opposite polarity's
# words with the '_NEG' suffix appended.
positive_words=create_positive_words_array()
negative_words=create_negative_words_array()

In [15]:
# Per-tweet counts of lexicon hits. Only the 'without_sw' text is needed, so the
# function is applied to that column directly instead of row-wise over the
# whole frame (which builds a full row object for every tweet).
tweetsDF['positive_sentiment'] = tweets['without_sw'].apply(lambda text: find_words(positive_words, text))
tweetsDF['negative_sentiment'] = tweets['without_sw'].apply(lambda text: find_words(negative_words, text))

In [16]:
# Per-tweet count of '_NEG'-suffixed tokens, computed on the text column
# directly rather than row-wise over the whole frame.
tweetsDF['number_of_negations'] = tweets['without_sw'].apply(count_negation)

In [17]:
# Per-tweet hit counts against the dataset-derived word lists, computed on the
# text column directly rather than row-wise over the whole frame.
tweetsDF['positive_fromDS'] = tweets['without_sw'].apply(lambda text: find_words(positive_from_dataset, text))
tweetsDF['negative_fromDS'] = tweets['without_sw'].apply(lambda text: find_words(negative_from_dataset, text))

In [18]:
# Emoji/emoticon placeholder tokens grouped by polarity. They are matched with
# find_words, i.e. by exact whole-token equality, so each string must equal the
# token in 'without_sw' character for character.
# NOTE(review): some entries look misspelled ('_emo_hace_or_smace_smiley',
# '_emo_hace_smace_smiley', '_emo_frown_sad_andry_or_pouting', '_emo_surpised')
# and '_emo_smiling_face_with_open_mouth_' ends with an underscore - presumably
# they mirror the tokens produced by the preprocessing step; verify against the
# data before "correcting" any of them.
positive_emo=['_emo_cat_face_with_tears_of_joy','_emo_clapping_hands','_emo_face_blowing_a_kiss','_emo_face_with_stuck','_emo_hace_or_smace_smiley','_emo_hace_smace_smiley', '_emo_happy_face_or_smiley', '_emo_happy_face_smiley', '_emo_heart_suit', '_emo_kiss', '_emo_laughing_big_grin_or_laugh_with_glasses', '_emo_party_popper', '_emo_red_heart','_emo_smiling_face_with_open_mouth_','_emo_tongue_sticking_out_cheeky_playful_or_blowing_a_raspberry','_emo_wink_or_smirk','_emo_woman_dancing']
negative_emo=['_emo_crying','_emo_embarrassed_or_blushing','_emo_face_screaming_in_fear','_emo_frown_sad_andry_or_pouting','_emo_sadness','_emo_skeptical_annoyed_undecided_uneasy_or_hesitant','_emo_surpised','_emo_surprise',]

In [19]:
# Per-tweet counts of positive / negative emoji tokens, computed on the text
# column directly rather than row-wise over the whole frame.
tweetsDF['positive_emo'] = tweets['without_sw'].apply(lambda text: find_words(positive_emo, text))
tweetsDF['negative_emo'] = tweets['without_sw'].apply(lambda text: find_words(negative_emo, text))

In [20]:
def find_char(char,text):
    """Return how many whitespace-separated tokens of ``text`` equal ``char``.

    The comparison is against whole tokens, not characters inside a token:
    ``'?'`` is counted only where it stands alone between whitespace.

    Parameters
    ----------
    char : str
        Token to look for.
    text : str
        Text to scan.

    Returns
    -------
    int
        Number of tokens equal to ``char``.
    """
    return text.split().count(char)

In [21]:
# Per-tweet counts of stand-alone '?' and '!' tokens, computed on the text
# column directly rather than row-wise over the whole frame.
tweetsDF['question_mark_count'] = tweets['without_sw'].apply(lambda text: find_char('?', text))
tweetsDF['exclamation_mark_count'] = tweets['without_sw'].apply(lambda text: find_char('!', text))

#

In [22]:
# Two-class subset: keep every row whose label is not 'Neutral'.
tweet_fml2cl=tweetsDF.loc[tweetsDF['Label']!='Neutral']

# Classification

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,classification_report,balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import statistics

In [24]:
def ClassificationFunction(classificator,X_train,X_test,y_train,y_test):
    """Fit a classifier, print its test-set report, and return its test accuracy.

    Parameters
    ----------
    classificator : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the report.

    Returns
    -------
    float
        ``accuracy_score`` of the predictions on the test split.

    Side effects
    ------------
    Prints the classification report followed by the confusion matrix.
    """
    classificator.fit(X_train, y_train)
    y_pred = classificator.predict(X_test)

    # Same order as always: per-class report first, confusion matrix second.
    for summarize in (classification_report, confusion_matrix):
        print(summarize(y_test, y_pred))

    return accuracy_score(y_test, y_pred)

In [26]:
# Fit a LabelEncoder on the labels of the full feature table.
# NOTE(review): `tweetsDF.HandLabel = ...` is attribute assignment; pandas does
# not create a column this way (it only emits a warning), and no later cell
# shown here reads the attribute, so the encoded labels are effectively unused.
# Do not simply switch to tweetsDF['HandLabel']: the feature matrix is built
# with tweetsDF.drop('Label', axis=1), so such a column would leak the target
# into the features.
le = preprocessing.LabelEncoder()
le.fit(tweetsDF['Label'])
tweetsDF.HandLabel=le.transform(tweetsDF['Label'])

In [27]:
# Fit a fresh LabelEncoder on the labels of the two-class subset (this rebinds `le`).
# NOTE(review): `tweet_fml2cl.HandLabel = ...` is attribute assignment; pandas
# does not create a column this way (it only emits a warning), and no later
# cell shown here reads the attribute. Do not simply switch to
# tweet_fml2cl['HandLabel']: the feature matrix is built with
# tweet_fml2cl.drop('Label', axis=1), so such a column would leak the target
# into the features.
le = preprocessing.LabelEncoder()
le.fit(tweet_fml2cl['Label'])
tweet_fml2cl.HandLabel=le.transform(tweet_fml2cl['Label'])

### `X`, `y` hold the three-class dataset (positive, negative and neutral); `X2`, `y2` hold the two-class dataset (positive and negative only)

In [28]:
# Three-class task: every column except the target is a feature.
X = tweetsDF.drop('Label', axis=1)
y = tweetsDF['Label']

# Stratified 75/25 hold-out split with a fixed seed so the split is repeatable.
# train_test_split returns the same container type it is given, so X_train and
# X_test are already DataFrames carrying X's columns - no re-wrapping needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=3, stratify=y
)

In [29]:
# Two-class task: every column except the target is a feature.
X2 = tweet_fml2cl.drop('Label', axis=1)
y2 = tweet_fml2cl['Label']

# Stratified 75/25 hold-out split with a fixed seed so the split is repeatable.
# train_test_split returns the same container type it is given, so X2_train and
# X2_test are already DataFrames carrying X2's columns - no re-wrapping needed.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.25, random_state=3, stratify=y2
)

In [30]:
import gc

In [31]:
# The feature tables are no longer needed once X/y and X2/y2 have been derived
# from them; unbind both names and run a collection pass to release the memory.
del tweetsDF, tweet_fml2cl
gc.collect()

80

## MultinomialNB

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
# Multinomial Naive Bayes on the three-class split.
# NOTE(review): alpha=7.3 is not explained anywhere in this notebook - presumably hand-tuned; TODO document.
ClassificationFunction(MultinomialNB(alpha=7.3),X_train,X_test,y_train,y_test) 

              precision    recall  f1-score   support

    Negative       0.63      0.65      0.64      1772
     Neutral       0.58      0.59      0.58      2193
    Positive       0.58      0.53      0.55      1084

    accuracy                           0.60      5049
   macro avg       0.60      0.59      0.59      5049
weighted avg       0.60      0.60      0.60      5049

[[1156  533   83]
 [ 572 1296  325]
 [ 102  411  571]]


0.598732422261834

In [34]:
# Multinomial Naive Bayes on the two-class split.
# NOTE(review): alpha=7 is not explained anywhere in this notebook - presumably hand-tuned; TODO document.
ClassificationFunction(MultinomialNB(alpha=7),X2_train,X2_test,y2_train,y2_test) 

              precision    recall  f1-score   support

    Negative       0.83      0.90      0.86      1772
    Positive       0.81      0.70      0.75      1084

    accuracy                           0.82      2856
   macro avg       0.82      0.80      0.81      2856
weighted avg       0.82      0.82      0.82      2856

[[1596  176]
 [ 328  756]]


0.8235294117647058

## GradientBoost

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

In [34]:
# Gradient boosting on the three-class split. random_state is pinned (as in the
# two-class run) so repeated executions give the same model; the recorded
# output may differ slightly until the cell is re-run.
ClassificationFunction(GradientBoostingClassifier(n_estimators=100, random_state=0),X_train,X_test,y_train,y_test)

precision    recall  f1-score   support

    Negative       0.64      0.63      0.63      1772
     Neutral       0.56      0.61      0.58      2193
    Positive       0.63      0.54      0.58      1084

    accuracy                           0.60      5049
   macro avg       0.61      0.59      0.60      5049
weighted avg       0.60      0.60      0.60      5049

[[1108  599   65]
 [ 583 1335  275]
 [  46  451  587]]


0.6001188354129531

In [35]:
# Gradient boosting on the two-class split, seeded for repeatability.
ClassificationFunction(GradientBoostingClassifier(random_state=0),X2_train,X2_test,y2_train,y2_test) 

precision    recall  f1-score   support

    Negative       0.87      0.90      0.88      1772
    Positive       0.83      0.77      0.80      1084

    accuracy                           0.85      2856
   macro avg       0.85      0.84      0.84      2856
weighted avg       0.85      0.85      0.85      2856

[[1598  174]
 [ 246  838]]


0.8529411764705882

## XGBoost

In [49]:
from xgboost import XGBClassifier

In [37]:
# XGBoost with default hyperparameters on the three-class split.
# NOTE(review): y holds the string labels; recent xgboost releases appear to
# require integer-encoded classes - confirm against the installed version.
ClassificationFunction(XGBClassifier(),X_train,X_test,y_train,y_test) 

precision    recall  f1-score   support

    Negative       0.69      0.58      0.63      1772
     Neutral       0.58      0.71      0.64      2193
    Positive       0.64      0.51      0.57      1084

    accuracy                           0.62      5049
   macro avg       0.64      0.60      0.61      5049
weighted avg       0.63      0.62      0.62      5049

[[1022  683   67]
 [ 379 1567  247]
 [  75  457  552]]


0.6221033868092691

In [37]:
# XGBoost with default hyperparameters on the two-class split.
# NOTE(review): y2 holds the string labels; recent xgboost releases appear to
# require integer-encoded classes - confirm against the installed version.
ClassificationFunction(XGBClassifier(),X2_train,X2_test,y2_train,y2_test)

precision    recall  f1-score   support

    Negative       0.87      0.90      0.88      1772
    Positive       0.82      0.78      0.80      1084

    accuracy                           0.85      2856
   macro avg       0.85      0.84      0.84      2856
weighted avg       0.85      0.85      0.85      2856

[[1592  180]
 [ 243  841]]


0.851890756302521

## SVC

In [35]:
from sklearn.svm import SVC

In [36]:
# Support vector classifier (C=1, other settings left at their defaults) on the three-class split.
ClassificationFunction(SVC(C=1),X_train,X_test,y_train,y_test) 

              precision    recall  f1-score   support

    Negative       0.70      0.57      0.63      1772
     Neutral       0.58      0.75      0.65      2193
    Positive       0.68      0.46      0.55      1084

    accuracy                           0.63      5049
   macro avg       0.65      0.60      0.61      5049
weighted avg       0.64      0.63      0.62      5049

[[1016  704   52]
 [ 359 1647  187]
 [  85  499  500]]


0.6264606852842147

In [37]:
# Support vector classifier (C=2, other settings left at their defaults) on the two-class split.
ClassificationFunction(SVC(C=2),X2_train,X2_test,y2_train,y2_test) 

              precision    recall  f1-score   support

    Negative       0.85      0.90      0.88      1772
    Positive       0.82      0.74      0.78      1084

    accuracy                           0.84      2856
   macro avg       0.84      0.82      0.83      2856
weighted avg       0.84      0.84      0.84      2856

[[1597  175]
 [ 278  806]]


0.8413865546218487

## Mean score

In [25]:
def MeanScorer(classifier,X,y, acc):
    """Cross-validate ``classifier`` with 5 folds and return the mean score.

    Parameters
    ----------
    classifier : estimator
        Estimator handed to ``cross_val_score``.
    X, y
        Features and labels to cross-validate on.
    acc : str or callable
        Passed to ``cross_val_score`` as its ``scoring`` argument.

    Returns
    -------
    float
        Mean of the five fold scores.

    Side effects
    ------------
    Prints the individual fold scores. Because ``error_score='raise'`` is
    used, a failure in any fold propagates instead of being scored.
    """
    fold_scores = cross_val_score(
        classifier,
        X,
        y,
        scoring=acc,
        cv=5,
        error_score='raise',
    )
    print(fold_scores)
    return fold_scores.mean()

In [38]:
# 5-fold cross-validated accuracy of SVC(C=1) on the full three-class data (X, y).
MeanScorer(SVC(C=1), X, y, 'accuracy')

[0.6135182  0.58603615 0.63010646 0.63035405 0.62456662]


0.6169162944598731

In [39]:
# 5-fold cross-validated accuracy of SVC(C=2) on the full two-class data (X2, y2).
MeanScorer(SVC(C=2), X2, y2, 'accuracy')

[0.82669584 0.8083151  0.84507659 0.84726477 0.84325744]


0.8341219481350619