Smiley Conversion

In [1]:
from emot.core import emot
from googletrans import Translator  # version 3.1.0a0 use: pip install googletrans==3.1.0a0
import re

#### Function to get the right english translation 
from the 'mean' value in the dictionary returned by the <code>emoticon()</code> function

In [2]:
'''
1. '^[^\s,]+,': match the first word before the first comma
    - ^:        start of the string
    - [^\s,]+:  match any character that is not a space or a comma, one or more times

2. '^[^,]+,\s*(\w+)': match the first word after the first comma if there are multiple words before the comma
    - ^:        start of the string
    - [^,]+:    one or more characters that are not commas, followed by a comma
    - \s*:      zero or more whitespace characters (e.g., spaces or tabs)
    - (\w+):    one or more word characters (e.g., letters, digits, or underscores), captured in a group
    
3. '^\s*([\w\s]+?)\s*or\b': match the first words before the word 'or'
    - ^:            start of the string
    - \s*:          zero or more whitespace characters (e.g., spaces or tabs)
    - ([\w\s]+?):   one or more word characters (e.g., letters, digits, or underscores), or whitespace characters, captured in a non-greedy group
    - \s*:          zero or more whitespace characters
    - or\b:         the word "or", followed by a word boundary (to avoid matching words like "order" or "orange")
'''
def replacer(meanings):
    regex = re.compile(r'^[^\s,]+,')
    match = regex.match(meanings)
    if match:
        return match.group(0)[:-1]
    
    regex = re.compile(r'^[^,]+,\s*(\w+)')
    match = regex.match(meanings)
    if match:
        return match.group(1)
    
    regex = re.compile(r'^\s*([\w\s]+?)\s*or\b')
    match = regex.match(meanings)
    if match:
        return match.group(1)
    
    return meanings

In [33]:
def translate_emoticons(text):
    emotions = emot().emoticons(text)
    translator = Translator()
    correction = 0
    for i, location in enumerate(emotions['location']):
        emoticon = emotions['value'][i]
        start = location[0] + correction
        end = location[1] + correction
        meaning = emotions['mean'][i]
        replacement = replacer(meaning)
        text = text[:start] + replacement + text[end:]
        correction += len(replacement) - len(emoticon)     # correction for the length of the emoticon
    return translator.translate(translator.translate(text, dest='en').text, dest='de').text
test = "Hoi Andre i bi nöd bös :/, :D, :P, :), :(, :)), :))), :-)"
translate_emoticons(test)

'Hoi Andre i bi nöd bös Skeptisch, lachend, frech, glückliches Gesicht, Stirnrunzeln, sehr glückliches Gesicht, sehr sehr glückliches Gesicht, glückliches Gesicht Smiley'

#### Translate swiss german to standard german

In [4]:
# need to install googletrans version 3.1.0a0
def translate_swiss_german(text):
    translator = Translator()
    return translator.translate(translator.translate(text, src='de', dest='en').text, src='en', dest='de').text

In [5]:
texte = ['Das isch en super Sach!', 
         'I lieb di au!', 
         "Chrischtbaumschmuck, Brunsli, Nusshüüfeli, Haferflockeguetzli, Zimetstärn hani gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet es Glöögli, Schtilli Nacht, Schternschnuppe,s' Jesuschindli liit i de Chrippe, äs isch zu euch Mänsche uf d'Aerde abe cho"]

In [6]:
for text in texte:
    print(text)
    print(translate_swiss_german(text))
    print()

Das isch en super Sach!
Das ist eine tolle Sache!

I lieb di au!
Ich liebe dich jetzt!

Chrischtbaumschmuck, Brunsli, Nusshüüfeli, Haferflockeguetzli, Zimetstärn hani gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet es Glöögli, Schtilli Nacht, Schternschnuppe,s' Jesuschindli liit i de Chrippe, äs isch zu euch Mänsche uf d'Aerde abe cho
Christbaumschmuck, Brunsli, Nusshüüfeli, Haferflockenkekse, Zimtstangen Hani Gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet Glöögli, Schtilli Nacht, Schternschnuppe, s'Jesuschindli liit i de Chrippe, es ist euch Leuten auf d'Aerde aber cho



#### Time the function with multiple Emoticons

In [7]:
%timeit -r 5 -n 10 translate_emoticons(test)

1.15 s ± 172 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


In [8]:
%timeit -r 5 -n 10 translate_swiss_german(test)

259 ms ± 11.2 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


#### Naive Bayes

imports

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

data loading

In [22]:
FACEBOOK_PATH = "facebook_dataset/translated.csv"
SENTIMENT_PATH = "dataset/sentiment.csv"

In [23]:
# Load the comment data
comments_df = pd.read_csv(FACEBOOK_PATH)
# Load the sentiment counts data
sentiment_df = pd.read_csv(SENTIMENT_PATH)
# Merge the dataframes based on the sentence_id column
merged_df = pd.merge(comments_df, sentiment_df, on='sentence_id')
merged_df["sentence_text"] = merged_df["sentence_text"].str.lower().replace('[^\w\s]','')
print(comments_df.shape, sentiment_df.shape)
merged_df.head(3)

(56945, 7) (2799, 6)


Unnamed: 0,comment_id,status_id,parent_id,sentence_number,md5_hash,sentence_id,sentence_text,un,unsure,neut,neg,pos
0,101180699934084_27535,92886373210_101180699934084,-1,0,38540bcfd66b046240b62b9e3482442c,61217,ghana und serbien: smiley mit fröhlichem gesicht,1,0,0,0,0
1,101180699934084_28236,92886373210_101180699934084,-1,1,3fee57e44994d5f406f797e2e9819c13,30919,wer sonst?,1,0,0,0,0
2,10150099003948783_14797768,92886373210_10150099003948783,-1,0,58dc5fa1f3b24b6d082954b6712b5589,106557,z.b. was ist e-fize,0,0,2,0,0


train/test split

In [24]:
X = merged_df["sentence_text"].values
y = merged_df[['neut', 'neg', 'pos']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Vectorize

In [25]:
german_stop_words = stopwords.words('german')

In [26]:
vectorizer = CountVectorizer(stop_words=german_stop_words)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

model with naive bayes

In [27]:
clf = MultinomialNB()
multi_clf = MultiOutputClassifier(clf, n_jobs=-1)
multi_clf.fit(X_train_counts, y_train)

evaluation

In [28]:
text = ["Super, Schön halli hallo"]
new_sentence_counts = vectorizer.transform(text)
prediciton = multi_clf.predict(new_sentence_counts)
sentiments = np.array(['neutral', 'negative', 'positive'])
most_likely_sentiment = sentiments[np.argmax(prediciton)]
print(f'Text: {text}, Sentiment: {most_likely_sentiment}')

Text: ['Super, Schön halli hallo'], Sentiment: positive


In [29]:
# predict the sentiment probabilities for the test set
y_pred = multi_clf.predict(X_test_counts)
multi_clf.classes_ = ['neutral', 'negative', 'positive']
# calculate accuracy, precision, recall, and F1 score for each label
for i in range(y_test.shape[1]):
    label = multi_clf.classes_[i]
    print(f"Label: {label}")
    print(f"Accuracy: {accuracy_score(y_test[:,i], y_pred[:,i])}")
    print(f"Precision: {precision_score(y_test[:,i], y_pred[:,i], average='weighted', zero_division=0)}")
    print(f"Recall: {recall_score(y_test[:,i], y_pred[:,i], average='weighted')}")
    print(f"F1 Score: {f1_score(y_test[:,i], y_pred[:,i], average='weighted')}")

Label: neutral
Accuracy: 0.5099009900990099
Precision: 0.4426765292808351
Recall: 0.5099009900990099
F1 Score: 0.4543443236512543
Label: negative
Accuracy: 0.8960396039603961
Precision: 0.8028869718655034
Recall: 0.8960396039603961
F1 Score: 0.8469094951270585
Label: positive
Accuracy: 0.7920792079207921
Precision: 0.6273894716204294
Recall: 0.7920792079207921
F1 Score: 0.700180515289098
