Smiley Conversion

In [4]:
from emot.core import emot
from googletrans import Translator  # version 3.1.0a0 use: pip install googletrans==3.1.0a0
import re

#### Function to get the right english translation 
from the 'mean' value in the dictionary returned by the <code>emoticon()</code> function

In [5]:
'''
1. '^[^\s,]+,': match the first word before the first comma
    - ^:        start of the string
    - [^\s,]+:  match any character that is not a space or a comma, one or more times

2. '^[^,]+,\s*(\w+)': match the first word after the first comma if there are multiple words before the comma
    - ^:        start of the string
    - [^,]+:    one or more characters that are not commas, followed by a comma
    - \s*:      zero or more whitespace characters (e.g., spaces or tabs)
    - (\w+):    one or more word characters (e.g., letters, digits, or underscores), captured in a group
    
3. '^\s*([\w\s]+?)\s*or\b': match the first words before the word 'or'
    - ^:            start of the string
    - \s*:          zero or more whitespace characters (e.g., spaces or tabs)
    - ([\w\s]+?):   one or more word characters (e.g., letters, digits, or underscores), or whitespace characters, captured in a non-greedy group
    - \s*:          zero or more whitespace characters
    - or\b:         the word "or", followed by a word boundary (to avoid matching words like "order" or "orange")
'''
def replacer(meanings):
    regex = re.compile(r'^[^\s,]+,')
    match = regex.match(meanings)
    if match:
        return match.group(0)[:-1]
    
    regex = re.compile(r'^[^,]+,\s*(\w+)')
    match = regex.match(meanings)
    if match:
        return match.group(1)
    
    regex = re.compile(r'^\s*([\w\s]+?)\s*or\b')
    match = regex.match(meanings)
    if match:
        return match.group(1)
    
    return meanings

In [6]:
def translate_emoticons(text):
    emotions = emot().emoticons(text)
    correction = 0
    for i, location in enumerate(emotions['location']):
        emoticon = emotions['value'][i]
        start = location[0] + correction
        end = location[1] + correction
        meaning = emotions['mean'][i]
        replacement = replacer(meaning)
        text = text[:start] + replacement + text[end:]
        correction += len(replacement) - len(emoticon)     # correction for the length of the emoticon
    return text
test = 'Ich freu mi ufs Fuessballspiele :))'
translator = Translator()
translator.translate(translate_emoticons(test), src='de', dest='en').text

'I enjoy playing soccer Very Happy face'

#### Translate swiss german to standard german

In [4]:
# need to install googletrans version 3.1.0a0
def translate_swiss_german(text):
    translator = Translator()
    return translator.translate(translator.translate(text, src='de', dest='en').text, src='en', dest='de').text

In [5]:
texte = ['Das isch en super Sach!', 
         'I lieb di au!', 
         "Chrischtbaumschmuck, Brunsli, Nusshüüfeli, Haferflockeguetzli, Zimetstärn hani gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet es Glöögli, Schtilli Nacht, Schternschnuppe,s' Jesuschindli liit i de Chrippe, äs isch zu euch Mänsche uf d'Aerde abe cho"]

In [6]:
for text in texte:
    print(text)
    print(translate_swiss_german(text))
    print()

Das isch en super Sach!
Das ist eine tolle Sache!

I lieb di au!
Ich liebe dich jetzt!

Chrischtbaumschmuck, Brunsli, Nusshüüfeli, Haferflockeguetzli, Zimetstärn hani gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet es Glöögli, Schtilli Nacht, Schternschnuppe,s' Jesuschindli liit i de Chrippe, äs isch zu euch Mänsche uf d'Aerde abe cho
Christbaumschmuck, Brunsli, Nusshüüfeli, Haferflockenkekse, Zimtstangen Hani Gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet Glöögli, Schtilli Nacht, Schternschnuppe, s'Jesuschindli liit i de Chrippe, es ist euch Leuten auf d'Aerde aber cho



#### Time the function with multiple Emoticons

In [7]:
%timeit -r 5 -n 10 translate_emoticons(test)

64.4 ms ± 4.84 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


In [8]:
%timeit -r 5 -n 10 translate_swiss_german(test)

163 ms ± 8.72 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)


chatmania & facebook merge
emoticons übersetzen
2 files: facebook_english, facebook_deutsch (mit der jeweiligen sprache übersetzen)

#### V2 merge, translate chatmania & facebook.full

In [9]:
from emot.core import emot
from googletrans import Translator  # version 3.1.0a0 use: pip install googletrans==3.1.0a0
import re
import pandas as pd
from tqdm import tqdm

In [10]:
FACEBOOK_PATH = "facebook_dataset/facebook.full.csv"
CHATMANIA_PATH = "dataset/chatmania.csv"

In [11]:
def replacer(meanings):
    regex = re.compile(r'^[^\s,]+,')
    match = regex.match(meanings)
    if match:
        return match.group(0)[:-1]
    
    regex = re.compile(r'^[^,]+,\s*(\w+)')
    match = regex.match(meanings)
    if match:
        return match.group(1)
    
    regex = re.compile(r'^\s*([\w\s]+?)\s*or\b')
    match = regex.match(meanings)
    if match:
        return match.group(1)
    
    return meanings

def translate_emoticons(text):
    emotions = emot().emoticons(text)
    correction = 0
    for i, location in enumerate(emotions['location']):
        emoticon = emotions['value'][i]
        start = location[0] + correction
        end = location[1] + correction
        meaning = emotions['mean'][i]
        replacement = replacer(meaning)
        text = text[:start] + replacement + text[end:]
        correction += len(replacement) - len(emoticon)
    return text

#### Check if the sentence IDs are unique and the values can be appended (no duplicates)

In [1]:
import pandas as pd
df_facebook = pd.read_csv('../facebook_dataset/facebook.full.csv')
df_chatmania = pd.read_csv('../dataset/chatmania.csv')
unique_sentence_ids = df_facebook['sentence_id'].unique()
unique_sentence_ids_chatmania = df_chatmania['sentence_id'].unique()
unique_sentence_ids.shape, unique_sentence_ids_chatmania.shape

((56945,), (90897,))

In [2]:
# check if there are ids in both datasets
any([sentence_id in unique_sentence_ids_chatmania for sentence_id in unique_sentence_ids])

False

#### Merge Dataframes

In [3]:
merged_df = pd.concat([df_facebook, df_chatmania], ignore_index=True)
merged_df.shape

(147842, 7)

In [None]:
merged_df.head(3)

#### check for failed translations

In [None]:
import pandas as pd
from googletrans import Translator  # version 3.1.0a0 use: pip install googletrans==3.1.0a0

In [None]:
english_df = pd.read_csv("facebook_dataset/english.csv")
german_df = pd.read_csv("facebook_dataset/german.csv")

In [None]:
any(english_df['translate_success'] == False)

True

In [None]:
any(german_df['translate_success'] == False)

True

In [None]:
english_df.head(3)

Unnamed: 0,sentence_id,sentence_text,translate_success
0,100464,damn dräcksteam it's uruguay!,True
1,13530,Düüüüütschlaaaaaaaaaaaaaand Happy face smiley ...,True
2,110562,I'm betting on Germany.,True


translate the rest

In [None]:
english_df = english_df[english_df['translate_success'] == False]
english_df.shape

(1089, 3)

In [None]:
german_df = german_df[german_df['translate_success'] == False]
german_df.shape

(1089, 3)

In [None]:
indexes = english_df.index
indexes

Int64Index([   424,    425,    427,    428,    431,    432,    433,    434,
               435,    436,
            ...
            146816, 146828, 146905, 147217, 147260, 147273, 147322, 147363,
            147617, 147729],
           dtype='int64', length=1089)

In [None]:
translator = Translator()
while len(indexes) > 0:
    index = indexes[0]
    print(f"Line: {index}", end=" ")
    try:
        english = translator.translate(english_df.loc[index].sentence_text, dest='en').text
        german = translator.translate(german_df.loc[index].sentence_text, dest='de').text
        english_df.loc[index, 'sentence_text'] = english
        german_df.loc[index, 'sentence_text'] = german
        indexes = indexes[1:]
        print(f"English: {english}, German: {german}")
    except:
        print(f"Error")
        continue

Line: 424 English: in that case, German: In diesem Fall
Line: 425 English: For book ha ig scho in december pay u to hats NID OVER CHOME!!, German: Für Buch ha ig scho in dezember zahlt u bis hüte NID ÜBERCHOME!!
Line: 427 English: !The are definitely wrong there!, German: !den sinder definitiv falsch da!
Line: 428 English: !aso really, embarrassing gaz yes probably nüme !!!!!!!!!!!!, German: !aso echt,peinlicher gaz ja wohl nüme!!!!!!!!!!!!
Line: 431 English: very stupid, wil me jez as admin nuemm cha go lie what's going on isch., German: sehr dumm, wil me jez als admin nuemm cha go luege was los isch.
Line: 432 English: If I'm not a creator, I can write on my own., German: Wen me nid kreator gsi isch darf me nuemm uf'd siite go schriibe.
Line: 433 English: I have lost all my permission., German: I ha do alli mini erlaubnis verlore.
Line: 434 English: Cha nuemm go fans update., German: Cha nuemm go Fans aktualisieren.
Line: 435 English: @Tarja I haven't noticed that until now, maybe it

In [None]:
# 424
english = pd.read_csv("facebook_dataset/english.csv")
german = pd.read_csv("facebook_dataset/german.csv")

In [None]:
english.update(english_df)
german.update(german_df)

In [None]:
english.iloc[424], english_df.iloc[0], english.shape

(sentence_id               78935.0
 sentence_text        in that case
 translate_success            True
 Name: 424, dtype: object,
 sentence_id                 78935
 sentence_text        in that case
 translate_success            True
 Name: 424, dtype: object,
 (147842, 3))

In [None]:
english.to_csv("facebook_dataset/english.csv", index=False)
german.to_csv("facebook_dataset/german.csv", index=False)

check if really all values have been translated

In [None]:
any(english['translate_success'] == False)

False

In [None]:
any(german['translate_success'] == False)

False

#### English Sentiment Analysis

In [None]:
import nltk
#nltk.download('vader_lexicon')
'''
Hutto, C.J. & Gilbert, E.E. (2014). 
VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. 
Eighth International Conference on Weblogs and Social Media (ICWSM-14). 
https://doi.org/10.1609/icwsm.v8i1.14550 
'''
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

In [None]:
# Load the CSV file
df = pd.read_csv('facebook_dataset/english.csv')

In [None]:
# Create an instance of the sentiment analyzer
sid = SentimentIntensityAnalyzer()

In [None]:
# Define a function to get the sentiment score for each sentence
def get_sentiment_score(sentence):
    sentiment_scores = sid.polarity_scores(sentence)
    return sentiment_scores['compound']
def get_sentiment_label(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [None]:
test_senctence = df.loc[20].sentence_text
print(f"Text: {test_senctence}, Sentiment Score: {get_sentiment_label(get_sentiment_score(test_senctence))}")


Text: Ghana and Australia, Sentiment Score: neutral
