Smiley Conversion

In [None]:
from emot.core import emot
from googletrans import Translator  # version 3.1.0a0 use: pip install googletrans==3.1.0a0
import re

#### Function to get the right English translation 
from the 'mean' value in the dictionary returned by the <code>emoticons()</code> function

In [None]:
# The patterns are compiled once here instead of on every call.
_FIRST_WORD_BEFORE_COMMA = re.compile(r'^[^\s,]+,')
_FIRST_WORD_AFTER_COMMA = re.compile(r'^[^,]+,\s*(\w+)')
_WORDS_BEFORE_OR = re.compile(r'^\s*([\w\s]+?)\s*\bor\b')


def replacer(meanings):
    r"""Pick one short meaning out of an emoticon description string.

    Three patterns are tried in order; the first one that matches decides the
    result:

    1. ``^[^\s,]+,``: match the first word before the first comma
        - ``^``:        start of the string
        - ``[^\s,]+``:  any character that is not whitespace or a comma, one or
          more times

       The matched word is returned without the trailing comma.

    2. ``^[^,]+,\s*(\w+)``: match the first word after the first comma when
       there are several words before the comma
        - ``^``:        start of the string
        - ``[^,]+``:    one or more characters that are not commas, followed by
          a comma
        - ``\s*``:      zero or more whitespace characters
        - ``(\w+)``:    one or more word characters, captured in a group

    3. ``^\s*([\w\s]+?)\s*\bor\b``: match the words before the word 'or'
        - ``^``:          start of the string
        - ``\s*``:        zero or more whitespace characters
        - ``([\w\s]+?)``: word or whitespace characters, captured non-greedily
        - ``\s*``:        zero or more whitespace characters
        - ``\bor\b``:     the standalone word "or". The boundary after it keeps
          "order" or "orange" from matching; the boundary before it keeps a
          word that merely ends in "or" (e.g. "Horror") from being cut short.

    Args:
        meanings: Description text, e.g. one entry of the 'mean' list returned
            by ``emot().emoticons()``.

    Returns:
        The extracted word(s), or ``meanings`` unchanged when no pattern
        matches.
    """
    match = _FIRST_WORD_BEFORE_COMMA.match(meanings)
    if match:
        return match.group(0)[:-1]  # drop the trailing comma

    match = _FIRST_WORD_AFTER_COMMA.match(meanings)
    if match:
        return match.group(1)

    match = _WORDS_BEFORE_OR.match(meanings)
    if match:
        return match.group(1)

    return meanings

In [None]:
def translate_emoticons(text):
    """Replace every emoticon that emot detects in ``text`` with a short meaning.

    Args:
        text: The string to scan for emoticons.

    Returns:
        ``text`` with each detected emoticon replaced by ``replacer(meaning)``.
    """
    detected = emot().emoticons(text)
    # The reported locations refer to the original string; `shift` tracks how much
    # the string has grown or shrunk through the replacements made so far.
    shift = 0
    findings = zip(detected['location'], detected['value'], detected['mean'])
    for span, symbol, description in findings:
        substitute = replacer(description)
        begin, finish = span[0] + shift, span[1] + shift
        text = text[:begin] + substitute + text[finish:]
        shift += len(substitute) - len(symbol)
    return text
# Sample chat message with several emoticons; reused by the timing cells further down.
test = "Hoi Andre i bi nöd bös :/, :D, :P, :), :(, :)), :))), :-)"
translator = Translator()
# Replace the emoticons first, then translate the result from 'de' to 'en'.
translator.translate(translate_emoticons(test), src='de', dest='en').text

#### Translate swiss german to standard german

In [None]:
# need to install googletrans version 3.1.0a0
def translate_swiss_german(text):
    """Round-trip ``text`` through English and return the German result.

    The text is translated from 'de' to 'en' and that English text is then
    translated from 'en' back to 'de'.

    Args:
        text: The string to translate.

    Returns:
        The text of the second ('en' to 'de') translation.
    """
    translator = Translator()
    english = translator.translate(text, src='de', dest='en').text
    return translator.translate(english, src='en', dest='de').text

In [None]:
# Example sentences that are passed to translate_swiss_german() in the next cell.
texte = ['Das isch en super Sach!', 
         'I lieb di au!', 
         "Chrischtbaumschmuck, Brunsli, Nusshüüfeli, Haferflockeguetzli, Zimetstärn hani gärn, Mailänderli au, Aenisguetzli, Chrischtchindli, es lüütet es Glöögli, Schtilli Nacht, Schternschnuppe,s' Jesuschindli liit i de Chrippe, äs isch zu euch Mänsche uf d'Aerde abe cho"]

In [None]:
# Print each example next to its round-trip translation, separated by a blank line.
for text in texte:
    print(text)
    print(translate_swiss_german(text))
    print()

#### Time the function with multiple Emoticons

In [None]:
# Time the emoticon replacement: 5 repeats of 10 loops each.
%timeit -r 5 -n 10 translate_emoticons(test)

In [None]:
# Time the round-trip translation: 5 repeats of 10 loops each.
# NOTE(review): every loop calls translate_swiss_german(), which performs two translator requests.
%timeit -r 5 -n 10 translate_swiss_german(test)

#### Restliche Emoticons von Hand entfernen

In [None]:
# :-D, <3, 

chatmania & facebook merge
Emoticons übersetzen
2 files: facebook_english, facebook_deutsch (in die jeweilige Sprache übersetzen)

#### Naive Bayes

imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

data loading

In [None]:
# CSV that is read into comments_df in the next cell.
FACEBOOK_PATH = "facebook_dataset/translated.csv"
# CSV that is read into sentiment_df in the next cell.
SENTIMENT_PATH = "dataset/sentiment.csv"

In [None]:
# Load the comment data
comments_df = pd.read_csv(FACEBOOK_PATH)
# Load the sentiment counts data
sentiment_df = pd.read_csv(SENTIMENT_PATH)
# Merge the dataframes based on the sentence_id column
merged_df = pd.merge(comments_df, sentiment_df, on='sentence_id')
# Normalise the text: lower-case it, then strip every character that is neither
# a word character nor whitespace. The removal has to go through the `.str`
# accessor with regex=True: a plain Series.replace() only swaps cells whose
# whole value equals the given string, so it never removed any punctuation.
merged_df["sentence_text"] = (
    merged_df["sentence_text"]
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)
print(comments_df.shape, sentiment_df.shape)
merged_df.head(3)

train/test split

In [None]:
# Features are the sentence strings; targets are the three sentiment columns.
X = merged_df["sentence_text"].values
y = merged_df[['neut', 'neg', 'pos']].values
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Vectorize

In [None]:
# German stop-word list from NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
german_stop_words = stopwords.words('german')

In [None]:
# Bag-of-words counts with the German stop words removed.
vectorizer = CountVectorizer(stop_words=german_stop_words)
# The vocabulary is learned from the training sentences only ...
X_train_counts = vectorizer.fit_transform(X_train)
# ... and the test sentences are encoded with that same vocabulary.
X_test_counts = vectorizer.transform(X_test)

model with naive bayes

In [None]:
# y has three target columns, so the Naive Bayes estimator is wrapped in a
# MultiOutputClassifier before fitting on the training counts.
clf = MultinomialNB()
multi_clf = MultiOutputClassifier(clf, n_jobs=-1)
multi_clf.fit(X_train_counts, y_train)

evaluation

In [None]:
# Classify one hand-written sentence with the fitted vectorizer and model.
text = ["Hoi du schwuli sau"]
new_sentence_counts = vectorizer.transform(text)
prediciton = multi_clf.predict(new_sentence_counts)
# Display names in the same order as the target columns ['neut', 'neg', 'pos'].
sentiments = np.array(['neutral', 'negative', 'positive'])
# np.argmax picks the position of the largest predicted value (the first one on ties).
most_likely_sentiment = sentiments[np.argmax(prediciton)]
print(f'Text: {text}, Sentiment: {most_likely_sentiment}')

In [None]:
# predict the sentiment labels for the test set
y_pred = multi_clf.predict(X_test_counts)
# Display names for the target columns, in the same order as y (['neut', 'neg', 'pos']).
# They are kept in a local list: `classes_` is an attribute of the fitted
# MultiOutputClassifier and must not be overwritten with display strings.
label_names = ['neutral', 'negative', 'positive']
# calculate accuracy, precision, recall, and F1 score for each label
for i in range(y_test.shape[1]):
    label = label_names[i]
    print(f"Label: {label}")
    print(f"Accuracy: {accuracy_score(y_test[:,i], y_pred[:,i])}")
    # zero_division=0 is passed to all three averaged metrics so they treat
    # classes without predicted/true samples the same way and stay warning-free.
    print(f"Precision: {precision_score(y_test[:,i], y_pred[:,i], average='weighted', zero_division=0)}")
    print(f"Recall: {recall_score(y_test[:,i], y_pred[:,i], average='weighted', zero_division=0)}")
    print(f"F1 Score: {f1_score(y_test[:,i], y_pred[:,i], average='weighted', zero_division=0)}")

#### V2 merge, translate chatmania & facebook.full

In [None]:
from emot.core import emot
from googletrans import Translator  # version 3.1.0a0 use: pip install googletrans==3.1.0a0
import re
import pandas as pd
from tqdm import tqdm

In [None]:
# NOTE(review): FACEBOOK_PATH was set to "facebook_dataset/translated.csv" in the
# Naive Bayes section; from this cell on it points at a different file.
FACEBOOK_PATH = "facebook_dataset/facebook.full.csv"
# CSV that is read into df_chatmania below.
CHATMANIA_PATH = "dataset/chatmania.csv"

In [None]:
# The patterns are compiled once here instead of on every call.
_FIRST_WORD_BEFORE_COMMA = re.compile(r'^[^\s,]+,')
_FIRST_WORD_AFTER_COMMA = re.compile(r'^[^,]+,\s*(\w+)')
_WORDS_BEFORE_OR = re.compile(r'^\s*([\w\s]+?)\s*\bor\b')


def replacer(meanings):
    r"""Pick one short meaning out of an emoticon description string.

    Three patterns are tried in order; the first one that matches decides the
    result:

    1. ``^[^\s,]+,``: a single word directly followed by the first comma; the
       word is returned without the comma.
    2. ``^[^,]+,\s*(\w+)``: several words before the first comma; the first
       word after that comma is returned.
    3. ``^\s*([\w\s]+?)\s*\bor\b``: the words in front of the standalone word
       "or" are returned. The boundary after "or" keeps "order" or "orange"
       from matching; the boundary before it keeps a word that merely ends in
       "or" (e.g. "Horror") from being cut short.

    Args:
        meanings: Description text, e.g. one entry of the 'mean' list returned
            by ``emot().emoticons()``.

    Returns:
        The extracted word(s), or ``meanings`` unchanged when no pattern
        matches.
    """
    match = _FIRST_WORD_BEFORE_COMMA.match(meanings)
    if match:
        return match.group(0)[:-1]  # drop the trailing comma

    match = _FIRST_WORD_AFTER_COMMA.match(meanings)
    if match:
        return match.group(1)

    match = _WORDS_BEFORE_OR.match(meanings)
    if match:
        return match.group(1)

    return meanings

def translate_emoticons(text):
    """Replace every emoticon that emot detects in ``text`` with a short meaning.

    Args:
        text: The string to scan for emoticons.

    Returns:
        ``text`` with each detected emoticon replaced by ``replacer(meaning)``.
    """
    detected = emot().emoticons(text)
    # The reported locations refer to the original string; `shift` tracks how much
    # the string has grown or shrunk through the replacements made so far.
    shift = 0
    findings = zip(detected['location'], detected['value'], detected['mean'])
    for span, symbol, description in findings:
        substitute = replacer(description)
        begin, finish = span[0] + shift, span[1] + shift
        text = text[:begin] + substitute + text[finish:]
        shift += len(substitute) - len(symbol)
    return text

#### Load both DataFrames and drop unnecessary cols

In [None]:
# Load the Facebook data and keep only the id and text columns.
df_facebook = pd.read_csv(FACEBOOK_PATH)
df_facebook = df_facebook[['sentence_id', 'sentence_text']]
# NOTE(review): the chatmania frame keeps all of its columns; the translation cell
# further down assigns three-value rows, so this presumably relies on the file
# containing only sentence_id and sentence_text - confirm.
df_chatmania = pd.read_csv(CHATMANIA_PATH)
df_facebook.shape, df_chatmania.shape

#### Check if the sentence IDs are unique and the values can be appended (no duplicates)

In [None]:
# Distinct sentence ids per dataset; comparing these shapes with the frame shapes
# shown above reveals duplicate ids within a dataset.
unique_sentence_ids = df_facebook['sentence_id'].unique()
unique_sentence_ids_chatmania = df_chatmania['sentence_id'].unique()
unique_sentence_ids.shape, unique_sentence_ids_chatmania.shape

In [None]:
# check if there are ids in both datasets
# A set-based disjointness test is used so that the chatmania ids are not scanned
# once for every single facebook id. True means at least one id occurs in both.
not set(unique_sentence_ids).isdisjoint(unique_sentence_ids_chatmania)

#### Merge Dataframes

In [None]:
# Stack both datasets; ignore_index=True gives the result a fresh 0..n-1 index.
merged_df = pd.concat([df_facebook, df_chatmania], ignore_index=True)
merged_df.shape

In [None]:
# Preview the first rows of the combined frame.
merged_df.head(3)

#### Translate emoticon

In [None]:
# tqdm.pandas() registers `progress_apply` on pandas objects, which shows a
# progress bar while the function is applied.
tqdm.pandas()
# Replace the emoticons of every sentence in one pass instead of writing the
# frame back cell by cell while iterating over it with iterrows().
merged_df['sentence_text'] = merged_df['sentence_text'].progress_apply(translate_emoticons)


In [None]:
# Both result frames start as copies of the merged data with every row flagged as
# not translated; a row is only overwritten once both translations succeeded, so
# failed rows keep their original text and translate_success=False.
english_df = merged_df.copy().assign(translate_success=False)
german_df = merged_df.copy().assign(translate_success=False)
translator = Translator()
failed_translations_indexes = []
for i, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]):
    try:
        translated_engl = translator.translate(row['sentence_text'], dest='en').text
        # The German version is produced from the English translation.
        translated_ger = translator.translate(translated_engl, dest='de').text
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the kernel
        # (KeyboardInterrupt) still stops this long-running loop.
        failed_translations_indexes.append(i)
        continue
    # Only the two changed cells are written; this does not depend on the exact
    # number of columns in the frame.
    english_df.at[i, 'sentence_text'] = translated_engl
    english_df.at[i, 'translate_success'] = True
    german_df.at[i, 'sentence_text'] = translated_ger
    german_df.at[i, 'translate_success'] = True
print(f"Failed translations: {len(failed_translations_indexes)}")

In [None]:
# Persist both translated frames without the index column.
# NOTE(review): the retry cell below updates english_df / german_df after this
# write; re-run this cell afterwards if the retried rows should end up in the files.
english_df.to_csv("facebook_dataset/english.csv", index=False)
german_df.to_csv("facebook_dataset/german.csv", index=False)

In [None]:
# Retry the rows whose translation failed in the first pass. The number of passes
# is capped so that a sentence that can never be translated does not keep the
# loop running forever.
MAX_RETRY_PASSES = 3
for _ in range(MAX_RETRY_PASSES):
    if not failed_translations_indexes:
        break
    still_failing = []
    for last_index in failed_translations_indexes:
        # try translating the sentence again
        row = merged_df.loc[last_index]
        try:
            translated_engl = translator.translate(row['sentence_text'], dest='en').text
            translated_ger = translator.translate(translated_engl, dest='de').text
        except Exception:
            # `Exception` rather than a bare `except`, so the kernel can still be interrupted.
            print(f"Failed to translate: Row {last_index}")
            still_failing.append(last_index)
            continue
        # Write to the row that was actually retried (`last_index`), not to the
        # loop variable left over from an earlier cell.
        english_df.at[last_index, 'sentence_text'] = translated_engl
        english_df.at[last_index, 'translate_success'] = True
        german_df.at[last_index, 'sentence_text'] = translated_ger
        german_df.at[last_index, 'translate_success'] = True
    # Keep the same list object; after the loop it holds the rows that still failed.
    failed_translations_indexes[:] = still_failing
    