# Useful scripts for working with post data 
### Language detection and translation with Google and DeepL 
### Sentiment analysis with textblob and textblob-de

In [None]:
# language detection, uses Google
# Due to API limits pause 1 sec after every iteration

from textblob_de import TextBlobDE as TextBlob
from tqdm.notebook import tqdm
import time 

# Call the Google-backed language detection 100 times on a fixed German
# sentence; the one-second pause after each call respects the API limits.
for attempt in tqdm(range(100)):
    sample = TextBlob("Der Park ist wunderschön im Frühling!")
    sample.detect_language()
    time.sleep(1)

In [None]:
# Test of German sentiment analysis textblob-de with pandas. Shows poor results.
import pandas as pd 
# Load the posts workbook into the notebook-wide frame `df` that the
# following cells read and extend.
df = pd.read_excel("posts.xlsx") # rows with post texts and language
# Bare expression: has no effect when this code runs as a plain script.
df

def sentim(text):
    """Return the textblob-de sentiment of a German text.

    Args:
        text: German text to analyse.

    Returns:
        The ``sentiment`` attribute of the ``TextBlobDE`` blob. The caller
        in this file reads index 0 as polarity and index 1 as subjectivity.
    """
    # Import the class under its real name: the notebook only binds it to
    # the alias ``TextBlob`` (and a later cell rebinds that alias to the
    # English textblob class), so the bare name ``TextBlobDE`` was undefined.
    from textblob_de import TextBlobDE

    return TextBlobDE(text).sentiment

# sentim("ich liebe dich") # Sentiment(polarity=1.0, subjectivity=0.0)
# 999 marks rows that were not scored (non-German posts or failures).
# Float sentinels keep both columns float-typed, so the float scores written
# below never have to be forced into an integer column.
df["polarity"] = 999.0
df["subjectivity"] = 999.0

for index, row in tqdm(df.iterrows(), total=len(df)):
    if row["language"] != "de":
        continue
    try:
        currblob = sentim(row["plaintext"])
    except Exception as err:
        # f-string instead of "failed at " + index: the index label is not
        # necessarily a string, and concatenation would raise inside the handler.
        print(f"failed at {index}: {err}")
        continue
    df.loc[index, "polarity"] = currblob[0]
    df.loc[index, "subjectivity"] = currblob[1]

# few phrases recognized, often false sentiment analysis results

In [None]:
# Google translation from any language to English
from googletrans import Translator
# Single shared client; gtrans() below uses it for every request.
translator = Translator()
from tqdm.notebook import tqdm

def gtrans(text):
    """Translate a text with the module-level googletrans ``translator``.

    No source or target language is passed, so the library defaults apply
    (the cell comment describes this as "any language to English").

    Args:
        text: Text to translate.

    Returns:
        The translated text, or "" when the request fails.
    """
    try:
        return translator.translate(text).text
    except Exception:
        # Best effort: a failed request yields an empty translation. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit through, so
        # a long translation run can still be interrupted.
        return ""
    
df["GoogleEN"] = ""
# iterrows is normally avoided for performance reasons; it is used here for
# quick per-row error handling. total= gives the progress bar a known length.
for index, row in tqdm(df.iterrows(), total=len(df)):
    try:
        df.loc[index, "GoogleEN"] = gtrans(row["plaintext"])
    except Exception:
        # Report the row that could not be processed and carry on; Ctrl-C is
        # no longer swallowed as it was by the bare ``except``.
        print(index)

In [None]:
# DeepL offers an API, but that would be overkill for this project.
# Instead, simply use DeepL's document translator,
# e.g. with a Word document that has every post on a new line.
# Afterwards, save the result as a txt file and read it in Python.

# Read the whole DeepL output; the context manager closes the file handle,
# which the previous open(...).read() one-liner never did.
with open(r"deepl_korrektur_output.txt", encoding="utf-8") as handle:
    f = handle.read()

# Build one entry per post: post i-1 is taken as the text between the marker
# "\n<i-1>" and the following marker "\n<i>", with newlines and tabs removed.
allposts = []
for i in tqdm(range(1, 111)):
    start_marker = f"\n{i - 1}"
    end_marker = f"\n{i}"
    post = f.split(start_marker)[1].split(end_marker)[0]
    allposts.append(post.replace("\n", "").replace("\t", ""))

# NOTE(review): this rebinds `df` to a frame that only has the column "col",
# while later cells read df["GoogleEN"], df["DeepLEN"] and df["language"].
# Presumably the frames are merged by hand first ("concat and check
# results") - confirm before running the cells in sequence.
df = pd.DataFrame({'col':allposts})
df.to_excel("deepl_tranlsation.xlsx")

# concat and check results

In [None]:
# sentiment analysis with textblob

from textblob import TextBlob

def senten(text):
    """Return the TextBlob sentiment of a text.

    Args:
        text: Text to analyse.

    Returns:
        ``blob.sentiment`` (callers read index 0 as polarity and index 1 as
        subjectivity), or the sentinel ``[99, 99]`` when the text cannot be
        analysed.
    """
    try:
        return TextBlob(text).sentiment
    except Exception:
        # Best effort: unusable input yields the sentinel. Unlike a bare
        # ``except``, KeyboardInterrupt/SystemExit are no longer swallowed.
        return [99, 99]

# Score both English translations with senten(). Each result column is
# assigned as a whole from the list of scores, so no integer-initialised
# column has float scores written into it row by row.
for text_col, pol_col, sub_col in (
    ("GoogleEN", "SentENpolGoogle", "SentENsubGoogle"),  # Google's translation
    ("DeepLEN", "SentENpolDeepL", "SentENsubDeepL"),  # same for DeepL
):
    scores = [senten(text) for text in tqdm(df[text_col])]
    df[pol_col] = [score[0] for score in scores]  # polarity
    df[sub_col] = [score[1] for score in scores]  # subjectivity

In [None]:
# classify sentiment values to ordinal scale: positive, neutral, negative or unclassifiable
def posneg(text):
    """Map a polarity score to an ordinal sentiment label.

    Args:
        text: Numeric polarity value (despite the parameter name); 99 is the
            sentinel that senten() returns for text it could not analyse.

    Returns:
        "neutral" for 0, "unclassifiable" for 99 or NaN, "negative" for
        values below 0 and "positive" for values above 0.
    """
    import math

    # NaN compares false against everything, so it used to fall through all
    # branches and return None; treat it like the 99 sentinel instead.
    if text == 99 or math.isnan(text):
        return "unclassifiable"
    if text == 0:
        return "neutral"
    return "negative" if text < 0 else "positive"
        
# Turn the Google- and DeepL-based polarity scores into ordinal labels.
df["Sentiment_Google"] = df["SentENpolGoogle"].apply(posneg)
df["Sentiment_Deepl"] = df["SentENpolDeepL"].apply(posneg)

In [None]:
# check if different translation led to same sentiments 

# Default: both translations agree and the Google label is the final one.
df["equal_sentiment"] = True
df["final_sentiment"] = df["Sentiment_Google"]

for label, post in tqdm(df.iterrows()):
    google_label = post["Sentiment_Google"]
    deepl_label = post["Sentiment_Deepl"]
    if google_label == deepl_label:
        continue

    # The labels differ. An "unclassifiable" label always yields to the other
    # one; otherwise a "neutral" label yields to the other one. When both
    # labels carry an opinion, the Google default is left untouched.
    if deepl_label == "unclassifiable":
        df.loc[label, "final_sentiment"] = google_label
    elif google_label == "unclassifiable":
        df.loc[label, "final_sentiment"] = deepl_label
    elif deepl_label == "neutral":
        df.loc[label, "final_sentiment"] = google_label
    elif google_label == "neutral":
        df.loc[label, "final_sentiment"] = deepl_label

    df.loc[label, "equal_sentiment"] = False

# Spot check with boolean filters, e.g. rows that Google could not classify
# but that are still flagged as equal.
tf = df[df["Sentiment_Google"] == "unclassifiable"]
tf[tf["equal_sentiment"] == True]

In [None]:
# One more check: every post whose language is not "en" is marked
# unclassifiable. A single boolean-mask assignment replaces the row-by-row
# iterrows loop.
# NOTE(review): the test reads the `language` column; confirm it holds the
# language after translation, as the original comment implies.
df.loc[df["language"] != "en", "final_sentiment"] = "unclassifiable"
        