# Translate Tweets


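Open the Google Cloud SDK Shell (Windows `cmd`), run the command below from the project root, and start Jupyter from that same shell so the kernel inherits the variable. Do not put spaces around `=`: in `cmd`, `set NAME = value` defines a variable whose name ends in a space, so the credentials would not be found.

`set GOOGLE_APPLICATION_CREDENTIALS=.\methods\translation\apikey.json`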


### Import Packages & Define Key Variables

In [1]:
import pandas as pd
import numpy as np
from google.cloud import translate_v2 as translate
import re
import matplotlib.pyplot as plt
import time

# Define rootpath and sub-folders.
# Paths are built by plain string concatenation (rp + dp + ...), so every piece
# must END with a separator; rp previously lacked one, which produced
# '.data\\tweet_data\\...' instead of '.\\data\\tweet_data\\...'.
rp = '.\\'                 # project root (relative to the working directory)
mp = 'methods\\'           # methods folder, relative to rp
dp = 'data\\tweet_data\\'  # tweet data folder, relative to rp

In [110]:
# Shared Translation client. Created on first use (not at definition time) so
# that running this cell does not itself require credentials.
_translate_client = None


def translate_text(text,target='en'):
    """
    Translate `text` into the `target` language using the Google Cloud
    Translation (v2) client.

    The client is created once and reused on later calls; building a new
    client for every tweet repeats the client set-up for no benefit.

    Parameters
    ----------
    text : value passed straight through to `Client.translate`.
    target : str, default 'en'
        Target must be an ISO 639-1 language code.
        https://cloud.google.com/translate/docs/languages

    Returns
    -------
    Whatever `Client.translate` returns, unchanged. Callers in this notebook
    read the 'translatedText' entry of the result.
    """
    global _translate_client
    if _translate_client is None:
        _translate_client = translate.Client()

    result = _translate_client.translate(text, target_language=target)

    return result

### Translate Spanish Tweets

In [118]:
# Load the Spanish tweets and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv -- confirm)
spain_csv_path = rp + dp + 'tweets\\spain_tweets_01122019_01052020_VADER.csv'
spain_tweets = pd.read_csv(spain_csv_path).drop(['Unnamed: 0'], axis=1)

In [119]:
# Set inputs for one translation run (edit these, then re-run the cells below)
df     = spain_tweets  # frame to translate; an alias, not a copy, so columns added to df also appear on spain_tweets
n      = 10  # rows per group: the frame is split into chunks of n rows
st_row = 1000  # row this run starts from; turned into a group offset with st_row // n
loops  = 900  # number of n-row groups to translate in this run (reduced if it would run past the data)

In [120]:
# Create group_index to split df by: rows 0..n-1 -> 0, rows n..2n-1 -> 1, ...
# Computed from row POSITION in one vectorised step, so it does not depend on
# df having a default 0..N-1 index and avoids a per-row Python lambda.
df['group_index'] = np.arange(len(df)) // n

# Use group_index to split df into a list of (group_index, sub-dataframe) pairs
split_df = list(df.groupby('group_index'))

In [121]:
# Create empty list to populate with translated tweet dfs.
# The translation loop appends to this list, so re-run this cell before
# re-running the loop; otherwise previously translated groups are kept and
# end up in the saved csv as well.
trans = []

In [122]:
# Work out which groups this run covers, clamping to the end of the data.
# st_loop / en_loop index into split_df; st_row / en_row are used to name the output file.
st_loop = st_row // n
if st_loop + loops < len(split_df):  # planned iterations do not exceed list length
    en_loop = st_loop + loops
    en_row = st_row + n * loops
else:  # planned iterations do exceed list length
    en_loop = len(split_df)
    en_row = df.shape[0]
    loops = en_loop - st_loop  # groups that will actually be processed

total_rows = df.shape[0]

# Output path does not change during the run, so build it once
out_path = rp + dp + 'tweets\\spain_tweets_01122019_01052020_translations_' + str(st_row) + '_' + str(en_row) + '.csv'

# Translate one group at a time; after every group, everything translated so
# far is written to disk so an interrupted run loses at most one group.
for group_id, group_df in split_df[st_loop:en_loop]:

    # Record start time
    start_time = time.time()

    # Translate tweets (translate_text is called once per tweet)
    translated = group_df['VADER_text'].apply(lambda x: translate_text(x)['translatedText'])

    # Build the output chunk as a NEW frame instead of editing the sub-frame
    # stored in split_df: group_index (only needed for splitting) is dropped and
    # translated_text is added. Leaving split_df untouched means this cell can
    # be re-run without a KeyError on the already-removed column.
    chunk = group_df.drop(columns=['group_index']).assign(translated_text=translated)

    # Append to trans
    trans.append(chunk)

    # Concatenate all tweets translated so far into a dataframe
    trans_df = pd.concat(trans)

    # Save as csv (checkpoint)
    trans_df.to_csv(out_path)

    # Print progress, using the group's own id so the reported rows are the
    # rows actually translated even when st_row is not a multiple of n
    row_lo = group_id * n
    row_hi = min(row_lo + n, total_rows)  # last group may be shorter than n
    print(str(row_lo) + ':' + str(row_hi) + ' of ' + str(total_rows) + ' tweets called', end=" --- ")

    # Print run-time
    print("Loop Run Time (s): %s" % round((time.time() - start_time), 3), end="\r")

9990:10000 of 122707 tweets called --- Loop Run Time (s): 24.948

In [None]:
# Read in all translated tweets.
# Each file name carries a start and end row; consecutive entries of
# grman_bounds give the (start, end) pair of every file, in order.
grman_bounds = list(range(0, 120000, 10000)) + [117408]
grman_parts = [
    pd.read_csv(rp + dp + 'tweets\\grman_tweets_01122019_01052020_translations_' + str(lo) + '_' + str(hi) + '.csv').drop(['Unnamed: 0'], axis=1)
    for lo, hi in zip(grman_bounds[:-1], grman_bounds[1:])
]

# Combine into single dataframe
grman_tweets = pd.concat(grman_parts).reset_index(drop=True)

# Save tweets
# NOTE(review): this save is commented out, unlike the Italian and Spanish cells -- confirm that is intended
#grman_tweets.to_csv(rp + dp + 'tweets\\grman_tweets_01122019_01052020_VADER_translated.csv')

In [None]:
# Read in all translated tweets.
# Each file name carries a start and end row; consecutive entries of
# italy_bounds give the (start, end) pair of every file, in order.
italy_bounds = list(range(0, 110000, 10000)) + [104066]
italy_parts = [
    pd.read_csv(rp + dp + 'tweets\\italy_tweets_01122019_01052020_translations_' + str(lo) + '_' + str(hi) + '.csv').drop(['Unnamed: 0'], axis=1)
    for lo, hi in zip(italy_bounds[:-1], italy_bounds[1:])
]

# Combine into single dataframe
italy_tweets = pd.concat(italy_parts).reset_index(drop=True)

# Save tweets
italy_tweets.to_csv(rp + dp + 'tweets\\italy_tweets_01122019_01052020_VADER_translated.csv')

In [5]:
# Read in all translated tweets.
# Each file name carries a start and end row; consecutive entries of
# spain_bounds give the (start, end) pair of every file, in order. The runs
# were not all the same size, so the boundaries are listed explicitly.
spain_bounds = [
    0, 1000, 10000, 20000, 30000, 31000, 40000, 50000, 60000,
    61000, 70000, 80000, 90000, 91000, 100000, 110000, 120000, 122707,
]
spain_parts = [
    pd.read_csv(rp + dp + 'tweets\\spain_tweets_01122019_01052020_translations_' + str(lo) + '_' + str(hi) + '.csv').drop(['Unnamed: 0'], axis=1)
    for lo, hi in zip(spain_bounds[:-1], spain_bounds[1:])
]

# Combine into single dataframe
spain_tweets = pd.concat(spain_parts).reset_index(drop=True)

# Save tweets
spain_tweets.to_csv(rp + dp + 'tweets\\spain_tweets_01122019_01052020_VADER_translated.csv')