## 2. Preprocessing of Tweets

### Import packages

In [None]:
import pandas as pd
import glob
import re
from tqdm.auto import tqdm
from transformers import AutoModelWithLMHead, AutoTokenizer,AutoModelForSeq2SeqLM, pipeline
tqdm.pandas()

In [None]:
# pd.set_option('display.max_colwidth', None)

### Regex patterns

In [None]:
# File-name patterns for paths such as '.../tweets_<keyword>_<lang>.csv'
pattern_filename = r'tweets_[a-z]*_[a-z]{2}'
# The dot before 'csv' is escaped so that it only matches a literal '.'
pattern_keyword = r'_([a-z]+)_[a-z]{2}\.csv'
pattern_lang = r'_([a-z]{2})\.csv'
# pattern to remove url, \n, \xNN, #hashtag, @user, and \
# (applied to repr()-escaped text, hence the literal backslash alternatives)
pattern_sub = r'https://t\.co/\w{9,10}|\\n|\\x\w{2}|#\w+|@\w+|\\'

### Functions

In [None]:
# Number of tweets per keyword and language
def stats_keyword_lang(filepath):
    """Count tweets per keyword and language for the CSV files matching a glob.

    Parameters
    ----------
    filepath : str
        Glob pattern selecting the CSV files to inspect. The keyword and the
        language are parsed from each file name with ``pattern_keyword`` and
        ``pattern_lang``.

    Returns
    -------
    pandas.DataFrame
        One row per matching file with the columns ``keyword``, ``lang``,
        ``nrow`` (all rows), ``nrow_non_null`` (rows whose ``text`` is not
        NaN) and ``pct_non_null`` (``nrow_non_null / nrow * 100``).
    """
    keyword = []
    lang = []
    nrow = []
    nrow_non_null = []
    # A separate loop variable keeps the glob pattern argument intact.
    for csv_path in tqdm(glob.glob(filepath)):
        keyword_match = re.search(pattern_keyword, csv_path)
        lang_match = re.search(pattern_lang, csv_path)
        if keyword_match is None or lang_match is None:
            # Files that do not follow the '<keyword>_<lang>.csv' naming
            # (e.g. a combined 'tweets_<keyword>_translated.csv') are skipped
            # instead of failing on `None.group(1)`.
            continue
        temp_df = pd.read_csv(csv_path, lineterminator='\n', encoding='latin-1')
        keyword.append(keyword_match.group(1))
        lang.append(lang_match.group(1))
        nrow.append(len(temp_df))
        nrow_non_null.append(int(temp_df['text'].notna().sum()))

    df = pd.DataFrame({
        'keyword': keyword,
        'lang': lang,
        'nrow': nrow,
        'nrow_non_null': nrow_non_null,
    })
    df['pct_non_null'] = df['nrow_non_null'] / df['nrow'] * 100
    return df

### Concatenate csv per keyword and language

In [None]:
# Select a subset of tweets for four keywords in fr, es, and pt
keywords = ['ukraine', 'russia', 'zelenskyy', 'putin']
langs = ['fr', 'es', 'pt']
for keyword in keywords:
    for lang in langs:
        # Collect the per-file frames and concatenate once: concatenating and
        # re-sorting inside the loop copies the growing frame on every file.
        frames = []
        for filepath in tqdm(glob.glob(f'../data/tweets/tweets_{keyword}/tweets_{keyword}_{lang}/*')):
            df_temp = pd.read_csv(filepath, lineterminator='\n', encoding='latin-1')
            df_temp = df_temp.dropna(subset=['location'])  # remove tweets with NaN in location
            # keep at most 500 tweets per input file
            # (one file per day, keyword and language according to the original comment)
            frames.append(df_temp.head(500))
        if not frames:
            # No input files for this keyword/language: nothing to write.
            continue
        df = pd.concat(frames).sort_values('date')
        # remove tweets with NaN in tweet_id or text and drop duplicated text
        df = df.dropna(subset=['tweet_id', 'text']).drop_duplicates('text')
        df.to_csv(f'../data/tweets_final/tweets_{keyword}_{lang}.csv', index=False)
        

In [None]:
# Tweet counts per keyword and language for the concatenated files
filepath = '../data/tweets_final/*'
df = stats_keyword_lang(filepath=filepath)

In [None]:
# Display the per-file counts, largest number of non-null tweets first
df.sort_values(by='nrow_non_null', ascending=False)

In [None]:
# Sum only the count columns per keyword: a blanket .sum() would also add up
# the per-file percentages (and may concatenate the 'lang' strings), so the
# percentage is recomputed from the summed counts instead.
(
    df.groupby('keyword')[['nrow', 'nrow_non_null']]
    .sum()
    .assign(pct_non_null=lambda totals: totals['nrow_non_null'] / totals['nrow'] * 100)
    .sort_values('nrow_non_null', ascending=False)
)

In [None]:
# Sum only the count columns per language: a blanket .sum() would also add up
# the per-file percentages (and may concatenate the 'keyword' strings), so the
# percentage is recomputed from the summed counts instead.
(
    df.groupby('lang')[['nrow', 'nrow_non_null']]
    .sum()
    .assign(pct_non_null=lambda totals: totals['nrow_non_null'] / totals['nrow'] * 100)
    .sort_values('nrow_non_null', ascending=False)
)

In [None]:
# Total number of non-null tweets in languages other than English
df.loc[df['lang'] != 'en', 'nrow_non_null'].sum()

### Clean tweets
Remove the `b'...'` wrapper, URLs, `\n`, `\xNN` escapes, #hashtags, @user mentions, and stray backslashes.

In [None]:
def _unwrap_bytes_literal(raw):
    """Return *raw* without a surrounding b'...' or b"..." wrapper, if present."""
    # str.strip("b'") would treat its argument as a set of characters and also
    # eat genuine leading/trailing 'b' letters of the tweet, so the wrapper is
    # removed only when it is really there.
    if len(raw) >= 3 and raw[0] == 'b' and raw[1] in ('"', "'") and raw[-1] == raw[1]:
        return raw[2:-1]
    return raw


def clean_tweet_text(x):
    """Strip the bytes-literal wrapper and everything matched by pattern_sub.

    The value is converted with ``str()``, unwrapped, and passed through
    ``repr()`` so that backslashes and non-printable characters appear as
    literal escape sequences that ``pattern_sub`` can match.
    """
    unwrapped = _unwrap_bytes_literal(str(x))
    # [1:-1] drops exactly the quote pair added by repr(), whichever quote
    # character it chose.
    escaped = repr(unwrapped)[1:-1]
    return re.sub(pattern_sub, '', escaped)


# Clean the text column of every concatenated file in place
for filepath in tqdm(glob.glob('../data/tweets_final/*')):
    df = pd.read_csv(filepath, lineterminator='\n', encoding='latin-1')
    df['text'] = df['text'].apply(clean_tweet_text)
    df.to_csv(filepath, index=False)

In [None]:
# Tweet counts per keyword and language after the text has been cleaned
filepath = '../data/tweets_final/*'
df = stats_keyword_lang(filepath=filepath)

In [None]:
# Display the per-file counts, largest number of non-null tweets first
df.sort_values(by='nrow_non_null', ascending=False)

In [None]:
# Sum only the count columns per keyword: a blanket .sum() would also add up
# the per-file percentages (and may concatenate the 'lang' strings), so the
# percentage is recomputed from the summed counts instead.
(
    df.groupby('keyword')[['nrow', 'nrow_non_null']]
    .sum()
    .assign(pct_non_null=lambda totals: totals['nrow_non_null'] / totals['nrow'] * 100)
    .sort_values('nrow_non_null', ascending=False)
)

In [None]:
# Total number of non-null tweets in languages other than English
df.loc[df['lang'] != 'en', 'nrow_non_null'].sum()

### Select subset of tweets and delete potential fake accounts

In [None]:
# Keep tweets up to the cutoff date, drop accounts that look fake, and save
# the result under tweets_translated
keywords = ['ukraine', 'russia', 'zelenskyy', 'putin']
for keyword in keywords:
    for filepath in tqdm(glob.glob(f'../data/tweets_final/tweets_{keyword}*')):
        df = pd.read_csv(filepath, lineterminator='\n', encoding='latin-1')

        # Dates that do not parse with the expected format are coerced to NaT
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
        within_period = df['date'] <= '2022-04-16'
        df = df[within_period]

        # Non-numeric counts are coerced to NaN and then treated as 0
        for count_col in ('friends_ount', 'followers_ount'):
            df[count_col] = pd.to_numeric(df[count_col], errors='coerce').fillna(0)

        # An account is considered potentially fake when it has at most 20
        # friends, at most 5 followers, and no account description
        few_friends = df['friends_ount'] <= 20
        few_followers = df['followers_ount'] <= 5
        no_description = df['acct_desc'].isnull()
        df = df[~(few_friends & few_followers & no_description)]

        filepath_new = filepath.replace('tweets_final', 'tweets_translated')
        df.to_csv(filepath_new, index=False)

In [None]:
# Tweet counts per keyword and language after the date and fake-account filters
filepath = '../data/tweets_translated/*'
df = stats_keyword_lang(filepath=filepath)

In [None]:
# Sum only the count columns per language: a blanket .sum() would also add up
# the per-file percentages (and may concatenate the 'keyword' strings), so the
# percentage is recomputed from the summed counts instead.
(
    df.groupby('lang')[['nrow', 'nrow_non_null']]
    .sum()
    .assign(pct_non_null=lambda totals: totals['nrow_non_null'] / totals['nrow'] * 100)
    .sort_values('nrow_non_null', ascending=False)
)

In [None]:
# Total number of non-null tweets in languages other than English
df.loc[df['lang'] != 'en', 'nrow_non_null'].sum()

### Translation of tweets to English

In [None]:
# Pretrained models
# AutoModelWithLMHead is deprecated in transformers; all three checkpoints are
# loaded through AutoModelForSeq2SeqLM (the opus-mt checkpoints are used with
# translation pipelines below, i.e. as encoder-decoder models).
model_es_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-es-en")
model_pt_en = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-pt-en-t5")
model_fr_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# Pretrained tokenizers (one per checkpoint, matching the models above)
tokenizer_es_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")
tokenizer_pt_en = AutoTokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
tokenizer_fr_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# Translation pipelines
# The Portuguese checkpoint is driven through the generic text2text task, so
# its output is read from 'generated_text' rather than 'translation_text'.
pipeline_es_en = pipeline("translation_es_to_en", model=model_es_en, tokenizer=tokenizer_es_en)
pipeline_pt_en = pipeline("text2text-generation", model=model_pt_en, tokenizer=tokenizer_pt_en)
pipeline_fr_en = pipeline("translation_fr_to_en", model=model_fr_en, tokenizer=tokenizer_fr_en)

def translate_es_en(text):
    """Translate Spanish *text* to English with ``pipeline_es_en``.

    Generation is capped at ``max_length=400``; the translation of the first
    (only) pipeline result is returned.
    """
    outputs = pipeline_es_en(text, max_length=400)
    return outputs[0]['translation_text']
def translate_pt_en(text):
    """Translate Portuguese *text* to English with ``pipeline_pt_en``.

    Generation is capped at ``max_length=400``; the generated text of the
    first (only) pipeline result is returned.
    """
    # NOTE(review): T5 translation checkpoints are often prompted with a task
    # prefix (e.g. "translate Portuguese to English: "); confirm against the
    # model card whether the raw text should be prefixed here.
    outputs = pipeline_pt_en(text, max_length=400)
    return outputs[0]['generated_text']
def translate_fr_en(text):
    """Translate French *text* to English with ``pipeline_fr_en``.

    Generation is capped at ``max_length=400``; the translation of the first
    (only) pipeline result is returned.
    """
    outputs = pipeline_fr_en(text, max_length=400)
    return outputs[0]['translation_text']

In [None]:
# Translate the text and location columns of every es/pt/fr file in place.
# One translator per language replaces the copy-pasted if/elif branches; the
# former fallback branch called a translate_de_en function that is not defined
# in this notebook and has been dropped.
translators = {
    'es': translate_es_en,
    'pt': translate_pt_en,
    'fr': translate_fr_en,
}
langs = ['es', 'pt', 'fr']
for lang in langs:
    translate = translators[lang]
    filepaths = glob.glob(f'../data/tweets_translated/*_{lang}.csv')
    for filepath in tqdm(filepaths):
        print(filepath)
        df = pd.read_csv(filepath, lineterminator='\n', encoding='latin-1')
        # Missing values are passed through untouched: the pipelines expect
        # strings, and text can be NaN here (presumably when cleaning left
        # nothing of the tweet).
        df['text'] = df['text'].apply(lambda x: translate(x) if not pd.isna(x) else x)
        # Locations additionally lose leading/trailing dots and spaces.
        df['location'] = df['location'].apply(lambda x: translate(x).strip('. ') if not pd.isna(x) else x)
        # NOTE(review): the file is overwritten, so running this cell twice
        # feeds already-translated text to the translators again.
        df.to_csv(filepath, index=False)

In [None]:
# Tweet counts per keyword and language after translation
filepath = '../data/tweets_translated/*'
df = stats_keyword_lang(filepath=filepath)

In [None]:
# Sum only the count columns per keyword: a blanket .sum() would also add up
# the per-file percentages (and may concatenate the 'lang' strings), so the
# percentage is recomputed from the summed counts instead.
(
    df.groupby('keyword')[['nrow', 'nrow_non_null']]
    .sum()
    .assign(pct_non_null=lambda totals: totals['nrow_non_null'] / totals['nrow'] * 100)
    .sort_values('nrow_non_null', ascending=False)
)

### Concatenate translated tweets per keyword

In [None]:
# Concatenate translated tweets per keyword
keywords = ['ukraine', 'russia', 'zelenskyy', 'putin']
for keyword in keywords:
    combined_name = f'tweets_{keyword}_translated.csv'
    frames = []
    for filepath in glob.glob(f'../data/tweets_translated/tweets_{keyword}_*.csv'):
        # The glob also matches the combined file written below; skipping it
        # keeps a second run from appending the previous output to itself.
        if filepath.endswith(combined_name):
            continue
        frames.append(pd.read_csv(filepath, lineterminator='\n', encoding='latin-1'))
    # Concatenate and sort once instead of on every file; with no input files
    # an empty frame is written, as before.
    df = pd.concat(frames).sort_values('date') if frames else pd.DataFrame()
    df.to_csv(f'../data/tweets_translated/{combined_name}', index=False)

### Combine English tweets with non-English tweets

In [None]:
# Columns read from the English files; the translated files are read in full
col_to_keep = [
    'tweet_id',
    'acct_desc',
    'date',
    'location',
    'friends_ount',
    'followers_ount',
    'text',
    'compound',
]
for keyword in keywords:
    english_path = f'../data/tweets_en/tweets_{keyword}_en.csv'
    translated_path = f'../data/tweets_translated/tweets_{keyword}_translated.csv'
    df = pd.read_csv(english_path, usecols=col_to_keep, lineterminator='\n', encoding='latin-1')
    df2 = pd.read_csv(translated_path, lineterminator='\n', encoding='latin-1')
    # NOTE(review): the result overwrites the English input file, so running
    # this cell again appends the translated tweets a second time.
    combined = pd.concat([df, df2])
    combined.to_csv(english_path, index = False)