## Preprocessing of tweets

### Import packages

In [40]:
import pandas as pd
import glob
import re
from tqdm.auto import tqdm
from transformers import AutoModelWithLMHead, AutoTokenizer,AutoModelForSeq2SeqLM, pipeline
tqdm.pandas()

In [35]:
# Show full cell contents when displaying DataFrames (no column-width truncation),
# so that long tweet texts can be read in full.
pd.set_option('display.max_colwidth', None)

In [3]:
# Matches the "tweets_<keyword>_<two-letter language code>" part of a path; it is
# used as the base name of the concatenated output CSV.
pattern_filename = r'tweets_[a-z]*_[a-z]{2}'
# Pattern of the fragments removed from tweet text: t.co short URLs, literal "\n",
# literal "\xNN" hex escapes, #hashtags, @user mentions, and any remaining backslash.
# The dots of "t.co" are escaped so that they match a literal "." only (an
# unescaped "." matches any character).
pattern_sub = r'https://t\.co/\w{9,10}|\\n|\\x\w{2}|#\w+|@\w+|\\'

### Concatenate csv per keyword and language

In [None]:
# Walk ../data/tweets/<dir>/<sub-dir>/<files> and merge all CSV files of each
# sub-directory into a single ../data/tweets_final/<name>.csv, where <name> is the
# part of the sub-directory path matched by pattern_filename.
for filepath1 in tqdm(glob.glob('../data/tweets/*')):
    for filepath2 in glob.glob(f'{filepath1}/*'):
        match = re.search(pattern_filename, filepath2)
        if match is None:
            # Without a match there is no output name; skip instead of failing on match.group().
            print(f'Skipping {filepath2}: path does not match {pattern_filename!r}')
            continue
        concat_csv = [
            pd.read_csv(filepath3, lineterminator='\n')
            for filepath3 in glob.glob(f'{filepath2}/*')
        ]
        if not concat_csv:
            # pd.concat raises ValueError on an empty list, so skip empty sub-directories.
            print(f'Skipping {filepath2}: no files found')
            continue
        df = (
            pd.concat(concat_csv)
            .sort_values('date')
            .dropna(subset=['tweet_id'])   # remove tweets w/o tweet_id
            .drop_duplicates('text')       # remove duplicated tweets (first occurrence is kept)
        )
        # index=False: after concat + sort the index is only a shuffled mix of the
        # per-file row numbers, and writing it would add an "Unnamed: 0" column on re-read.
        df.to_csv(f'../data/tweets_final/{match.group(0)}.csv', index=False)

### Clean tweets
Remove the `b'...'` bytes-literal wrapper, URLs, `\n`, `\xNN` escape sequences, #hashtags, @user mentions, and stray backslashes (`\`).

In [None]:
def _clean_tweet_text(raw):
    """Return the tweet text without its bytes-literal wrapper and unwanted fragments.

    Parameters
    ----------
    raw : object
        Value of the ``text`` column; it is converted with ``str()`` first.

    Returns
    -------
    str
        The text with every match of ``pattern_sub`` removed.
    """
    text = str(raw)
    # Remove the b'...' / b"..." wrapper as an exact prefix/suffix pair.
    # str.strip("b'") treats its argument as a *set* of characters, so it also
    # removed genuine leading/trailing "b" letters and apostrophes of the tweet.
    if len(text) >= 3 and text[0] == 'b' and text[1] in '\'"' and text[-1] == text[1]:
        text = text[2:-1]
    # repr() spells real control characters (e.g. a newline) as escape sequences so
    # that pattern_sub can match them; [1:-1] drops exactly the pair of quotes that
    # repr() adds, whichever quote character it chose.
    return re.sub(pattern_sub, '', repr(text)[1:-1])


# Clean the text column of every file in ../data/tweets_final and overwrite the file.
for filepath in tqdm(glob.glob('../data/tweets_final/*')):
    df = pd.read_csv(filepath, lineterminator='\n')
    df['text'] = df['text'].apply(_clean_tweet_text)
    # index=False: otherwise every read/write cycle adds another "Unnamed: 0" column.
    df.to_csv(filepath, index=False)

### Translation of tweets to English

In [104]:
# Pretrained models
# AutoModelWithLMHead is deprecated in transformers in favour of the task-specific
# auto classes. All checkpoints below are used for translation (sequence-to-sequence),
# so AutoModelForSeq2SeqLM is used for every language.
model_es_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-es-en")
model_pt_en = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-pt-en-t5")
model_fr_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
model_de_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-de-en")
model_ru_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
model_uk_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-uk-en")
model_zh_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model_ja_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ja-en")
model_hi_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-hi-en")

# Pretrained tokenizers (each loaded from the same checkpoint as its model)
tokenizer_es_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")
tokenizer_pt_en = AutoTokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
tokenizer_fr_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
tokenizer_de_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")
tokenizer_ru_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")
tokenizer_uk_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-uk-en")
tokenizer_zh_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
tokenizer_ja_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ja-en")
tokenizer_hi_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-hi-en")

# Translation pipelines
# The pt pipeline is a "text2text-generation" pipeline, so its result is read from
# the 'generated_text' key; the others are translation pipelines ('translation_text').
pipeline_es_en = pipeline("translation_es_to_en", model=model_es_en, tokenizer=tokenizer_es_en)
pipeline_pt_en = pipeline("text2text-generation", model=model_pt_en, tokenizer=tokenizer_pt_en)
pipeline_fr_en = pipeline("translation_fr_to_en", model=model_fr_en, tokenizer=tokenizer_fr_en)
pipeline_de_en = pipeline("translation_de_to_en", model=model_de_en, tokenizer=tokenizer_de_en)
pipeline_ru_en = pipeline("translation_ru_to_en", model=model_ru_en, tokenizer=tokenizer_ru_en)
pipeline_uk_en = pipeline("translation_uk_to_en", model=model_uk_en, tokenizer=tokenizer_uk_en)
pipeline_zh_en = pipeline("translation_zh_to_en", model=model_zh_en, tokenizer=tokenizer_zh_en)
pipeline_ja_en = pipeline("translation_ja_to_en", model=model_ja_en, tokenizer=tokenizer_ja_en)
pipeline_hi_en = pipeline("translation_hi_to_en", model=model_hi_en, tokenizer=tokenizer_hi_en)


In [54]:
def _run_translation(translator, text, output_key='translation_text', max_length=400):
    """Run one text through a pipeline and return the string of its first result.

    Parameters
    ----------
    translator : callable
        A pipeline object; it is called as ``translator(text, max_length=...)`` and
        must return a list of dicts.
    text : str
        Text to translate.
    output_key : str
        Key of the result dict that holds the output text.
    max_length : int
        Forwarded to the pipeline call as ``max_length``.
    """
    return translator(text, max_length=max_length)[0][output_key]


# The functions below look up the pipeline_* objects at call time. They previously
# referenced the misspelled names "pipline_*", which are not defined anywhere in the
# notebook and raised NameError on the first call.

def translate_es_en(text):
    """Translate Spanish text to English with pipeline_es_en."""
    return _run_translation(pipeline_es_en, text)


def translate_pt_en(text):
    """Translate Portuguese text to English with pipeline_pt_en.

    pipeline_pt_en is a text2text-generation pipeline, so the output is read from
    the 'generated_text' key instead of 'translation_text'.
    """
    return _run_translation(pipeline_pt_en, text, output_key='generated_text')


def translate_fr_en(text):
    """Translate French text to English with pipeline_fr_en."""
    return _run_translation(pipeline_fr_en, text)


def translate_de_en(text):
    """Translate German text to English with pipeline_de_en."""
    return _run_translation(pipeline_de_en, text)


def translate_ru_en(text):
    """Translate Russian text to English with pipeline_ru_en."""
    return _run_translation(pipeline_ru_en, text)


def translate_uk_en(text):
    """Translate Ukrainian text to English with pipeline_uk_en."""
    return _run_translation(pipeline_uk_en, text)


def translate_zh_en(text):
    """Translate Chinese text to English with pipeline_zh_en."""
    return _run_translation(pipeline_zh_en, text)


def translate_ja_en(text):
    """Translate Japanese text to English with pipeline_ja_en."""
    return _run_translation(pipeline_ja_en, text)


def translate_hi_en(text):
    """Translate Hindi text to English with pipeline_hi_en."""
    return _run_translation(pipeline_hi_en, text)

In [253]:
# Quick check: list the CSV files in ../data/tweets_final whose name ends with the
# given language code (one file per keyword in the output below).
lang = 'es'
glob.glob(f'../data/tweets_final/*_{lang}.csv')

['../data/tweets_final/tweets_nato_es.csv',
 '../data/tweets_final/tweets_macron_es.csv',
 '../data/tweets_final/tweets_scholz_es.csv',
 '../data/tweets_final/tweets_zelenskyy_es.csv',
 '../data/tweets_final/tweets_putin_es.csv',
 '../data/tweets_final/tweets_johnson_es.csv',
 '../data/tweets_final/tweets_biden_es.csv',
 '../data/tweets_final/tweets_russia_es.csv',
 '../data/tweets_final/tweets_eu_es.csv',
 '../data/tweets_final/tweets_ukraine_es.csv']

In [None]:
# Language code -> translation function. A lookup table replaces the former
# if/elif chain, whose final `else` branch (Hindi) did not create the
# 'location_trans' column, leaving the 'hi' files with a different schema.
translators = {
    'es': translate_es_en,
    'pt': translate_pt_en,
    'fr': translate_fr_en,
    'de': translate_de_en,
    'ru': translate_ru_en,
    'uk': translate_uk_en,
    'zh': translate_zh_en,
    'ja': translate_ja_en,
    'hi': translate_hi_en,
}
langs = list(translators)

# NOTE: 'text' is overwritten with its translation and the file is saved in place,
# so running this cell a second time would feed already-translated text to the models.
for lang in tqdm(langs):
    translate = translators[lang]
    filepaths = glob.glob(f'../data/tweets_final/*_{lang}.csv')
    for filepath in filepaths:
        df = pd.read_csv(filepath, lineterminator='\n')
        # Missing values are passed through untouched: read_csv loads empty fields as
        # NaN, which the translation functions cannot process.
        df['text'] = df['text'].apply(lambda x: translate(x) if not pd.isna(x) else x)
        # Translated locations additionally get surrounding '.' and ' ' characters stripped.
        df['location_trans'] = df['location'].apply(
            lambda x: translate(x).strip('. ') if not pd.isna(x) else x
        )
        # index=False: otherwise every read/write cycle adds another "Unnamed: 0" column.
        df.to_csv(filepath, index=False)



  df = pd.read_csv(filepath,lineterminator='\n')


### Concatenate translated tweets per keyword

### Get the latitude and longitude of tweets' location

In [189]:
# Load the world-cities table and preview the rows whose 'iso2' country code is 'ES'.
df = pd.read_csv('../data/worldcities/worldcities.csv')
df[df['iso2']=='ES']

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
102,Madrid,Madrid,40.4167,-3.7167,Spain,ES,ESP,Madrid,primary,6006000.0,1724616994
159,Barcelona,Barcelona,41.3825,2.1769,Spain,ES,ESP,Catalonia,admin,4735000.0,1724594040
986,Valencia,Valencia,39.4700,-0.3764,Spain,ES,ESP,Valencia,admin,789744.0,1724981666
1094,Sevilla,Sevilla,37.3900,-5.9900,Spain,ES,ESP,Andalusia,admin,684234.0,1724991838
1107,Zaragoza,Zaragoza,41.6500,-0.8833,Spain,ES,ESP,Aragon,admin,675301.0,1724907391
...,...,...,...,...,...,...,...,...,...,...,...
41871,Corral de Almaguer,Corral de Almaguer,39.7594,-3.1650,Spain,ES,ESP,Castille-La Mancha,,5068.0,1724842542
41888,Castilblanco de los Arroyos,Castilblanco de los Arroyos,37.6750,-5.9889,Spain,ES,ESP,Andalusia,,5059.0,1724547049
41892,Candeleda,Candeleda,40.1558,-5.2408,Spain,ES,ESP,Castille-Leon,,5049.0,1724648245
42411,Candás,Candas,43.5486,-5.7897,Spain,ES,ESP,Asturias,,7193.0,1724687353


### Notes
In the English tweets for Ukraine from 4/17/2022, the 0.1 quantile of Friends Count is 22 and the 0.1 quantile of Follower Count is 6. To remove potential fake accounts, drop the tweets from users who have at most 20 friends, at most 5 followers, and no account description (`acctdesc`): `df[~((df['Friends Count'] <= 20) & (df['Follower Count'] <= 5) & df['acctdesc'].isnull())]`

In [None]:
# Step 1: discard the columns that are no longer needed.
tweets_us_north = tweets_us_north.drop(
    columns=['username', 'acctdesc', 'Friends Count', 'Account Verified', 'Favorite Count', 'hashtags']
)
# Step 2: give the remaining columns short snake_case names.
tweets_us_north = tweets_us_north.rename(
    columns={
        'Tweet Id': 'id',
        'Tweet Date': 'date',
        'Follower Count': 'follower_count',
        'Retweets': 'retweets',
        'Tweet Text': 'text',
    }
)