## Preprocessing of tweets

### Import packages

In [1]:
import pandas as pd
import glob
import re
from tqdm.auto import tqdm
from transformers import AutoModelWithLMHead, AutoTokenizer,AutoModelForSeq2SeqLM, pipeline
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# pd.set_option('display.max_colwidth', None)

### Regex patterns

In [3]:
pattern_filename = r'tweets_[a-z]*_[a-z]{2}'
pattern_keyword = r'_([a-z]+)_[a-z]{2}.csv'
pattern_lang = r'_([a-z]{2}).csv'
# pattern to remove url, \n, \xxx, #hastag, @user, and \
pattern_sub = r'https://t.co/\w{9,10}|\\n|\\x\w{2}|#\w+|@\w+|\\'

### Functions

In [436]:
# Number of tweets per keyword and language
def stats_keyword_lang(filepath):
    keyword = []
    lang = []
    nrow = []
    nrow_non_null = []
    for filepath in tqdm(glob.glob(filepath)):
        temp_df = pd.read_csv(filepath, lineterminator= '\n', encoding= 'latin-1')
        keyword += [re.search(pattern_keyword, filepath).group(1)]
        lang += [re.search(pattern_lang, filepath).group(1)]
        nrow += [temp_df.shape[0]]
        nrow_non_null += [temp_df[~temp_df['text'].isna()].shape[0]]

    df = pd.DataFrame({
        'keyword':keyword,
        'lang':lang,
        'nrow':nrow,
        'nrow_non_null':nrow_non_null
    })
    df['pct_non_null'] = df['nrow_non_null']/df['nrow']*100
    return df

### Concatenate csv per keyword and language

In [596]:
# Select tweets for four keywords
keywords = ['ukraine', 'russia', 'zelenskyy', 'putin']
langs = ['fr', 'es', 'pt']
for keyword in keywords:
    for lang in langs:
        df = pd.DataFrame()
        for filepath in tqdm(glob.glob(f'../data/tweets/tweets_{keyword}/tweets_{keyword}_{lang}/*')):
            df_temp = pd.read_csv(filepath, lineterminator='\n', encoding= 'latin-1')
            df_temp = df_temp.dropna(subset=['location'])  # remove tweets with NaN in location
            df_temp = df_temp[:500] # select 500 tweets per day, keyword, language
            df = pd.concat([df, df_temp]).sort_values('date')
        df = df.dropna(subset = ['tweet_id', 'text']).drop_duplicates('text')  # remove tweets with NaN in tweet_id or text and drop duplicated text
        df.to_csv(f'../data/tweets_final/tweets_{keyword}_{lang}.csv', index=False)
        

100%|███████████████████████████████████████████| 67/67 [00:03<00:00, 20.12it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 27.06it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 26.55it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 30.82it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 27.27it/s]
100%|███████████████████████████████████████████| 67/67 [00:02<00:00, 23.02it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 26.40it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 26.20it/s]
100%|███████████████████████████████████████████| 51/51 [00:01<00:00, 27.66it/s]
100%|███████████████████████████████████████████| 51/51 [00:02<00:00, 24.80it/s]
100%|███████████████████████████████████████████| 65/65 [00:02<00:00, 27.93it/s]
100%|███████████████████████████████████████████| 51/51 [00:00<00:00, 53.01it/s]
100%|███████████████████████

In [597]:
# Number of tweets per keyword and language
filepath = '../data/tweets_final/*'
df = stats_keyword_lang(filepath)

100%|███████████████████████████████████████████| 20/20 [00:01<00:00, 10.69it/s]


In [598]:
df.sort_values('nrow_non_null', ascending = False)

Unnamed: 0,keyword,lang,nrow,nrow_non_null,pct_non_null
9,russia,en,23308,23308,100.0
16,putin,en,23027,23027,100.0
1,ukraine,en,22690,22690,100.0
3,russia,fr,22454,22454,100.0
13,russia,de,22450,22450,100.0
0,ukraine,fr,22442,22442,100.0
12,putin,fr,22439,22439,100.0
4,putin,de,22432,22432,100.0
18,ukraine,de,22431,22431,100.0
17,ukraine,pt,22422,22422,100.0


In [599]:
df.groupby('keyword').sum().sort_values('nrow_non_null', ascending = False)

Unnamed: 0_level_0,nrow,nrow_non_null,pct_non_null
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
russia,112932,112932,500.0
putin,112238,112238,500.0
ukraine,110782,110782,500.0
zelenskyy,60717,60717,500.0


In [600]:
df.groupby('lang').sum().sort_values('nrow_non_null', ascending = False)

Unnamed: 0_level_0,nrow,nrow_non_null,pct_non_null
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
en,90807,90807,400.0
fr,85212,85212,400.0
de,81461,81461,400.0
es,70556,70556,400.0
pt,68633,68633,400.0


In [601]:
df[df['lang']!='en']['nrow_non_null'].sum()

305862

### Clean tweets
Remove b and '', url, \n, \xxx, #hastag, @user, and \

In [602]:
for filepath in tqdm(glob.glob('../data/tweets_final/*')):
    df = pd.read_csv(filepath,lineterminator='\n', encoding= 'latin-1')
    df['text'] = df['text'].apply(lambda x: re.sub(pattern_sub, '', repr(str(x).strip("b'")).strip("'")))
    df.to_csv(filepath, index=False)

100%|███████████████████████████████████████████| 20/20 [00:08<00:00,  2.30it/s]


In [603]:
# Number of tweets per keyword and language
filepath = '../data/tweets_final/*'
df = stats_keyword_lang(filepath)

100%|███████████████████████████████████████████| 20/20 [00:01<00:00, 10.45it/s]


In [604]:
df.sort_values('nrow_non_null', ascending = False)

Unnamed: 0,keyword,lang,nrow,nrow_non_null,pct_non_null
9,russia,en,23308,23308,100.0
16,putin,en,23027,23027,100.0
1,ukraine,en,22690,22690,100.0
3,russia,fr,22454,22454,100.0
13,russia,de,22450,22450,100.0
0,ukraine,fr,22442,22442,100.0
12,putin,fr,22439,22439,100.0
4,putin,de,22432,22432,100.0
18,ukraine,de,22431,22431,100.0
17,ukraine,pt,22422,22422,100.0


In [605]:
df.groupby('keyword').sum().sort_values('nrow_non_null', ascending = False)

Unnamed: 0_level_0,nrow,nrow_non_null,pct_non_null
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
russia,112932,112932,500.0
putin,112238,112238,500.0
ukraine,110782,110782,500.0
zelenskyy,60717,60717,500.0


In [606]:
df[df['lang']!='en']['nrow_non_null'].sum()

305862

### Select subset of tweets and delete potential fake accounts

In [608]:
#delete potential fake accounts
keywords = ['ukraine', 'russia', 'zelenskyy', 'putin']
for keyword in keywords:
    for filepath in tqdm(glob.glob(f'../data/tweets_final/tweets_{keyword}*')):
        df = pd.read_csv(filepath,lineterminator='\n', encoding= 'latin-1')    
        # Select tweets before 2022-04-16
        df['date'] = pd.to_datetime(df['date'], errors='coerce', format='%Y-%m-%d %H:%M:%S')
        df = df[df['date']<='2022-04-16']
        # Delete potential fake accounts
        df['friends_ount'] = pd.to_numeric(df['friends_ount'], errors= 'coerce').fillna(0)
        df['followers_ount'] = pd.to_numeric(df['followers_ount'], errors= 'coerce').fillna(0) 
        df = df[~((df['friends_ount'] <= 20)&( df['followers_ount'] <= 5)& df['acct_desc'].isnull())]
        filepath_new = re.sub('tweets_final', 'tweets_translated', filepath)
        df.to_csv(filepath_new, index=False)

100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  6.27it/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  5.74it/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  9.90it/s]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00,  5.94it/s]


In [609]:
# Number of tweets per keyword and language
filepath = '../data/tweets_translated/*'
df = stats_keyword_lang(filepath)

100%|███████████████████████████████████████████| 20/20 [00:00<00:00, 55.95it/s]


In [610]:
df.groupby('lang').sum().sort_values('nrow_non_null', ascending = False)

Unnamed: 0_level_0,nrow,nrow_non_null,pct_non_null
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
en,14347,14347,400.0
de,11465,11465,400.0
fr,11305,11305,400.0
es,9895,9895,400.0
pt,8946,8946,400.0


In [611]:
df[df['lang']!='en']['nrow_non_null'].sum()

41611

### Translation of tweets to English

In [628]:
# Pretrained models
model_es_en = AutoModelWithLMHead.from_pretrained("Helsinki-NLP/opus-mt-es-en")
model_pt_en = AutoModelForSeq2SeqLM.from_pretrained("unicamp-dl/translation-pt-en-t5")
model_fr_en = AutoModelWithLMHead.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
model_de_en = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-de-en")

# Pretrained tokenizers
tokenizer_es_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-es-en")
tokenizer_pt_en = AutoTokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5")
tokenizer_fr_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
tokenizer_de_en = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")

# Translation pipelines
pipeline_es_en = pipeline("translation_es_to_en", model=model_es_en, tokenizer=tokenizer_es_en)
pipeline_pt_en = pipeline("text2text-generation", model=model_pt_en, tokenizer=tokenizer_pt_en)
pipeline_fr_en = pipeline("translation_fr_to_en", model=model_fr_en, tokenizer=tokenizer_fr_en)
pipeline_de_en = pipeline("translation_de_to_en", model=model_de_en, tokenizer=tokenizer_de_en)

def translate_es_en(text):
    translated_text = pipeline_es_en(text, max_length=400)[0]['translation_text']
    return translated_text
def translate_pt_en(text):
    translated_text = pipeline_pt_en(text, max_length=400)[0]['generated_text']
    return translated_text
def translate_fr_en(text):
    translated_text = pipeline_fr_en(text, max_length=400)[0]['translation_text']
    return translated_text
def translate_de_en(text):
    translated_text = pipeline_de_en(text, max_length=400)[0]['translation_text']
    return translated_text



In [627]:
langs = ['es', 'pt', 'fr', 'de']
for lang in langs:
    filepaths = glob.glob(f'../data/tweets_translated/*_{lang}.csv')
    for filepath in tqdm(filepaths):
        print(filepath)
        df = pd.read_csv(filepath,lineterminator='\n', encoding= 'latin-1')
        if lang == 'es':
            df['text'] = df['text'].apply(translate_es_en)
            df['location'] = df['location'].apply(lambda x: translate_es_en(x).strip('. ') if not pd.isna(x) else x)
        elif lang == 'pt':
            df['text'] = df['text'].apply(translate_pt_en)
            df['location'] = df['location'].apply(lambda x: translate_pt_en(x).strip('. ') if not pd.isna(x) else x)
        elif lang == 'fr':
            df['text'] = df['text'].apply(translate_fr_en)
            df['location'] = df['location'].apply(lambda x: translate_fr_en(x).strip('. ') if not pd.isna(x) else x)
        else:
            df['text'] = df['text'].apply(translate_de_en)
            df['location'] = df['location'].apply(lambda x: translate_de_en(x).strip('. ') if not pd.isna(x) else x)
        df.to_csv(filepath, index=False)

  0%|                                                     | 0/4 [00:00<?, ?it/s]

../data/tweets_translated/tweets_putin_pt.csv


 25%|█████████                           | 1/4 [10:16:31<30:49:34, 36991.40s/it]

../data/tweets_translated/tweets_zelenskyy_pt.csv


 50%|██████████████████▌                  | 2/4 [11:04:04<9:23:40, 16910.16s/it]

../data/tweets_translated/tweets_russia_pt.csv


 75%|███████████████████████████▊         | 3/4 [21:03:16<7:06:45, 25605.17s/it]

../data/tweets_translated/tweets_ukraine_pt.csv


100%|███████████████████████████████████████| 4/4 [32:02:43<00:00, 28840.99s/it]


In [437]:
# Number of tweets per keyword and language
filepath = '../data/tweets_translated/*'
df = stats_keyword_lang(filepath)

100%|███████████████████████████████████████████| 70/70 [01:03<00:00,  1.10it/s]


In [None]:
df.groupby('keyword').sum().sort_values('nrow_non_null', ascending = False)

### Concatenate translated tweets per keyword

In [28]:
# Concatenate translated tweets per keywords
keywords = ['ukraine', 'russia', 'zelenskyy', 'putin']
for keyword in keywords:
    df = pd.DataFrame()
    for filepath in glob.glob(f'../data/tweets_translated/tweets_{keyword}_*.csv'):
        df_temp = pd.read_csv(filepath, lineterminator='\n', encoding= 'latin-1')
        df = pd.concat([df, df_temp]).sort_values('date')
    df.to_csv(f'../data/tweets_translated/tweets_{keyword}_translated.csv', index=False)

### Combine English tweets with non-English tweets

In [35]:
col_to_keep = ['tweet_id', 'acct_desc', 'date', 'location', 'friends_ount','followers_ount', 'text', 'compound']
for keyword in keywords:
    df = pd.read_csv(f'../data/tweets_en/tweets_{keyword}_en.csv', usecols=col_to_keep, lineterminator='\n', encoding='latin-1')
    df2 = pd.read_csv(f'../data/tweets_translated/tweets_{keyword}_translated.csv', lineterminator='\n', encoding='latin-1')
    pd.concat([df, df2]).to_csv(f'../data/tweets_en/tweets_{keyword}_en.csv', index = False)