In [91]:
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from deep_translator import GoogleTranslator
from transformers import pipeline
from geopy.geocoders import Nominatim
import preprocessor as p

import os
from colorama import Fore
import re

In [151]:
tqdm.pandas()
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)

data_loc = '../data/'
tweets_loc = 'TWEETS/'
tweets_processed_loc = 'TWEETS_PROCESSED/'

# Reparations on data2h

In [None]:
with open(data_loc+'data2h_refugees.OLDcsv', 'rb') as f:
    doc = f.read()
print(doc.decode('utf-8'))

In [None]:
data2h_refugees_REPAIRED = [[e for e in l.split(';')] for l in data2h_refugees.split('\n')][1:]
data2h_refugees_REPAIRED = pd.DataFrame(data2h_refugees_REPAIRED[1:], columns=data2h_refugees_REPAIRED[0])
data2h_refugees_REPAIRED["individuals"] = data2h_refugees_REPAIRED.individuals.apply(int)
data2h_refugees_REPAIRED.to_csv(data_loc+'data2h_refugees_REPAIRED.csv', index=False)
data2h_refugees_REPAIRED

# New datasets

In [None]:
files = os.listdir(data_loc)
files = [f for f in files if '.csv' in f]

# _files = os.listdir('TWEETS/')
# _files = ['TWEETS/'+f for f in _files if '.csv' in f]
# files += _files

files

In [None]:
def load_csv(path):
    key = path.split('.csv')[0].split('/')[-1]
    print(f"Loading {key}...")
    if '.gzip' in path:
        df = pd.read_csv(path, encoding='utf-8', compression='gzip')
    else :
        df = pd.read_csv(path)
    print(f"Loaded {key} !")
    return key, df

In [None]:
dfs = [load_csv(data_loc+f) for f in files]
dfs = dict(dfs)

In [None]:
dfs['refugees_data2h_REPAIRED']

# Twitter dataset exploration

In [None]:
tweets_files = [f for f in os.listdir(data_loc+tweets_loc) if '.csv' in f]
tweets_files

## Renaming tweets

In [None]:
translate_months = {
    'FEB':'02',
    'MAR':'03',
}

def rename_tweets(tweets_files):
    results = []
    
    for f in tweets_files:

        # Case 'UkraineCombinedTweetsDeduped_XXXX'
        if f[0] == 'U' and '_' in f:
            date = f.split('_')[1].split('.')[0]
            month = date[:3]
            day = date[3:]
            month = translate_months[month]

            beginning = f.split('_')[0]
            if len(f.split('_')) > 2 : intermediate = beginning = f.split('_')[2]
            else: intermediate = ''
            ending = '.'.join(f.split('_')[1].split('.')[1:])

            final = month+day+'_'+beginning+intermediate+'.'+ending
            results.append((f, final))
        else :
            results.append((f, f))
    
    results = [f for f in results if 'UkraineCombinedTweetsDeduped' in f[1]]
    return results

renaming_dict = dict(rename_tweets(tweets_files))

for o,n in renaming_dict.items():
    os.rename(tweets_loc+o, tweets_loc+n)

## Exploring tweets

In [None]:
tf = tweets_files[0]
tf

In [None]:
df_raw = pd.read_csv(data_loc+tweets_loc+tf, compression='gzip')
df_raw.head()

In [None]:
_to_drop=['Unnamed: 0','acctdesc', 'following', 'usercreatedts', 'coordinates', 'favorite_count']
df = df_raw.drop(columns=_to_drop)
df.set_index('tweetid', inplace=True)

ltest = df.groupby('language').first().reset_index()[['language', 'text']].values

In [None]:
ltest_tr = []

for l, t in ltest:
    try :
        print(Fore.BLACK + '############')
        translator = GoogleTranslator(source=l)
        tr = translator.translate(t)
        print(t)
        print(Fore.BLUE + tr)
        ltest_tr.append((t, tr))
    except Exception as e :
        print(Fore.RED + str(e))

### Emotions

In [None]:
def get_top_label(scores, topn=1, emo_only=True):
    s = scores[0]
    topn = min(topn, len(s))
    s = sorted(s, key=lambda d : d['score'], reverse=True)
    s = s[:topn]
    if emo_only: s = [d['label'] for d in s]
    if topn==1 : s = s[0]
    return s

In [81]:
ttest = ltest_tr[3][1]
model_path = "j-hartmann/emotion-english-distilroberta-base"
emotion_classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, return_all_scores=True)

In [None]:
get_top_label(emotion_classifier(ttest))

'sadness'

### Sentiment

In [None]:
ttest = ltest_tr[3][1]

model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, return_all_scores=True)

In [None]:
get_top_label(sentiment_task(ttest))

### Batch test

In [None]:
def get_translator(l):
    try :
        translator = GoogleTranslator(source=l)
        return (l, translator)
    except :
        return None

In [None]:
def _process_tweet(t):
    return t

In [None]:
def translate_batch(df, l, translator):
    tmp = df.loc[df.language==l, 'text']
    tmp = tmp.apply(_process_tweet)
    tmp = tmp.progress_apply(translator.translate)
    return tmp

In [None]:
def process_tweets(df):
    
    ## Translate
    # Get languages
    print(f"Getting all translators...")
    translators = df.language.unique()
    nlang = len(translators)
    translators = [get_translator(l) for l in translators]
    translators = [t for t in translators if t != None]
    translators = dict(translators)
    print(f"Got {len(translators)} translators ({len(translators)/nlang*100:.2f}%)!")
    
    # Translate for each available language
    for l in []:#translators.keys():
        print(f"\nStarting translation for {l}...")
        tr = translate_batch(df, l, translators[l])
        df.loc[df.language == l, 'translated'] = tr
        print(f"Translation done for {l}!")
        
    # Add english to only use translated column
    df.loc[df.language == 'en', 'translated'] = df.loc[df.language == 'en', 'text']
    
    ## Get Sentiment
    model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    print(f"Starting sentiment analysis using {model_path}...")
    sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, return_all_scores=True)
    
    sen = df.loc[~df.translated.isna(), 'translated'].progress_apply(sentiment_task)
    sen = sen.apply(get_top_label)
    df['sentiment'] = sen
    print(f"Finished sentiment analysis!")
    
    # Get emotion
    
process_tweets(df)

In [None]:
tweets_files[0]

In [None]:
df.to_csv(data_loc+'TWEETS_SENTIMENT/EN_'+tweets_files[0])

In [None]:
df.groupby('language').count()['location'].reset_index().sort_values("location", ascending=False).plot(x='language', y='location', kind='bar', figsize=(17, 4))

## Exploring language distribution in tweets

In [None]:
for f in tweets_files:
    # load tweets
    tmp = pd.read_csv(tweets_loc+f, compression='gzip')
    # plot
    print(f"\n############### {f}")
    tmp.groupby('language').count()['location'].reset_index().sort_values("location", ascending=False).plot(x='language', y='location', kind='bar', figsize=(17, 4))
    plt.show()

## Geocode tests

Using geopy https://pypi.org/project/geopy/

In [6]:
files = [f for f in os.listdir(data_loc + tweets_loc) if 'UkraineCombinedTweets' in f]
print(f"First 3 files in {tweets_loc} folder:")
[print(f) for f in files[:3]]
print()

In [7]:
f = files[0]

In [8]:
# read dataframe
tmp = pd.read_csv(data_loc + tweets_loc + f, compression='gzip')

In [9]:
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
geolocator = Nominatim(user_agent=user_agent)

In [10]:
tmp_en = tmp.loc[(tmp.language == 'en') & (~tmp.location.isna())]
locs = tmp_en.location.unique()
locs

In [None]:
def _extract_country(l) :
    try:
        country = geolocator.geocode(l, language = "en").address.split(',')[-1]
        country = " ".join(country.split())
    except Exception as e :
        if len(str(e)) < 400: print(e)
        else : print(str(e)[:400]+'...')
        country = ''
    # print(l + ' || ' + country)
    return country

countries = [(l, _extract_country(l)) for l in tqdm(locs)]
countries[:30]

In [None]:
for l in locs[:10] :
    loc = geolocator.geocode(l, language='en')
    print(l + "||" + loc.address.split(',')[-1])

# Complete pipeline

1. Extract a subset of all the english tweets.
2. Analyze emotions & sentiment for the subset.
3. Compute the country of origin for the tweet.

In [28]:
# Ratio to define subset
ratio = 0.2

In [30]:
## Load all files
# only containing UkraineCombinedTweets
files = [f for f in os.listdir(data_loc + tweets_loc) if 'UkraineCombinedTweets' in f]
# prints
print(f"# First 3 files in {tweets_loc} folder ({len(files)} files):")
[print(f) for f in files[:3]]
print()

In [83]:
# Getting the emotions & sentiment model
emotion_path = "j-hartmann/emotion-english-distilroberta-base"
emotion_classifier = pipeline("text-classification", model=emotion_path, tokenizer=emotion_path, return_all_scores=True)

sentiment_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_classifier = pipeline("sentiment-analysis", model=sentiment_path, tokenizer=sentiment_path, return_all_scores=True)

In [32]:
f = files[0]

In [158]:
## Define pipeline functions for emotion, sentiment and location retrieval

def get_top_label(scores, topn=1, emo_only=True):
    """
    scores : Output of pipeline(..) function from HuggingFace's transformers package for sentiment/emotion classifier.
    topn : Number of scores to output (if value is greater than max length, it is set to max length).
    emo_only : Remove the scores and keep only the emotions.
    """
    s = scores[0]
    topn = min(topn, len(s))
    s = sorted(s, key=lambda d : d['score'], reverse=True)
    s = s[:topn]
    if emo_only: s = [d['label'] for d in s]
    if topn==1 : s = s[0]
    return s

def _extract_emotion(t, classifier=emotion_classifier):
    """
    Extracts ranked score & emotions.
    """
    score = get_top_label(classifier(t), topn=100, emo_only=False) 
    return score

def _extract_sentiment(t, classifier=sentiment_classifier):
    """
    Extracts ranked score & sentiment (POSITIVE | NEUTRAL | NEGATIVE).
    """
    score = get_top_label(classifier(t), topn=100, emo_only=False) 
    return score    

def _extract_country(l, error_crop = 400, verbose = False) :
    """
    Tries to extract the country from the 'location' field in the tweet. If impossible, sets an empty string.
    Note: Error is printed (cropped to 400 chars.)
    """
    try:
        country = geolocator.geocode(l, language = "en").address.split(',')[-1]
        country = " ".join(country.split())
    except Exception as e :
        if verbose :
            if len(str(e)) < error_crop: print(e)
            else : print(str(e)[:error_crop]+'...')
        country = ''
    # print(l + ' || ' + country)
    return country

def _preprocess_tweet(t) :
    """
    NLP processing for the text.
    Note: tokenization is handled by the models.
    """
    # removes links, mentions, smileys, reserved words, emojis
    r = p.clean(t).strip()
    
    # Matches (?<=#)([A-Z]?[a-z]*) or (?<=[a-z])([A-Z][a-z]*)
    # First case is a lookbehind for #, followed by a detection of a sequence optionally starting with a capital letter.
    # Second case is a lookbehind for [a-z], followed by a detection of a sequence starting with a capital letter.
    # This way we do not separate hashtags with no capital letters in them e.g. #oneforall
    r = re.sub(r'(?<=#)([A-Z]?[a-z]*)|(?<=[a-z])([A-Z][a-z]*)', r' \1\2', r)
    # Replace # by a hashtag
    r = re.sub(r'(# )|( #)', '', r)
    
    return r

In [159]:
def process_df(f):
    print(f"Processing {f}...")
    
    ## Read dataframe
    tmp = pd.read_csv(data_loc + tweets_loc + f, compression='gzip')

    # Get only valid tweets
    tmp_en = tmp.loc[(tmp.language == 'en') & (~tmp.location.isna())]
    print(f"Filtered (only english & non-NaN locatio) size: {tmp_en.shape[0]:,} (Total: {tmp.shape[0]:,}; {tmp_en.shape[0] / tmp.shape[0]*100:.2f}%).")

    ## Get subset
    idx = int(ratio * tmp_en.shape[0])
    sample = tmp_en.sample(frac=1).reset_index(drop=True).iloc[:idx]
    print(f"Sampled {sample.shape[0]:,} lines ({sample.shape[0]/tmp_en.shape[0]*100:.2f}%) from filtered.")
    
    print("Extracting country...")
    sample['country'] = sample.location.progress_apply(_extract_country)
    
    processed_text = sample.text.progress_apply(_preprocess_tweet)
    print("Extracting emotion...")
    sample['emotion'] = processed_text.progress_apply(_extract_emotion)
    print("Extracting sentiment...")
    sample['sentiment'] = processed_text.progress_apply(_extract_sentiment)
    
    # Save df
    path = data_loc + tweets_processed_loc + f
    print(f"Save file to {path}...")
    sample.to_csv(path)
    print(f"#### Done processing {f}!\n")

In [160]:
for f in files:
    process_df(f)

Processing 0227_UkraineCombinedTweetsDeduped.csv.gzip...
Filtered (only english & non-NaN locatio) size: 137,362 (Total: 357,018; 38.47%).
Sampled 27,472 lines (20.00%) from filtered.
Extracting country...


  0%|          | 0/27472 [00:00<?, ?it/s]

KeyboardInterrupt: 