In [1]:
import pandas as pd 

# Read the tweet export into a DataFrame and display its (rows, columns) shape.
ukrainian_tweets = pd.read_csv(filepath_or_buffer='data/ukrainian_only/en_tweets_first_half.csv')
ukrainian_tweets.shape

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Drop rows with empty text, keep only English tweets, drop duplicate texts, and lowercase the text

In [115]:
# Build the cleaned frame in one method chain so every step returns a new object.
# Assigning a column onto a filtered / de-duplicated slice (the previous form)
# can emit pandas' SettingWithCopyWarning.
ukrainian_tweets = (
    ukrainian_tweets
    .dropna(subset=['text'])                        # drop rows without tweet text
    .loc[lambda frame: frame['language'] == 'en']   # keep rows tagged as English
    .drop_duplicates(subset=['text'])               # duplicates are compared before lowercasing
    .assign(text=lambda frame: frame['text'].str.lower())
)
ukrainian_tweets.shape

(21281, 30)

In [116]:
### Remove links 

import re

def remove_links(text : str):
    """Return ``text`` with every run of non-whitespace characters containing ``http:`` or ``https:`` removed.

    Only the matched run is deleted; the whitespace around it is left in place.
    """
    link_token = re.compile(r"\S*https?:\S*")
    return link_token.sub("", text)

In [117]:
# Strip link tokens from every tweet; the helper is passed directly rather than through a wrapper lambda.
ukrainian_tweets['text'] = ukrainian_tweets['text'].apply(remove_links)

In [118]:
# Delete each '#', '@' or '&' together with the run of non-whitespace characters that follows it
# (hashtags, @-mentions, '&...' references).
ukrainian_tweets['text'] = ukrainian_tweets['text'].map(
    lambda tweet: re.sub(r'[#@&][\S]+', '', str(tweet))
)

In [119]:
ukrainian_tweets.text

0        dear vaccine advocate\n\ndo take the covid19 m...
2        animal shelter dogs and cats, we need your hel...
3        welcome to our shelter!\nlocated in ukraine, k...
5        👇 good news you may have missed: first  shipme...
6        opinion: the ukraine war is also being fought ...
                               ...                        
47986    russian t72 tank hits a ukrainian mine / date ...
47987    turks, putin's hand-picked  ''mediators'', con...
47989    everything as usual, just another russian gove...
47990                                 twisted firestarter!
47993    when people say the us was in better shape whi...
Name: text, Length: 21281, dtype: object

### Remove hashtags, @-mentions, and `&`-prefixed tokens (e.g. character references) by replacing them with an empty string

In [120]:
# ukrainian_tweets['text'] = ukrainian_tweets.text.apply(lambda x: re.sub(r'#[\S]+','', str(x)))
# ukrainian_tweets['text'] = ukrainian_tweets.text.apply(lambda x: re.sub(r'@[\S]+','', str(x)))

In [121]:
ukrainian_tweets.text

0        dear vaccine advocate\n\ndo take the covid19 m...
2        animal shelter dogs and cats, we need your hel...
3        welcome to our shelter!\nlocated in ukraine, k...
5        👇 good news you may have missed: first  shipme...
6        opinion: the ukraine war is also being fought ...
                               ...                        
47986    russian t72 tank hits a ukrainian mine / date ...
47987    turks, putin's hand-picked  ''mediators'', con...
47989    everything as usual, just another russian gove...
47990                                 twisted firestarter!
47993    when people say the us was in better shape whi...
Name: text, Length: 21281, dtype: object

### remove emoji

In [122]:
try:
    import emoji
except ModuleNotFoundError:
    !python -m pip install emoji --upgrade
    import emoji
    
def give_emoji_free_text(text):
    """Return ``text`` after passing it through ``emoji.replace_emoji`` with an empty replacement."""
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

In [123]:
# Start the processed column from the cleaned text with emoji removed.
ukrainian_tweets['text_processed'] = ukrainian_tweets['text'].apply(give_emoji_free_text)

In [124]:
ukrainian_tweets['text_processed']

0        dear vaccine advocate\n\ndo take the covid19 m...
2        animal shelter dogs and cats, we need your hel...
3        welcome to our shelter!\nlocated in ukraine, k...
5         good news you may have missed: first  shipmen...
6        opinion: the ukraine war is also being fought ...
                               ...                        
47986    russian t72 tank hits a ukrainian mine / date ...
47987    turks, putin's hand-picked  ''mediators'', con...
47989    everything as usual, just another russian gove...
47990                                 twisted firestarter!
47993    when people say the us was in better shape whi...
Name: text_processed, Length: 21281, dtype: object

### Remove stopwords and punctuation

In [125]:
from nltk.corpus import stopwords

In [126]:
# Re-import the corpus reader under a private alias so this cell can be re-run:
# the public name `stopwords` is rebound below from the NLTK corpus reader to a
# plain list, after which `stopwords.words(...)` would raise AttributeError.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = _stopwords_corpus.words('english')

In [127]:
import re

def remove_stopwords_and_punctuation(text : str, stopword_list=None):
    """Drop stopwords from ``text`` and strip punctuation from the remaining words.

    The text is split on whitespace. Each token is checked against the
    stopword list in two forms: with only its leading/trailing punctuation
    trimmed (so a contraction such as "don't" still matches its list entry),
    and with all punctuation removed (so "the," matches "the"). Stripping
    punctuation before the lookup, as was done previously, turned "don't" into
    "dont", which no longer matched and therefore survived.

    Matching is case-sensitive.

    Args:
        text: Input string.
        stopword_list: Iterable of stopwords to remove. When omitted, the
            module-level ``stopwords`` is used.

    Returns:
        The surviving tokens, with every character that is neither a word
        character nor whitespace removed, joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stopwords
    stop_set = set(stopword_list)  # O(1) membership tests

    kept_words = []
    for token in text.split():
        bare = re.sub(r'[^\w\s]', '', token)
        if not bare:
            continue  # token consisted of punctuation only
        # Trim punctuation at the edges but keep inner apostrophes; map the
        # typographic apostrophe to ASCII so "don’t" matches "don't".
        trimmed = re.sub(r'^[^\w\s]+|[^\w\s]+$', '', token).replace('’', "'")
        if trimmed in stop_set or bare in stop_set:
            continue
        kept_words.append(bare)
    return ' '.join(kept_words)

In [128]:
# Remove stopwords and punctuation from every processed tweet.
ukrainian_tweets['text_processed'] = ukrainian_tweets['text_processed'].map(remove_stopwords_and_punctuation)

In [129]:
ukrainian_tweets['text_processed']

0        dear vaccine advocate take covid19 mrna shot b...
2        animal shelter dogs cats need help raising fun...
3        welcome shelter located ukraine kyiv shelter n...
5        good news may missed first shipment wheat oper...
6             opinion ukraine war also fought language cnn
                               ...                        
47986    russian t72 tank hits ukrainian mine date unknown
47987    turks putins handpicked mediators continue thu...
47989    everything usual another russian government pe...
47990                                  twisted firestarter
47993    people say us better shape trump office l remi...
Name: text_processed, Length: 21281, dtype: object

In [130]:
# Keep rows whose processed text is not missing.
# NOTE(review): remove_stopwords_and_punctuation returns ' '.join(...), i.e. always a
# string, so tweets reduced to '' are not removed by this filter; confirm whether
# empty strings should be dropped as well.
ukrainian_tweets = ukrainian_tweets[ukrainian_tweets['text_processed'].notna()]

### remove english characters ?

In [131]:
# def remove_english_characters(text):
#     return re.sub(r'[a-zA-Z]', '', text)

In [132]:
# ukrainian_tweets['text_processed'] = ukrainian_tweets['text_processed'].apply(remove_english_characters)

In [133]:
ukrainian_tweets['text_processed']

0        dear vaccine advocate take covid19 mrna shot b...
2        animal shelter dogs cats need help raising fun...
3        welcome shelter located ukraine kyiv shelter n...
5        good news may missed first shipment wheat oper...
6             opinion ukraine war also fought language cnn
                               ...                        
47986    russian t72 tank hits ukrainian mine date unknown
47987    turks putins handpicked mediators continue thu...
47989    everything usual another russian government pe...
47990                                  twisted firestarter
47993    people say us better shape trump office l remi...
Name: text_processed, Length: 21281, dtype: object

### remove numbers and remove extra spaces

In [134]:
def remove_numbers(text):
    """Return ``text`` with every run of digit characters deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

In [135]:
def remove_extra_spaces(text):
    """Delete underscores, collapse whitespace runs to one space, and trim the ends.

    Underscores are removed first. Removing them after the whitespace has been
    collapsed (the previous order) could leave a double space or a
    leading/trailing space behind, e.g. "a _ b" became "a  b".

    Args:
        text: Input string.

    Returns:
        The normalised string.
    """
    without_underscores = text.replace('_', '')
    return re.sub(r'\s+', ' ', without_underscores).strip()

In [136]:
# Delete digit runs from every processed tweet.
ukrainian_tweets['text_processed'] = ukrainian_tweets['text_processed'].map(remove_numbers)

In [137]:
ukrainian_tweets['text_processed'] 

0        dear vaccine advocate take covid mrna shot boo...
2        animal shelter dogs cats need help raising fun...
3        welcome shelter located ukraine kyiv shelter n...
5        good news may missed first shipment wheat oper...
6             opinion ukraine war also fought language cnn
                               ...                        
47986      russian t tank hits ukrainian mine date unknown
47987    turks putins handpicked mediators continue thu...
47989    everything usual another russian government pe...
47990                                  twisted firestarter
47993    people say us better shape trump office l remi...
Name: text_processed, Length: 21281, dtype: object

In [138]:
# Normalise whitespace (and remove underscores) in every processed tweet.
ukrainian_tweets['text_processed'] = ukrainian_tweets['text_processed'].map(remove_extra_spaces)

In [139]:
ukrainian_tweets['text_processed']

0        dear vaccine advocate take covid mrna shot boo...
2        animal shelter dogs cats need help raising fun...
3        welcome shelter located ukraine kyiv shelter n...
5        good news may missed first shipment wheat oper...
6             opinion ukraine war also fought language cnn
                               ...                        
47986      russian t tank hits ukrainian mine date unknown
47987    turks putins handpicked mediators continue thu...
47989    everything usual another russian government pe...
47990                                  twisted firestarter
47993    people say us better shape trump office l remi...
Name: text_processed, Length: 21281, dtype: object

### lemmatization

In [24]:
try :
    import pymorphy3
except ModuleNotFoundError:
    ! pip install pymorphy3
    ! pip install pymorphy3-dicts-uk
    import pymorphy3
except Exception as e:
    print(f"Error occured during importing pymorphy {e}")
    

Collecting pymorphy3
  Obtaining dependency information for pymorphy3 from https://files.pythonhosted.org/packages/ee/53/862f7b7f3e488e5420bebd5cf59362cb175463ad3cfddd61ade15a738dc7/pymorphy3-2.0.1-py3-none-any.whl.metadata
  Downloading pymorphy3-2.0.1-py3-none-any.whl.metadata (1.8 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Obtaining dependency information for pymorphy3-dicts-ru from https://files.pythonhosted.org/packages/b0/67/469e9e52d046863f5959928794d3067d455a77f580bf4a662630a43eb426/pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.1-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m965.5 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hDownloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [30]:
# Create a morphological analyzer for Ukrainian
morph = pymorphy3.MorphAnalyzer(lang='uk')


def lemmatize_text_uk(text):
    """Replace each whitespace-separated word with the normal form of its first ``morph.parse`` result.

    NOTE(review): ``morph`` is created with lang='uk', while the tweets
    processed in this notebook were filtered to language == 'en'; confirm
    this analyzer is the intended one.
    """
    normal_forms = []
    for token in text.split():
        first_parse = morph.parse(token)[0]
        normal_forms.append(first_parse.normal_form)
    return ' '.join(normal_forms)

In [31]:
# Lemmatize every processed tweet word by word.
ukrainian_tweets['text_processed'] = ukrainian_tweets['text_processed'].map(lemmatize_text_uk)

In [24]:
ukrainian_tweets['text_processed'] 

0        dear vaccine advocate take covid mrna shot boo...
2        animal shelter dogs cats need help raising fun...
3        welcome shelter located ukraine kyiv shelter n...
5        good news may missed first shipment wheat oper...
6        opinion ukraine war also fought language cnn h...
                               ...                        
47986    russian t tank hits ukrainian mine date unknow...
47987    turks putins handpicked mediators continue thu...
47989    everything usual another russian government pe...
47990                                  twisted firestarter
47993    people say us better shape trump office l remi...
Name: text_processed, Length: 23010, dtype: object

In [79]:
# Make sure the output directory exists before writing; to_csv does not create
# missing parent folders, so a fresh checkout would otherwise fail here.
import os

os.makedirs('./data/processed', exist_ok=True)
ukrainian_tweets.to_csv('./data/processed/en_tweets_processed.csv',index=False)

In [113]:
# Sanity check: list rows whose text still contains the literal substring 'https'.
ukrainian_tweets[ukrainian_tweets['text'].str.contains('https', regex=False)]

Unnamed: 0.1,Unnamed: 0,userid,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetid,...,original_tweet_username,in_reply_to_status_id,in_reply_to_user_id,in_reply_to_screen_name,is_quote_status,quoted_status_id,quoted_status_userid,quoted_status_username,extractedts,text_processed


In [114]:
ukrainian_tweets.text

0        dear vaccine advocate\n\ndo take the covid19 m...
2        animal shelter dogs and cats, we need your hel...
3        welcome to our shelter!\nlocated in ukraine, k...
5        👇 good news you may have missed: first  shipme...
6        opinion: the ukraine war is also being fought ...
                               ...                        
47986    russian t72 tank hits a ukrainian mine / date ...
47987    turks, putin's hand-picked  ''mediators'', con...
47989    everything as usual, just another russian gove...
47990                                 twisted firestarter!
47993    when people say the us was in better shape whi...
Name: text, Length: 23010, dtype: object