# 1. Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
import re
import gc
from tqdm.notebook import tqdm

from typing import List
import string

import warnings
warnings.filterwarnings("ignore")

# 2. Data Extraction

## 2.1 Creating Empty list for csv filenames
These CSV files will be loaded into a single `pandas.DataFrame`.

In [None]:
csv_collection: List[str] = []

Collect the paths of the CSV files into `csv_collection`.

In [None]:
# Rebuild the list from scratch instead of appending to whatever it already
# holds, so that re-running this cell never duplicates paths (duplicated paths
# would later be loaded twice). The result is sorted because os.walk yields
# entries in an arbitrary, OS-dependent order.
csv_collection = sorted(
    os.path.join(dir_name, filename)
    for dir_name, _, file_names in os.walk('kaggle/input/ukraine-russian-crisis-twitter-dataset-1-2-m-rows')
    for filename in file_names
)

# Preview the first few collected paths, one per line.
print(',\n'.join(csv_collection[:5]))

## 2.2 Loading dataset with significant events

Load the datasets of significant events that could have affected the tweets posted on those days.

In [None]:
most_known_attacks = pd.read_csv('kaggle/input/most-known-attacks-on-ukraine-2022/Most Known Attacks on Ukraine 2022.csv')
most_known_battles = pd.read_csv('kaggle/input/most-known-battles-in-ukraine-2022/Most Known battles in Ukraine 2022.csv')

### Most known attacks dataset example

In [None]:
most_known_attacks.head()

### Most known battles dataset example

In [None]:
most_known_battles.head()

## 2.3 Filtering needed data
Filter `csv_collection`, because our team decided to study only the dates of significant events.

In [None]:
# Every column that carries the date of a significant event.
date_columns = (
    most_known_attacks['Date'],
    most_known_battles['Start_date'],
    most_known_battles['End_date'],
)

# Stringify every value, drop duplicates through a set, and sort the result.
unique_dates = {
    str(value)
    for column in date_columns
    for value in column.values.tolist()
}
significant_dates = sorted(unique_dates)

# Preview the first five dates on one line.
print(', '.join(significant_dates[:5]))

In [None]:
def extract(string: str, pattern: str):
    """Return all non-overlapping matches of ``pattern`` found in ``string``.

    The result follows ``re.findall`` semantics: a list of strings when the
    pattern has at most one capturing group, a list of tuples when it has
    several, and an empty list when nothing matches.
    """
    compiled = re.compile(pattern)
    return compiled.findall(string)

In [None]:
# Two digits, two digits and four digits captured as three groups (used below
# as day, month, year). NOTE(review): the separators are unescaped `.`, so any
# single character is accepted between the parts, not only a literal dot.
pattern_with_dot = r'(\d{2}).(\d{2}).(\d{4})'
# Captures a run of four digits inside a whitespace-free token; the leading
# `\S*` is greedy, so the last four-digit run of the token is the one captured.
pattern_in_path = r'\S*(\d{4})\S*'
# Captures three uppercase letters immediately followed by two digits, right
# after an underscore (presumably file names such as `..._MAR01...`).
alpha_numeric_pattern_in_path = r'\S*\_([A-Z]{3})(\d{2})\S*'

In [None]:
# Scan all significant dates at once: join them with spaces and collect the
# captured groups of every date that matches `pattern_with_dot`.
joined_dates = ' '.join(significant_dates)
significant_days = extract(joined_dates, pattern_with_dot)

# Preview the first five extracted tuples, one per line.
preview_lines = [str(date_parts) for date_parts in significant_days[:5]]
print('\n'.join(preview_lines))

Transform retrieved dates into MMDD format (because the tweets dataset contains files with this format)

In [None]:
# Re-order each (day, month, year) tuple into an 'MMDD' string. Unpacking
# exactly three fields makes an accidental second run of this cell (when the
# items are already 4-character strings) fail loudly; positional indexing would
# silently turn '0103' into '10'.
significant_days = [month + day for day, month, _year in significant_days]

The dataset uses two date formats in its file names: alphanumeric (old version) and numeric (new version). Because the two formats are mixed, we created a utility function that converts alphabetical month abbreviations into month numbers.

In [None]:
def alpha_to_numeric_month(string: str):
    """Translate a three-letter uppercase month abbreviation into 'MM'.

    Parameters
    ----------
    string : str
        One of 'JAN', 'FEB', ..., 'DEC'. The lookup is exact and
        case-sensitive.

    Returns
    -------
    str
        The zero-padded month number, e.g. '03' for 'MAR'.

    Raises
    ------
    ValueError
        If ``string`` is not one of the supported abbreviations.
    """
    abbreviations = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
                     'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')
    # Calendar position (1-based) rendered with two digits gives the MM code.
    month_numbers = {
        abbreviation: f'{position:02d}'
        for position, abbreviation in enumerate(abbreviations, start=1)
    }
    if string in month_numbers:
        return month_numbers[string]
    raise ValueError(f'Unsupported value passed! Expected {month_numbers.keys()}, but {string} got instead.')

Because the file names of the tweets dataset are inconsistent, we have to check each path individually when filtering `csv_collection`.

In [None]:
def _extract_mmdd(path: str):
    """Return the 'MMDD' date encoded in ``path``, or ``None`` if there is none.

    The numeric pattern (four consecutive digits) is tried first; if it does
    not match, the alphanumeric pattern (e.g. 'MAR01') is tried and its month
    abbreviation is converted to digits.

    Raises
    ------
    ValueError
        Propagated from ``alpha_to_numeric_month`` when the alphanumeric
        pattern matches but the three letters are not a month abbreviation.
    """
    numeric_matches = extract(path, pattern_in_path)
    if numeric_matches:
        return numeric_matches[0]

    alpha_matches = extract(path, alpha_numeric_pattern_in_path)
    if alpha_matches:
        extracted_month, extracted_day = alpha_matches[0]
        return alpha_to_numeric_month(extracted_month) + extracted_day

    # Neither naming scheme matched: the path carries no recognisable date.
    return None


# A set gives constant-time membership tests. Paths without a date yield
# `None`, which is never in the set, so they are skipped; the previous
# try/except/finally version fell through to a stale (or undefined)
# `extracted_date` in that situation.
significant_days_lookup = set(significant_days)
filtered_csv_collection = [
    csv for csv in csv_collection
    if _extract_mmdd(csv) in significant_days_lookup
]

## Create and fill dataframe


In [None]:
# Explicit column -> dtype mapping passed as `dtype=types` to `pd.read_csv`
# when the tweet files are loaded. Identifier and counter columns use unsigned
# NumPy integer types; every other listed column is kept as `object`.
# NOTE(review): `np.ushort` is typically 16 bits wide (max 65535) and unsigned
# integer dtypes cannot hold missing values — confirm that `retweetcount` never
# exceeds that range and that the integer columns contain no NaN.
types = {
    'userid': np.uint,
    'username': object,
    'acctdesc': object,
    'location': object,
    'following': np.uintc,
    'followers': np.uintc,
    'totaltweets': np.uint,
    'tweetid': np.uint,
    'retweetcount': np.ushort,
    'text': object,
    'hashtags': object,
    'language': object,
    'coordinates': object,
    'favorite_count': np.uintc,
    'is_retweet': object,
    'original_tweet_id': np.uint,
    'original_tweet_userid': np.uint,
    'original_tweet_username': object,
    'in_reply_to_status_id': np.uint,
    'in_reply_to_user_id': np.uint,
    'in_reply_to_screen_name': object,
    'is_quote_status': object,
    'quoted_status_id': np.uint,
    'quoted_status_userid': np.uint,
    'quoted_status_username': object
}

# Important Note
The dataset contains a very large amount of data: we waited more than 20 minutes to load all the data for the significant dates (approximately 60 dates), which contain approximately 10.5M tweets.

In the latest version of the notebook our team decided to look only at tweets posted on the 1st of March, 2022 — the day of the airstrike on the Kharkiv government building — and on the day before it.

**UPDATE** 
In the 4th version of the notebook the author of the dataset suggested that we take the time difference into account (all times in the dataset are in UTC+0), so we extend the time window of the data we look at.

In [None]:
# Keep only the files whose path names 28 February, 1 March or 2 March.
mar01_tweets = [
    csv for csv in filtered_csv_collection
    if any(day_tag in csv for day_tag in ("MAR02", "MAR01", "FEB28"))
]
if not mar01_tweets:
    # Fail with an actionable message instead of an IndexError from `.pop()`.
    raise ValueError('No tweet files for FEB28, MAR01 or MAR02 found in `filtered_csv_collection`.')

# Read every file, then concatenate once. Calling `pd.concat` inside a loop
# re-copies the accumulated frame on every iteration, and `.pop()` removed the
# first-loaded file from `mar01_tweets` as a side effect. `ignore_index=True`
# replaces the per-file row numbers, which would otherwise repeat across files.
all_tweets = pd.concat(
    (pd.read_csv(csv, compression='gzip', dtype=types) for csv in mar01_tweets),
    ignore_index=True,
)
gc.collect()

## Important Note
If you want to load the full version of the dataset into your Jupyter Notebook, please uncomment the cell below.

**Don't forget to replace `mar01_tweets` variable with `filtered_csv_collection` on previous cell.**

In [None]:
# for csv in tqdm(filtered_csv_collection, desc='Files: '):
#     temp = pd.read_csv(csv, compression='gzip', dtype=types)
#     all_tweets = pd.concat([all_tweets, temp])
#     del temp
#     gc.collect()

In [None]:
all_tweets.info()

In [None]:
all_tweets.isna().sum()

The dataset contains Not a Number (NaN) values. 

# TODO:
- [x] set index to `userid`
- [x] remove `username`
- [x] remove account description (`acctdesc`)
- [x] remove `usercreatedts`
- [x] parse `hashtags`
- [x] remove `coordinates`
- [x] remove `extractedts`
- [ ] parse `location`
- [ ] transform time into Kyiv timezone (UTC+2 in winter and UTC+3 in summer)
- [ ] extend dataset for 2 days as we need to look at tweets before and after missile attack

## Clearing data from dataset

In [None]:
# Index the frame by author id and discard the columns that the analysis does
# not use, in a single chained expression.
columns_to_discard = ['username', 'acctdesc', 'usercreatedts', 'coordinates', 'extractedts']
all_tweets = all_tweets.set_index('userid').drop(columns=columns_to_discard)

## Parsing hashtags

Print an example of hashtags

In [None]:
all_tweets['hashtags'].values[0]

In [None]:
# Collect the value of every `'text': '<tag>'` pair found in the raw hashtag
# string. The pattern is a raw string: `\S` inside a normal string literal is
# an invalid escape sequence. The vectorised `.str.findall` applies
# `re.findall` to each element and leaves missing values as NaN instead of
# raising on them.
all_tweets['hashtags'] = all_tweets['hashtags'].str.findall(r"'text': '(\S*)'")

## Parsing location

## TODO:
- [ ] remove text translation because we found a dataset with alternate city and country names.

In [None]:
all_tweets['location'].values[:100]

In [None]:
all_tweets['location'] = all_tweets['location'].fillna('Other')

## Cleaning tweets

In [None]:
def clean_text(tweet: str) -> List[str]:
    """Split a tweet into word tokens with ASCII punctuation removed.

    Characters listed in ``string.punctuation`` are deleted (not replaced by a
    space, so "don't" becomes "dont"); the remaining text is split on runs of
    non-word characters and empty tokens are discarded.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example a NaN from a missing
        cell) is treated as an empty tweet.

    Returns
    -------
    List[str]
        The word tokens in their original order; empty if there are none.
    """
    if not isinstance(tweet, str):
        return []
    text = "".join(letter for letter in tweet if letter not in string.punctuation)
    # `re.split` yields '' when the text starts or ends with a separator (such
    # as an emoji). The previous filter `word != '' or word is not None` was
    # always true, so those empty tokens were never removed.
    return [word for word in re.split(r'\W+', text) if word]

In [None]:
clean_text('🇺🇦 mirabella! is eating an, apple.')

In [None]:
all_tweets['text'] = all_tweets['text'].apply(lambda tweet: clean_text(tweet))

In [None]:
all_tweets['text'].head()

In [None]:
datacities = pd.read_csv('kaggle/input/geonames-all-cities-with-a-population/geonames-all-cities-with-a-population-1000.csv', sep=';')
dicttowns = dict(zip(datacities['Name'], datacities['Country name EN']))
countries = set(datacities['Country name EN'])

In [None]:
# Keep only the columns needed for location matching. The explicit `.copy()`
# makes the result an independent frame, so the later assignment to its
# 'Alternate Names' column is not a chained assignment on a selection of the
# original (which pandas reports as SettingWithCopyWarning).
datacities = datacities[['Geoname ID', 'Name', 'ASCII Name', 'Alternate Names', 'Country Code', 'Country name EN']].copy()

In [None]:
datacities.head()

In [None]:
datacities[datacities['Name'] == 'Ascea']['Country name EN']

In [None]:
datacities.info()

In [None]:
datacities.columns

In [None]:
datacities['Alternate Names'] = datacities['Alternate Names'].fillna('').apply(lambda x: np.array(x.split(',')))

In [None]:
!pip install jellyfish

In [None]:
from jellyfish import jaro_winkler_similarity as dist

def finder(line: str, threshold: float = 0.77) -> str:
    """Map a free-text location to the best-matching country name.

    ``line`` is compared, using the Jaro-Winkler similarity imported as
    ``dist``, against every country in ``countries`` and every town name in
    ``dicttowns``; a matching town contributes the country it maps to.

    Parameters
    ----------
    line : str
        Location text to resolve. The sentinel 'Other' is returned unchanged.
    threshold : float, optional
        Minimum similarity the best candidate must reach to be accepted.
        Defaults to 0.77, the value that was previously hard-coded.

    Returns
    -------
    str
        The country of the best candidate, or 'Other' when no candidate
        reaches ``threshold`` (including when both lookup tables are empty).
    """
    if line == 'Other':
        return 'Other'

    # Fallback used when a lookup table is empty: it can never pass the threshold.
    no_match = ('Other', float('-inf'))

    def by_score(candidate):
        return candidate[1]

    # Generators avoid materialising one tuple per country/town before `max`.
    country = max(
        ((str(name), dist(line, str(name))) for name in countries),
        key=by_score,
        default=no_match,
    )
    town = max(
        ((str(country_name), dist(line, str(town_name)))
         for town_name, country_name in dicttowns.items()),
        key=by_score,
        default=no_match,
    )
    # On a tie `max` keeps the first maximal item, i.e. the direct country match.
    result = max([country, town], key=by_score)
    return result[0] if result[1] >= threshold else 'Other'

In [None]:
from datetime import datetime

In [None]:
all_tweets[all_tweets['location'] == 'Ukraine']["location"]

In [None]:
# counts = all_tweets["location"].value_counts()
# for location in tqdm(counts.index):
#     if location != 'Other':
#         all_tweets[all_tweets['location'] == location]["location"] = finder(location)

In [None]:
start_time = datetime.now()
print(finder('United Kingdom'))
print(datetime.now() - start_time)

In [None]:
# start_time = datetime.now()
# all_tweets['location'] = all_tweets['location'].apply(finder)
# print(datetime.now() - start_time)

## Plotting

In [None]:
# Tweet counts of the 20 most frequent languages, computed once and reused for
# both the category axis and the bar heights.
top_languages = all_tweets['language'].value_counts()[:20]

plt.figure(figsize=(20, 10))
sns.barplot(x=top_languages.index, y=top_languages)
plt.xticks(rotation=90)
# Log scale keeps the rarer languages visible next to the dominant ones.
plt.yscale("log")
plt.show()

In [None]:
# Share of tweets per language for the 20 most frequent languages; the counts
# are computed once and supply both the wedge sizes and the labels.
language_counts = all_tweets['language'].value_counts()[:20]

plt.figure(figsize=(20, 10))
plt.pie(x=language_counts, labels=language_counts.index)
plt.show()

In [None]:
# Tweet counts of the 100 most frequent locations, drawn as horizontal bars
# (locations on the y axis, counts on a logarithmic x axis).
top_locations = all_tweets['location'].value_counts()[:100]

plt.figure(figsize=(15, 20))
sns.barplot(y=top_locations.index, x=top_locations)
plt.xscale("log")
plt.show()

In [None]:
ua_tweets = all_tweets[all_tweets['language'] == 'uk']
en_tweets = all_tweets[all_tweets['language'] == 'en']
ru_tweets = all_tweets[all_tweets['language'] == 'ru']

In [None]:
en_ua_ru_dataset = [en_tweets, ua_tweets, ru_tweets]

In [None]:
# One horizontal bar chart per language subset: the 100 most frequent
# locations on the y axis, their tweet counts on a logarithmic x axis.
fig, ax = plt.subplots(3, 1, figsize=(20, 55))
for dataset, subplot in zip(en_ua_ru_dataset, ax.flatten()):
    if dataset.empty:
        # `.iat[0]` below would raise IndexError on a subset without tweets.
        subplot.set_title("Language: no tweets")
        continue
    top_locations = dataset['location'].value_counts()[:100]
    sns.barplot(y=top_locations.index, x=top_locations, ax=subplot)
    subplot.set_title(f"Language: {dataset['language'].iat[0]}")
    # The x axis shows counts, so its ticks must not be labelled with location
    # names (the removed `set_xticklabels` call did that); only the 90-degree
    # rotation it requested is kept.
    subplot.tick_params(axis='x', labelrotation=90)
    subplot.set_xscale('log')
plt.show()

In [None]:
all_tweets["tweetcreatedts"] = pd.to_datetime(all_tweets["tweetcreatedts"])
print(all_tweets["tweetcreatedts"])

Let's see how the number of tweets is distributed over time.

In [None]:
# Number of tweets in each one-hour bucket of `tweetcreatedts`. The count
# column gets an explicit name; `reset_index()` without one leaves it as the
# integer column 0, which also becomes the axis label of the plot.
grouped_by_time = (
    all_tweets[["tweetcreatedts"]]
    .groupby(pd.Grouper(key="tweetcreatedts", freq='H'))
    .size()
    .reset_index(name="tweet_count")
)

plt.figure(figsize=(20, 15))
sns.barplot(data=grouped_by_time, y="tweetcreatedts", x="tweet_count")
plt.show()