<a href="https://colab.research.google.com/github/gummy-brain/Leader-Appeal-in-Times-of-War/blob/main/Ukraine_war_Putin_Tweets_dataset_prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ukraine Conflict Twitter Dataset (53.87M tweets)

2022-10-23

In [None]:
# First some preparations to keep our code clean.

# imports
import os
import csv
import gzip
import zipfile

from multiprocessing import Pool, cpu_count
from typing import Tuple

from tqdm import tqdm
import pandas as pd


In [None]:
# Colab-only helper module used to reach the user's Google Drive from this notebook.
from google.colab import drive

# Mount Google Drive under /content/gdrive so DATA_FOLDER-style paths below that
# directory resolve. force_remount=True requests a fresh mount even if one exists.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# constants
# Directory (below the Google Drive mount point) that holds the tweet archive files.
DATA_FOLDER = '/content/gdrive/MyDrive/UkraineTweetsArchive'

# Column name -> dtype mapping passed to pandas.read_csv as `dtype`.
D_TYPES = dict(
    userid='int64',
    username='str',
    acctdesc='str',
    location='str',
    following='int64',
    followers='int64',
    totaltweets='int64',
    usercreatedts='str',
    tweetid='int64',
    tweetcreatedts='str',
    retweetcount='int64',
    text='str',
    hashtags='str',
    language='category',
    coordinates='str',
    favorite_count='int64',
    extractedts='str',
)

# Timestamp columns passed to pandas.read_csv as `parse_dates`.
DATE_COLS = [
    'usercreatedts',
    'tweetcreatedts',
    'extractedts',
]

# functions
def load_file(input: tuple) -> pd.DataFrame:
    """Load one gzip-compressed CSV from DATA_FOLDER and post-process it.

    The single packed argument lets this function be used directly as a
    multiprocessing worker.

    Args:
        input: A ``(file_name, processing_function)`` pair. ``file_name`` is
            relative to ``DATA_FOLDER``; ``processing_function`` is called
            with the loaded DataFrame.

    Returns:
        Whatever ``processing_function`` returns for the loaded DataFrame.
    """
    file_name, transform = input
    archive_path = DATA_FOLDER + "/" + file_name

    with gzip.open(archive_path, 'r') as handle:
        frame = pd.read_csv(
            handle,
            index_col=0,
            dtype=D_TYPES,
            parse_dates=DATE_COLS,
            encoding='utf-8',
            quoting=csv.QUOTE_ALL,
        )

    return transform(frame)


# Hashtag fragments (lower-case, matched as regex alternatives anywhere in the text)
# that mark a tweet as pro-Russia / pro-Putin.
_PRO_RUSSIA_PATTERN = (
    'istandwithrussia|standwithrussia|istandwithputin|standwithputin|'
    'isupportputin|supportputin|isupportrussia|supportrussia'
)

# Fragments that mark a tweet as pro-Ukraine; tweets containing any of these are
# excluded from the pro-Russia set.
_PRO_UKRAINE_PATTERN = (
    'standwithzelensky|istandwithzelensky|istandwithukraine|standwithukraine|'
    'stoprussianow|stoprussia|stopputin|stopputinnow|supportukraine|isupportukraine'
)


def pre_process(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Filter a raw tweet frame and extract a random sample plus pro-Russia tweets.

    Steps:
      1. Drop rows flagged as retweets when an ``is_retweet`` column exists.
      2. Keep only rows whose ``language`` equals ``'en'``.
      3. Lower-case the ``text`` column (on a copy; the caller's frame is untouched).
      4. Draw a 1% random sample of the remaining rows.
      5. Select rows whose text contains a pro-Russia fragment and does NOT
         contain any pro-Ukraine fragment.

    Args:
        df: Tweets with at least ``language`` and ``text`` columns.

    Returns:
        ``(sample, pro_russia_tweets)`` - the 1% sample and the filtered
        pro-Russia rows, both taken from the English, lower-cased frame.
    """
    if 'is_retweet' in df.columns:
        # `!= True` (rather than `~`) keeps rows where the flag is missing.
        df = df[df['is_retweet'] != True]  # noqa: E712

    # Explicit copy so the column assignment below does not write to a slice
    # (this is what triggered pandas' SettingWithCopyWarning).
    df = df[df['language'] == 'en'].copy()
    df['text'] = df['text'].str.lower()

    sample = df.sample(frac=0.01)

    # na=False: rows with missing text match neither pattern instead of
    # producing NaN in the mask, which boolean indexing rejects.
    is_pro_russia = df['text'].str.contains(_PRO_RUSSIA_PATTERN, na=False)
    is_pro_ukraine = df['text'].str.contains(_PRO_UKRAINE_PATTERN, na=False)

    # Apply the exclusion to the frame that is actually returned.
    pro_russia_tweets = df[is_pro_russia & ~is_pro_ukraine]
    return sample, pro_russia_tweets

In [None]:
# List every entry in the archive folder; each one is loaded as a data file.
files = os.listdir(DATA_FOLDER)

# Pair each file name with the pre-processing function. The function travels with
# the file name so a different pre-processing step can be plugged in later.
# NOTE: the name `input` shadows the builtin of the same name from here on.
input = [(file_name, pre_process) for file_name in files]

# Load and pre-process the files in parallel, one worker process per CPU.
with Pool(processes=cpu_count()) as pool:
    results = pool.map(load_file, input)

# Each result is a (sample, pro_russia_tweets) pair; split them into two sequences.
en_tweet_dfs, pro_russia_tweets_dfs = zip(*results)

# Merge the per-file frames into one frame each.
sample = pd.concat(en_tweet_dfs)
pro_russia_tweets = pd.concat(pro_russia_tweets_dfs)

# Persist both merged frames as pickles in the current working directory.
sample.to_pickle('en_tweets_sample.pkl')
pro_russia_tweets.to_pickle('pro_russia_tweets.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return list(map(*args))
