In [None]:
import json
import pandas as pd
from tqdm import tqdm
from pathlib import Path 
from ast import literal_eval

## Tweet Collection
Tweets are collected using two methods: 
1. Twitter REST API using [Tweepy](https://github.com/tweepy/tweepy)
2. Scraping using [Scweet](https://github.com/Altimis/Scweet)

### Twitter API
The Twitter API only allows searching for tweets from the last week, so the tweets between Dec 3rd and Dec 10th are collected with the API. Each tweet is saved into a file named tweet_id.json under after_dec3/.
Below, the downloaded tweets are collected and exported as a single csv file.

In [None]:
# Flatten every saved API tweet (one JSON file per tweet) into a single row and
# export all rows as one CSV next to the download directory.
tweets = []
# Subset of the nested `user` object that is kept; each field is stored as a
# top-level `user_<field>` column.
user_fields = ['id_str', 'screen_name', 'followers_count', 'friends_count', 'favourites_count', 'created_at']
data_dir = Path('../data/after_dec3')
# Sorting makes the row order reproducible (iterdir() yields entries in
# arbitrary order) and materializes the listing so tqdm can display a total.
for path in tqdm(sorted(data_dir.iterdir())):
    if not (path.is_file() and path.name.endswith('json')):
        continue
    # The context manager closes each handle promptly instead of leaving one
    # open file per tweet for the garbage collector to clean up.
    with open(path) as fh:
        tweet_data = json.load(fh)
    user = tweet_data['user']
    # Keep every top-level field except the nested user object ...
    row = {k: v for k, v in tweet_data.items() if k != 'user'}
    # ... and add the selected user fields under a `user_` prefix.
    row.update({f'user_{field}': user[field] for field in user_fields})
    tweets.append(row)
pd.DataFrame(tweets).to_csv(data_dir.parent / 'tweets_after_dec3.csv')

### Scraping

With scraping, we collected tweets from October 25th to December 3rd. Scraping is performed for each day, and the daily tweets are saved as separate csv files under before_dec3/.
Below, the daily tweets are merged into a single dataframe and exported as one csv file.


In [None]:
# Load every daily Scweet export and stack them into one CSV next to the
# download directory.
data_dir = Path('../data/before_dec3')
# Collect the daily files up front, sorted by path: iterdir() yields entries in
# arbitrary order, so sorting keeps the merged row order reproducible and lets
# tqdm display a total.
daily_files = sorted(
    path for path in data_dir.iterdir()
    if path.is_file() and path.name.startswith('tweets_scweet')
)
tweets = [pd.read_csv(path) for path in tqdm(daily_files)]

# pd.concat raises ValueError if no daily file was found.
pd.concat(tweets).to_csv(data_dir.parent / 'tweets_till_dec3.csv')

## Merge Tweets

The files containing tweets are standardized and merged. 

In [None]:
data_dir = Path('../data')

# Source column -> standardized column, one mapping per collector. Only the
# columns listed here are kept in the merged output.
df1_cols = {'UserScreenName': 'UserName', 'Embedded_text': 'TweetText', 'Timestamp': 'Timestamp', 'Tweet URL': 'Tweet URL'}
df2_cols = {'created_at': 'Timestamp', 'id_str': 'TweetID', 'full_text': 'TweetText', 'entities': 'Entities', 'user_id_str': 'UserID', 'user_screen_name': 'UserName'}

# Scraped tweets: rename to the standardized names, keep only those columns and
# tag the rows with their collector.
df1 = pd.read_csv(data_dir / 'tweets_till_dec3.csv')
df1 = df1.rename(columns=df1_cols)[list(df1_cols.values())].assign(collector='scraper')

# API tweets: read the ID columns as strings. Parsed as integers they would be
# upcast to float64 once merged with the scraper rows (which have no
# TweetID/UserID and therefore contribute NaN), and float64 cannot represent
# integers above 2**53 exactly, so large 64-bit IDs would be silently altered.
df2 = pd.read_csv(data_dir / 'tweets_after_dec3.csv', dtype={'id_str': str, 'user_id_str': str})
df2 = df2.rename(columns=df2_cols)[list(df2_cols.values())].assign(collector='api')

# Stack both sources; columns missing from one source are filled with NaN and
# the result gets a fresh 0..n-1 index.
all_tweets = pd.concat([df1, df2], ignore_index=True)
all_tweets.to_csv(data_dir / 'tweets.csv')