## Set up

In [None]:
import os
import re
import string
import pandas as pd
import numpy as np

In [None]:
# Load IPython's autoreload extension (makes the %autoreload magic available).
%load_ext autoreload

In [None]:
%reload_ext autoreload

In [None]:
# Show the kernel's current working directory.
os.getcwd()

In [None]:
# Display settings: show long sequences in full and never truncate column text
# (tweet texts need to be read in their entirety).
pd.set_option('display.max_seq_items', 10000)
pd.options.display.max_colwidth = None

Load environment variables

In [None]:
# Directory holding the raw data, taken from the DIR_DATA_RAW environment variable.
# Indexing os.environ (instead of .get) makes a missing variable fail right here
# with a KeyError naming it, rather than later as a TypeError inside os.path.join.
DATA_PATH = os.environ["DIR_DATA_RAW"]

In [None]:
# Display the raw-data directory to confirm it resolved as expected.
DATA_PATH

In [None]:
# Directory for interim outputs, taken from the DIR_DATA_INTERIM environment variable.
# Indexing os.environ (instead of .get) makes a missing variable fail right here,
# not at the very end of the notebook when the output path is built.
OUTPUT_DIR = os.environ["DIR_DATA_INTERIM"]

Constants

In [None]:
# Base name (without the .csv extension) of the tweets file read from DATA_PATH.
TWEETS_FLNM = "tweet-ids-ANDherd-combinedtokens-020620"

In [None]:
# Columns retained from the raw tweets file; everything else is dropped.
# NOTE(review): 'reweet_id' (sic) and 'user_screen_name.1' are used verbatim as
# column labels further down, so they presumably mirror the headers of the CSV --
# confirm against the file before "correcting" either name.
VARS_TO_KEEP = [
    # tweet
    'created_at',
    'hashtags',
    'favorite_count',
    'id',
    'reweet_id',
    'retweet_screen_name',
    'lang',
    'place',
    'possibly_sensitive',
    'retweet_count',
    'text',
    # user
    'user_screen_name',
    'user_followers_count',
    'user_location',
    'user_name',
    'user_screen_name.1',
    'user_time_zone',
]

## Get data

In [None]:
# Read the tweets CSV from the raw-data directory.
# NOTE(review): with default dtypes, an id column that also contains missing
# values is parsed as float64, which cannot hold 19-digit tweet ids exactly.
# reweet_id looks like such a column (it is null-checked later) -- confirm
# whether an explicit dtype is needed.
tweets_raw = pd.read_csv(os.path.join(DATA_PATH, f"{TWEETS_FLNM}.csv"))

In [None]:
# Display the raw frame (the row count is shown in the footer).
tweets_raw
# only 15,465... !
# Were they not hydrated because the tweet was not found, or because of the rate limit of the Twitter API?

In [None]:
# List the column labels of the loaded frame.
tweets_raw.columns

In [None]:
# Keep only the columns listed in VARS_TO_KEEP (all rows).
tweets_raw = tweets_raw.loc[:, VARS_TO_KEEP]

In [None]:
# Peek at the first five rows.
tweets_raw.head(5)

## Explore

- number/% of geolocated tweets
- number/% of english language tweets (other languages to be dropped)
- number/% of uncommented retweets (to be dropped as simply duplicates)

In [None]:
# General look: number of distinct values in each column.
tweets_raw.nunique()

There are 6,938 unique texts.

And 15,465 unique tweets. **WARNING**: this is fewer than the number of dehydrated tweets acquired...

### Uncommented retweets

To be dropped as they are exact duplicates of other tweets already in the dataset.

How to identify them?

- If the original tweet was longer than 140 characters: (1) the tweet is a retweet (`reweet_id`, as the column is spelled in the data, is not `NaN`), and (2) the tweet ends in an ellipsis.
    **Rationale**: when an uncommented retweet's original tweet is longer than 140 characters, the Twitter API returns a truncated `full_text`.  
    
- If the original tweet was not longer than 140 characters: (1) the tweet is a retweet, (2) it starts with `RT`, and (3) its text is an exact duplicate of an existing tweet.

In [None]:
# Candidate truncated retweets: every tweet whose text ends with the ellipsis character.
uncommented_above140_retweets = tweets_raw.loc[tweets_raw["text"].str.endswith("\u2026")]

In [None]:
# Display the tweets whose text ends in an ellipsis.
uncommented_above140_retweets
# 9,888

In [None]:
# How many of the ellipsis-terminated tweets have no reweet_id? Ideally none.
uncommented_above140_retweets["reweet_id"].isnull().value_counts()    # there are 220

In [None]:
# Inspect those 220 rows: text ends in an ellipsis but reweet_id is missing.
uncommented_above140_retweets.loc[uncommented_above140_retweets["reweet_id"].isnull()]

In [None]:
# Rows without a reweet_id are not retweets, so take those 220 out of the
# uncommented-retweet set.
uncommented_above140_retweets = uncommented_above140_retweets.dropna(subset=["reweet_id"])

In [None]:
# Display what remains after dropping the rows with a missing reweet_id.
uncommented_above140_retweets
# 9,668

In [None]:
# First removal list: ids of the truncated (>140 characters) uncommented retweets.
tweets_to_go_ids1 = uncommented_above140_retweets["id"]

**Important NOTE**

The original tweets of these uncommented retweets (longer than 140 characters) may not be present in our dataset as original tweets. However, as per Twitter's policy, uncommented retweets of tweets longer than 140 characters are truncated when accessed via the Stream API, so there is no way to access the full text and we have to exclude them anyway: we cannot reliably analyse the sentiment of incomplete text.

Let's try to identify any other (i.e., up to 140 characters) uncommented retweets.

In [None]:
# Every tweet whose text begins with the "RT @" retweet marker.
retweets = tweets_raw.loc[tweets_raw["text"].str.startswith("RT @")]

In [None]:
# Do all "RT @" tweets carry a reweet_id? Count missing vs. present.
retweets["reweet_id"].isna().value_counts()

In [None]:
# How many of these retweets were not already caught as above-140 uncommented retweets?
retweets.loc[~retweets["id"].isin(uncommented_above140_retweets["id"])]
# 288

In [None]:
# Work out which of these remaining retweets are also uncommented duplicates
# of existing tweets (and are therefore to be dropped).
extra_retweets = retweets.loc[~retweets["id"].isin(uncommented_above140_retweets["id"])]

In [None]:
# Strip the leading "RT @<original-sender>: " marker so that only the retweeted
# text remains in the new 'cleaned_retweet' column.
extra_retweets = extra_retweets.copy()  # independent copy before adding a column
# Match the handle only (\w+). A greedy ".*" runs on to the LAST ": " on the
# line, so any tweet that itself contains ": " would lose the start of its text
# and no longer compare equal to the original tweet.
extra_retweets['cleaned_retweet'] = extra_retweets['text'].str.replace(
    r"^RT @\w+: ", "", regex=True
)

In [None]:
# Compare the original and the cleaned text side by side.
extra_retweets.loc[:, ['text', 'cleaned_retweet']]

In [None]:
# Occurrences of each cleaned retweet text, shown as a table.
extra_retweets['cleaned_retweet'].value_counts().to_frame()

Some (those whose count is > 1) are definitely uncommented retweets of identical original tweets, and are to be removed.

In [None]:
# Keep the first occurrence of each cleaned text, in case the original tweet is
# not present in the dataset (display only -- nothing is reassigned here).
extra_retweets.loc[~extra_retweets.duplicated(subset="cleaned_retweet", keep="first")]

In [None]:
# Second removal list: ids of every repeat (second occurrence onwards) of a cleaned retweet text.
tweets_to_go_ids2 = extra_retweets.loc[extra_retweets.duplicated(subset='cleaned_retweet'), 'id']

In [None]:
# Display the ids of the duplicate retweets flagged for removal.
tweets_to_go_ids2

In [None]:
# What is left once the repeated retweets are excluded?
extra_retweets.loc[~extra_retweets['id'].isin(tweets_to_go_ids2)]

In [None]:
# Could the remaining retweets be copies of original tweets that are in the dataset?
tweets_to_investigate = extra_retweets.loc[~extra_retweets['id'].isin(tweets_to_go_ids2)]

In [None]:
# Cleaned texts of the retweets still under investigation.
tweets_to_investigate_text = tweets_to_investigate['cleaned_retweet']     #128

In [None]:
# Per-column non-null counts of dataset tweets whose text equals one of the cleaned retweet texts.
tweets_raw.loc[tweets_raw['text'].isin(tweets_to_investigate_text)].count()   # 35 are "copies" of original tweets in the dataset

In [None]:
# Occurrences of each cleaned text still under investigation, shown as a table.
tweets_to_investigate_text.value_counts().to_frame()

In [None]:
# Third removal list: ids of the investigated retweets whose cleaned text equals
# the text of a tweet present in the dataset.
tweets_to_go_ids3 = tweets_to_investigate.loc[
    tweets_to_investigate['cleaned_retweet'].isin(
        tweets_raw.loc[tweets_raw['text'].isin(tweets_to_investigate_text), 'text']
    ),
    'id',
]

In [None]:
# Size of the third removal list.
len(tweets_to_go_ids3)   #ok

In [None]:
# Sizes of the first two removal lists, one per line.
print(len(tweets_to_go_ids1), len(tweets_to_go_ids2), sep="\n")

### Let's remove from the dataset all the uncommented retweets that are duplicate of original tweets already in the dataset 

In [None]:
# Number of distinct ids in each removal list, one per line; compare with the
# list lengths to confirm no id is repeated within a list.
print(
    len(set(tweets_to_go_ids1)),
    len(set(tweets_to_go_ids2)),
    len(set(tweets_to_go_ids3)),
    sep="\n",
)
# GOOD :-), each is unique

In [None]:
# Single flat list with every id to remove (the three lists, in order).
tweets_to_go_ids_all = [*tweets_to_go_ids1, *tweets_to_go_ids2, *tweets_to_go_ids3]

In [None]:
# Total number of ids flagged for removal.
len(tweets_to_go_ids_all)

In [None]:
# Row count before removal.
len(tweets_raw) # 15465

In [None]:
# Keep every tweet whose id is not on the combined removal list.
tweets_original = tweets_raw.loc[~tweets_raw['id'].isin(tweets_to_go_ids_all)]

In [None]:
# Row count after removal.
len(tweets_original)

### English vs. non-English tweets

Non-English tweets will be dropped as they are not part of our population of interest.


In [None]:
# Number of tweet ids per language code, as a one-column frame indexed by 'lang'.
count_by_lang = tweets_original.groupby('lang')['id'].count().to_frame()

In [None]:
# Give the count column a descriptive name.
count_by_lang = count_by_lang.rename(columns={'id': 'count_ids'})

In [None]:
# Share of tweets per language, rounded to three decimals.
count_by_lang['prop'] = (count_by_lang['count_ids'] / count_by_lang['count_ids'].sum()).round(3)

In [None]:
# Display the per-language counts and proportions.
count_by_lang

What are the "und" (undefined) ones?

In [None]:
# Inspect the tweets whose language code is "und".
tweets_original.loc[tweets_original["lang"].eq("und")]

They seem to be "garbage" so let's exclude them from further investigation.

### Exclude all non-English tweets

In [None]:
# English-language tweets only.
tweets_original_en = tweets_original.loc[tweets_original['lang'].eq('en')]

In [None]:
# Number of English tweets retained.
len(tweets_original_en)

### Geolocation

We'll look at the geolocation of users.

In [None]:
# Frequency of each 'place' value, with missing values counted too.
tweets_original_en['place'].value_counts(dropna=False)

The large majority are not geolocated.

### Save the data

In [None]:
# Output file name (without extension) and its full path inside OUTPUT_DIR.
output_name = "tweets_original_en"
output_filepath = os.path.join(OUTPUT_DIR, f"{output_name}.csv")

In [None]:
# Write the filtered English tweets to CSV. index=True is the default, spelled
# out here: the row index is written as the first (unnamed) column.
tweets_original_en.to_csv(output_filepath, index=True)