# Sampling.ipynb

### This notebook can be used to extract random samples of a set of Tweets or split a set of Tweets randomly.

Author: Erik Puijk <br>
Date  : February 3, 2022

In [1]:
import json
import math
import os
import random
from datetime import datetime

import pandas as pd

In [2]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return the parsed JSON content.

    path: location of a text file holding a single JSON document.

    Returns the parsed content. If the file cannot be opened or read, an
    "I/O error" message is printed and an empty list is returned, so callers
    always receive a sized collection. Malformed JSON still raises the
    exception thrown by the json module.
    """

    # Fall back to an empty list (not an empty string) so the return type
    # stays consistent with what the sampling functions expect.
    content = []

    try:
        # Decode explicitly as UTF-8 instead of relying on the platform
        # default encoding, which can fail on emoji or accented characters.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    print("Total Tweets read: %s" % (len(content)))

    return content

In [3]:
def random_split(lst, sample_ratio):
    """ Randomly split a given list into two lists according to a given ratio and return the lists.

    lst: indexable sequence of Tweets.
    sample_ratio: fraction of the list that goes into the first returned list;
        its size is ceil(len(lst) * sample_ratio).

    Returns (sample, rest): the randomly chosen elements, and all remaining
    elements in their original order. Together they always contain exactly
    the elements of lst.

    Raises ValueError (from random.sample) if the computed sample size is
    negative or larger than the list.
    """

    sample_size = math.ceil(len(lst) * sample_ratio)

    # Sample positions rather than values. Filtering the remainder by value
    # ("tweet not in sample") would also drop every duplicate of a sampled
    # Tweet, and costs a full scan of the sample per element.
    sample_indices = random.sample(range(len(lst)), sample_size)
    chosen = set(sample_indices)

    sample = [lst[i] for i in sample_indices]
    rest = [tweet for i, tweet in enumerate(lst) if i not in chosen]

    return sample, rest

In [4]:
def random_sample(lst, sample_size):
    """ Draw a given number of random Tweets from a list and return them as a new list.

    When more Tweets are requested than the list contains, an error message
    is printed and an empty list is returned instead.
    """

    # Guard clause: refuse requests larger than the population.
    if sample_size > len(lst):
        print("Error: sample size exceeds list size.")
        return []

    return random.sample(lst, sample_size)

In [5]:
def write_sample(tweets, name, ext):
    """ Write the Tweets from a given list to a text file in CSV or JSON format.

    tweets: list of Tweet objects (JSON-serialisable dicts).
    name: base file name, without extension, inside the "samples/" directory.
    ext: "json" writes samples/<name>.txt, "csv" writes samples/<name>.csv.
        Any other value prints an error and writes nothing.

    The output directory is created when missing. I/O failures are reported
    with an "I/O error" message for both formats instead of raising.
    """

    if ext == "json":
        # JSON output keeps the ".txt" extension used by the source files.
        path = "samples/" + name + ".txt"
    elif ext == "csv":
        path = "samples/" + name + ".csv"
    else:
        print("Error: no extension format specified.")
        return

    try:
        # Make sure the target directory exists so the write cannot fail
        # merely because "samples/" has not been created yet.
        os.makedirs(os.path.dirname(path), exist_ok=True)

        if ext == "json":
            with open(path, 'w', encoding='utf-8') as f:
                json.dump(tweets, f)
        else:
            df = pd.json_normalize(tweets)

            # Set id as first column; skip when there is no such column
            # (e.g. an empty sample), which would otherwise raise KeyError.
            if "id" in df.columns:
                df.insert(0, "id", df.pop("id"))

            df.to_csv(path, index=False)
    except OSError:
        print("I/O error")

In [10]:
# Load the complete Tweet collection
all_tweets = read_tweets('source/tweets_all.txt')

# Reserve 10% of all Tweets as the gold-standard set; keep the remainder apart
gold_standard, rest = random_split(all_tweets, 0.1)

# Draw 50 Tweets out of the gold-standard set for inter-coder validation
intercoder_sample = random_sample(gold_standard, 50)

# Report the size of every subset
print(f"All Tweets len: {len(all_tweets)}")
print(f"Gold-standard len: {len(gold_standard)}")
print(f"Rest len: {len(rest)}")
print(f"Intercoder_sample: {len(intercoder_sample)}")

# Store the subsets: CSV for the two coding sets, JSON for gold standard and rest
write_sample(intercoder_sample, "intercoder_sample", "csv")
write_sample(gold_standard, "gold_standard", "csv")
write_sample(gold_standard, "gold_standard", "json")
write_sample(rest, "rest", "json")