# Sampling.ipynb

### This notebook can be used to extract a random sample from a set of Tweets or to randomly split a set of Tweets into two parts.

Author: Erik Puijk <br>
Date  : February 3, 2022

In [32]:
import json
import math
import random
from datetime import datetime
import pandas as pd

In [33]:
def read_tweets(path):
    """ Read the Tweets from a given text file and return them as parsed JSON.

    Parameters:
        path: location of a UTF-8 text file holding one JSON document
              (presumably a list of Tweet objects -- the code only takes
              its length).

    Returns:
        The parsed JSON content, or an empty list when the file cannot be
        opened or read.

    Raises:
        json.JSONDecodeError: if the file content is not valid JSON.
    """

    # Fall back to an empty list (not an empty string) so that the return
    # type stays list-like for the callers even when reading fails.
    content = []

    try:
        # JSON is exchanged as UTF-8; do not rely on the platform default
        # encoding, which would garble non-ASCII Tweet text on some systems.
        with open(path, 'r', encoding='utf-8') as f:
            content = json.load(f)
    except IOError:
        print("I/O error")

    print("Total Tweets read: %s" % (len(content)))

    return content

In [36]:
# Load the Tweet collection that the sampling cells below operate on.
tweets = read_tweets("samples/2. coding/gold_standard.txt")

Total Tweets read: 467


In [18]:
def random_split(lst, sample_ratio):
    """ Randomly split a given list into two lists according to a given ratio and return the lists.

    Parameters:
        lst: indexable sequence of items to split.
        sample_ratio: fraction (between 0 and 1, inclusive) of the items that
                      goes into the first list; the size is rounded up.

    Returns:
        A tuple (sample, rest). `sample` holds the randomly drawn items in
        random order, `rest` holds all remaining items in their original
        order. Every item of `lst` ends up in exactly one of the two lists.

    Raises:
        ValueError: if sample_ratio lies outside the range [0, 1].
    """

    if not 0 <= sample_ratio <= 1:
        raise ValueError("sample_ratio must be between 0 and 1, got %r" % (sample_ratio,))

    sample_size = math.ceil(len(lst) * sample_ratio)

    # Draw positions instead of values: filtering the remainder by value
    # ("item not in sample") would also drop duplicates of a sampled item and
    # costs O(n * k) comparisons. Sampling indices keeps the split exact and
    # also works for unhashable items such as dicts.
    picked = random.sample(range(len(lst)), sample_size)
    picked_lookup = set(picked)

    sample = [lst[i] for i in picked]
    rest = [item for i, item in enumerate(lst) if i not in picked_lookup]

    return sample, rest

In [19]:
# Optional (disabled): split the Tweets into a 10% sample and the remainder.
#sample, rest = random_split(tweets, 0.1)

In [20]:
# Optional (disabled): show the sizes of the two parts produced by random_split.
#print(len(sample))
#print(len(rest))

467
4201


In [37]:
def random_sample(lst, sample_size):
    """ Draw a given number of Tweets at random from a given list.

    Returns the drawn items as a new list. When more items are requested
    than the list contains, an error message is printed and an empty list
    is returned instead.
    """

    # Guard clause: reject requests that cannot be satisfied.
    if sample_size > len(lst):
        print("Error: sample size exceeds list size.")
        return []

    return random.sample(lst, sample_size)

In [38]:
# Draw 50 Tweets at random; random_sample returns [] if fewer than 50 were loaded.
sample = random_sample(tweets, 50)

In [23]:
def write_csv(tweets):
    """ Write the Tweets from a given list to a preset text file in CSV.

    The file is written to "samples/<YYYYmmdd_HHMMSS>_sample.csv", named after
    the current local time; the "samples" directory must already exist.
    The records are flattened into columns with pandas.json_normalize.

    Parameters:
        tweets: list of Tweet records (dicts) to export.
    """

    now = datetime.now()
    path = "samples/" + now.strftime("%Y%m%d_%H%M%S") + "_sample.csv"

    df = pd.json_normalize(tweets)

    # Set id as first column. An empty list (e.g. the result of a failed
    # sampling step) or records without an "id" field yield no such column;
    # skip the reordering then instead of failing with a KeyError.
    if "id" in df.columns:
        id_col = df.pop("id")
        df.insert(0, "id", id_col)

    df.to_csv(path, index=False)

In [29]:
def write_json(tweets):
    """ Dump a given list of Tweets as JSON into a timestamped text file.

    The target is "samples/<YYYYmmdd_HHMMSS>_sample.txt", named after the
    current local time. If the file cannot be opened or written, "I/O error"
    is printed and the function returns normally.
    """

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = f"samples/{stamp}_sample.txt"

    try:
        with open(target, 'w') as out_file:
            json.dump(tweets, out_file)
    except IOError:
        print("I/O error")

In [39]:
# Export the drawn sample to a timestamped CSV file in the samples/ directory.
write_csv(sample)