## Make Dataset TOS Compliant

In [1]:
import pandas as pd
import glob
import os

In [2]:
def clean_datasets(input_dir='full_data/', output_file="clean_data/clean_data.csv"):
    """
    Build a TOS-compliant dataset that contains only de-duplicated tweet IDs.

    Every ``*.csv`` file in `input_dir` is read (only its ``id`` column), the
    IDs are concatenated, duplicates are dropped, and the result is written to
    `output_file` as a single-column CSV with the header ``tweet_id``.

    Arguments:

        `input_dir`: Directory holding the full scraped CSV files.
        `output_file`: Path of the CSV to write. Its parent directory is
            created when it does not exist yet.

    Files that cannot be read (missing ``id`` column, empty or malformed file,
    non-integer IDs, ...) are reported on stdout and skipped, so one bad file
    does not abort the whole run.
    """

    id_columns = []

    # Sort so the processing order (and therefore the row order of the output)
    # does not depend on the order the filesystem happens to list files in.
    for filename in sorted(glob.glob(os.path.join(input_dir, '*.csv'))):
        print(filename)
        try:
            # Explicit 64-bit: tweet IDs do not fit in 32 bits, and the width
            # of the builtin `int` dtype is platform-dependent.
            df_file = pd.read_csv(filename, usecols=['id'], dtype={'id': 'int64'})
        except Exception as e:
            # Deliberate best-effort: report the problem and move on.
            print(e)
            continue
        id_columns.append(df_file['id'])

    print('Creating dataframe...')
    if id_columns:
        # Keeping the IDs as int64 Series avoids materialising one Python
        # int object per tweet.
        tweet_ids = pd.concat(id_columns, ignore_index=True)
    else:
        # pd.concat rejects an empty list; still emit a header-only file.
        tweet_ids = pd.Series([], dtype='int64')
    df = pd.DataFrame({'tweet_id': tweet_ids})
    print('Removing duplicates...')
    df = df.drop_duplicates()
    print(len(df))
    print('Saving dataframe...')
    output_dir = os.path.dirname(output_file)
    if output_dir:
        # to_csv does not create missing parent directories.
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, index=False)
    
# Run the cleaning step: reads full_data/*.csv and writes clean_data/clean_data.csv.
clean_datasets()

full_data/congress_20161107-210118.csv
full_data/congress_20161107-210134.csv
full_data/congress_20161108-053855.csv
full_data/congress_20161108-053906.csv
full_data/congress_20161108-132431.csv
full_data/congress_20161108-132444.csv
full_data/congress_20161108-162723.csv
full_data/congress_20161108-162733.csv
full_data/democrat_20161107-210107.csv
full_data/democrat_20161108-053848.csv
full_data/democrat_20161108-132423.csv
full_data/democrat_20161108-162714.csv
full_data/election2016_20161107-210044.csv
full_data/election2016_20161108-053835.csv
full_data/election2016_20161108-132408.csv
full_data/election2016_20161108-162659.csv
full_data/election_20161107-210035.csv
full_data/election_20161108-053622.csv
full_data/election_20161108-132405.csv
full_data/election_20161108-162655.csv
full_data/electionday_20161107-210050.csv
full_data/electionday_20161108-053838.csv
full_data/electionday_20161108-132412.csv
full_data/electionday_20161108-162703.csv
full_data/gop_20161107-210101.csv
fu

In [12]:
import os

def split(filehandler, delimiter=',', row_limit=1000000, 
    output_name_template='clean_data/tweets_%s.csv', output_path='.', keep_headers=True):
    """
    Split a CSV stream into numbered output files of at most `row_limit` rows.

    Arguments:

        `filehandler`: An open text-mode file (or any iterable of lines) with
            the CSV to split. Open it with ``newline=''`` as the csv module
            recommends.
        `delimiter`: Field delimiter used for both reading and writing.
        `row_limit`: The number of data rows you want in each output file.
            1,000,000 by default. The header row is not counted.
        `output_name_template`: A %s-style template for the numbered output
            files; numbering starts at 1.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to treat the first input row as a
            header and repeat it at the top of each output file.

    The first output file is always created, even when the input has no rows.

    Example usage:

        >>> with open('clean_data/clean_data.csv', 'r', newline='') as f:
        ...     split(f)

    """
    # Function-scope import: nothing else in this file brings `csv` into scope.
    import csv

    def open_piece(piece_number):
        # newline='' stops the csv writer's '\r\n' terminator from being
        # translated again, which shows up as blank rows on Windows.
        piece_path = os.path.join(
            output_path,
            output_name_template % piece_number
        )
        return open(piece_path, 'w', newline='')

    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_limit = row_limit
    current_out_file = open_piece(current_piece)
    try:
        current_out_writer = csv.writer(current_out_file, delimiter=delimiter)
        headers = None
        if keep_headers:
            # Default of None: an empty input yields an empty first file
            # instead of a bare StopIteration.
            headers = next(reader, None)
            if headers is not None:
                current_out_writer.writerow(headers)
        for i, row in enumerate(reader):
            if i + 1 > current_limit:
                # The current piece is full: flush and close it before
                # rotating to the next numbered file.
                current_out_file.close()
                current_piece += 1
                current_limit = row_limit * current_piece
                current_out_file = open_piece(current_piece)
                current_out_writer = csv.writer(current_out_file, delimiter=delimiter)
                if headers is not None:
                    current_out_writer.writerow(headers)
            current_out_writer.writerow(row)
    finally:
        # Always release the last (or only) output handle, even on error.
        current_out_file.close()

In [13]:
# Split the de-duplicated ID file into numbered chunks. The context manager
# closes the input handle when splitting finishes; newline='' is what the csv
# module recommends for files handed to csv.reader.
with open('clean_data/clean_data.csv', 'r', newline='') as clean_csv:
    split(clean_csv)