# User-defined variables

In [1]:
# User-tunable settings: file names, directories and reporting limits.

# Directory holding the per-user tweet files. Paths are built by plain string
# concatenation (TWEETS_DIRECTORY + "<uid>.json.gz"), so keep the trailing slash.
# Two files are expected per user: "<uid>.json.gz" and "<uid>.json.gz.json".
TWEETS_DIRECTORY = "./users-new/"

# Name of the CSV file containing personality scores for each user.
# NOTE(review): the file-definitions cell only reads this file's column labels
# and converts each one to an integer user id -- confirm that layout is intended.
TRAIN_LABEL_FILE = "./valid_uids.csv"

# Upper bound on how many files of each type are listed in the Details table.
# Found and missing files are capped separately: a value of 10 means up to
# 10 failures AND up to 10 successes may be reported per file type.
MAX_FILES_LISTED_PER_TYPE = 5

# Imports

In [2]:
import sys
import os
import re
import gzip
import json
import pandas as pd

from IPython.display import HTML, display
from decimal import Decimal

# File Definitions

In [3]:
# Directories the pipeline is expected to have. The tweets directory is taken
# from the config cell so that changing TWEETS_DIRECTORY also changes this check.
directories = ['./vectorized/', './filtered/', TWEETS_DIRECTORY, './prediction/']

# Stand-alone files (not per-user) that are expected to exist.
files = ['./svm.model', './random_forest.model']

# The user ids are taken from the COLUMN LABELS of the CSV (iterating a
# DataFrame yields its column labels, which is what the original loop relied on).
# Each label is parsed through Decimal before truncating to int.
# NOTE(review): presumably the file is a single header row of uids, possibly
# written in decimal/scientific notation -- confirm against the file producer.
uids = [int(Decimal(label)) for label in pd.read_csv(TRAIN_LABEL_FILE).columns]

# Per-user files to look for, as pairs of [list of expected paths, note for display].
# One path is generated per uid for every file type.
tweetLists = [[[TWEETS_DIRECTORY + str(uid) + '.json.gz' for uid in uids], 'Scraped Tweets'],
              [[TWEETS_DIRECTORY + str(uid) + '.json.gz.json' for uid in uids], 'Metadata about scraping'],
              [["./vectorized/" + str(uid) + '.csv' for uid in uids], 'Vectorized tweets'],
              [["./filtered/" + str(uid) + '.csv' for uid in uids], 'Filtering tweets'],
              [["./prediction/" + str(uid) + '.csv' for uid in uids], 'Personality predictions']
             ]


# Gathering results

In [4]:
# Details table: one row per checked path, as [path, exists (bool), note].
# The first row is the header.
results = [["<b>File Name</b>", "<b>Exists</b>", "<b>Notes</b>"]]

# Summary table: one row per file type, as [type, number found, number missing].
fileStatTable = [["<b>File Type</b>", "<b># Found</b>", "<b># Missing</b>"]]

# --- Directories: every entry is listed in the details (no cap). ---
statListDir = ['Directory', 0, 0]
for d in directories:
    exists = os.path.exists(d)
    if exists:
        statListDir[1] += 1
    else:
        statListDir[2] += 1
    results.append([d, exists, 'Directory'])
fileStatTable.append(statListDir)

# --- Stand-alone files: every entry is listed in the details (no cap). ---
statListFile = ['Misc. File', 0, 0]
for f in files:
    # Check the file currently being iterated. `d` must not be used here: it is
    # the leftover loop variable from the directory loop above.
    exists = os.path.isfile(f)
    if exists:
        statListFile[1] += 1
    else:
        statListFile[2] += 1
    results.append([f, exists, 'Misc. File'])
fileStatTable.append(statListFile)

# --- Per-user files: counted in full, but only the first
# MAX_FILES_LISTED_PER_TYPE found and the first MAX_FILES_LISTED_PER_TYPE
# missing paths of each type are added to the details table. ---
for listPair in tweetLists:
    numMissing = 0
    numFound = 0
    # Each warning is printed at most once per file type.
    printedErrorM = False
    printedErrorF = False
    for expectedFile in listPair[0]:
        exists = os.path.exists(expectedFile)
        if exists:
            numFound += 1
            # Strictly greater-than, so exactly MAX_FILES_LISTED_PER_TYPE
            # successes are recorded before further ones are skipped.
            if numFound > MAX_FILES_LISTED_PER_TYPE:
                if not printedErrorF:
                    print('Warning: Not recording additional successes for ' + listPair[1] + ' for brevity.')
                    printedErrorF = True
                continue
        else:
            numMissing += 1
            # Same cap, tracked independently for missing files.
            if numMissing > MAX_FILES_LISTED_PER_TYPE:
                if not printedErrorM:
                    print('Warning: Not recording additional failures for ' + listPair[1] + '. Too many missing files.')
                    printedErrorM = True
                continue
        results.append([expectedFile, exists, listPair[1]])
    fileStatTable.append([listPair[1], numFound, numMissing])



# Beautify Results

In [5]:
# Replace the boolean in each row's "Exists" column with a colored Yes/No
# HTML snippet. Rows whose second cell equals neither True nor False
# (such as the header row) are left unchanged.
markup_by_flag = (
    (True, '<font color="green">Yes</font>'),
    (False, '<font color="red">No</font>'),
)
for entry in results:
    for flag, markup in markup_by_flag:
        if entry[1] == flag:
            entry[1] = markup
            break

# Displaying Results

In [6]:
def _html_table(rows):
    """Render a list of rows as an HTML ``<table>`` string.

    Each row is an iterable of cells; every cell is converted with ``str`` and
    wrapped in ``<td>``. Cells are deliberately NOT escaped, because the
    tables contain intentional markup (bold headers, colored Yes/No cells).
    """
    rendered_rows = (
        '<td>{}</td>'.format('</td><td>'.join(str(cell) for cell in row))
        for row in rows
    )
    return '<table><tr>{}</tr></table>'.format('</tr><tr>'.join(rendered_rows))


# Per-type counts of found and missing files.
display(HTML('<h2>Summary:</h2>'))
display(HTML(_html_table(fileStatTable)))

# Individual paths that were checked.
display(HTML('<h2>Details:</h2>'))
display(HTML(_html_table(results)))

0,1,2
File Type,# Found,# Missing
Directory,1,3
Misc. File,0,2
Scraped Tweets,3,30427
Metadata about scraping,4,30426
Vectorized tweets,0,30430
Filtering tweets,0,30430
Personality predictions,0,30430


0,1,2
File Name,Exists,Notes
./vectorized/,No,Directory
./filtered/,No,Directory
./users-new/,Yes,Directory
./prediction/,No,Directory
./svm.model,No,Misc. File
./random_forest.model,No,Misc. File
./users-new/12432922.json.gz,No,Scraped Tweets
./users-new/172018245.json.gz,No,Scraped Tweets
./users-new/929862361.json.gz,No,Scraped Tweets
