In [1]:
#imports

import pandas as pd
import numpy as np
import csv, json

## Part One

### Functions

In [2]:
def clean_csv(input_path, output_path):
    """Clean a CSV whose rows each carry one JSON record and save the result.

    The header row is skipped and the first column of every remaining row is
    parsed as JSON. The records are loaded into a DataFrame, string values
    that are empty or whitespace-only are treated as missing, and every row
    with a missing value is dropped. The per-``target`` row counts are
    printed and the cleaned frame is written to ``output_path``.

    Blank lines (and rows whose first field is empty or whitespace-only) are
    skipped instead of aborting the whole run.

    parameters:
        input_path (str): path of CSV file to be read
        output_path (str): path of cleaned CSV file to write to
    """
    records = []

    # newline='' lets the csv module do its own newline handling, as its
    # documentation asks for; the with-block closes the file on exit.
    with open(input_path, newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header
        for line in reader:
            # csv.reader yields [] for a blank line; nothing to parse there.
            if not line or not line[0].strip():
                continue
            records.append(json.loads(line[0]))  # Read in each line as json

    data = pd.DataFrame(records)
    # Turn empty / whitespace-only strings into NaN so dropna removes them too.
    data = data.replace(r'^\s*$', np.nan, regex=True)
    data = data.dropna(how='any', axis=0)
    print(data.groupby("target").count())
    # The (post-drop) index is still written as the first column, as before.
    data.to_csv(output_path)
    
    
# Clean the raw take-home dataset and save it for the prediction step.
clean_csv(
    input_path='./take_home_dataset.csv',
    output_path='./cleaned-dataset.csv',
)

                         text
target                       
airline_corpus            644
brown_university_corpus   644


## Part Two

In [3]:
#imports 

from indico import IndicoClient, IndicoConfig
from indico.queries import ModelGroupPredict, JobStatus

### Functions

In [4]:
def get_predictions(dataset, model_id, host="try.indico.io",
                    api_token_path="./indico_api_token.txt"):
    """Run the samples through an Indico model and return the top label of each.

    parameters:
        dataset (list): test data to be read into model
        model_id (int): represents which model to use
        host (str): Indico host to connect to (defaults to the host this
            notebook was written against)
        api_token_path (str): path of the file holding the Indico API token
    returns:
        predictions (list): predicted classifications generated by model;
            for every entry of the job result, the key with the highest value
    """
    my_config = IndicoConfig(host=host, api_token_path=api_token_path)
    client = IndicoClient(config=my_config)

    # predict on the model
    job = client.call(
        ModelGroupPredict(
            model_id=model_id,
            data=dataset,
            load=False
        )
    )
    # retrieve your prediction results
    # NOTE(review): wait=True presumably blocks until the job has finished;
    # a failed job may leave .result unset -- confirm against the client docs.
    job_result = client.call(JobStatus(id=job.id, wait=True)).result

    # Each entry is used as a mapping of label -> score; keep the best label.
    return [max(pred, key=pred.get) for pred in job_result]

In [5]:
def check_accuracy(truth, predictions):
    """Return the fraction of predictions that match the true labels.

    Entries are paired by position, so ``truth`` may be a list or a pandas
    Series with any index (indexing a Series with ``truth[i]`` would look up
    the *label* ``i`` rather than the i-th element).

    parameters:
        truth (list): correct classifications provided in original dataset
        predictions (list): predicted classifications generated by model
    returns:
        accuracy of predictions (float); 0.0 when there is nothing to score
    raises:
        ValueError: if truth and predictions differ in length
    """
    total = len(truth)
    if total != len(predictions):
        raise ValueError(
            "truth and predictions must have the same length "
            "(got {} and {})".format(total, len(predictions))
        )
    if total == 0:
        # Avoid ZeroDivisionError on empty input.
        return 0.0

    correct_entries = sum(
        1 for expected, predicted in zip(truth, predictions)
        if expected == predicted
    )
    return correct_entries / total

### Script

In [6]:
# Score the cleaned dataset with the hosted model and report the accuracy.
MODEL_ID = 2103  # model id handed to get_predictions
REPORT_TEMPLATE = 'Model predictions are {:.2%} accurate'

input_data = pd.read_csv('./cleaned-dataset.csv')
texts = input_data['text'].tolist()
predictions = get_predictions(texts, MODEL_ID)
percent_correct = check_accuracy(input_data['target'], predictions)
print(REPORT_TEMPLATE.format(percent_correct))

Model predictions are 98.52% accurate
