# Analyzing Subjects:
<br>
<font size=4>
The Analyzer, along with the Classifier class, provides extensive functionality for analyzing the Cool Neighbors classification data.
</font>

## Aggregating the Classifications

The Analyzer class object requires the extraction and reduction files produced via the Aggregator class. See the Aggregating example for more details.

In [None]:
# Imports
import os
from DataToolkit.Aggregator import Aggregator

# For more details about Aggregating, see the Aggregating example.

# Workflow ID and version of the Launch-0 workflow of the Backyard Worlds: Cool Neighbors project.
workflow_id = 24299
version = 1.6

# Zooniverse's data exports tab produces CSV files with these default names.
classifications_csv = "backyard-worlds-cool-neighbors-classifications.csv"
workflows_csv = "backyard-worlds-cool-neighbors-workflows.csv"

# Directories handed to the Aggregator below.
config_directory = "Config"
extractions_directory = "Extractions"
reductions_directory = "Reductions"

aggregator = Aggregator(
    classifications_csv_filename=classifications_csv,
    workflow_csv_filename=workflows_csv,
    config_directory=config_directory,
    extractions_directory=extractions_directory,
    reductions_directory=reductions_directory,
)

# Paths of the aggregated files expected for this workflow and version.
extraction_file = f"{aggregator.extractions_directory}/question_extractor_workflow_{workflow_id}_V{version}.csv"
reduction_file = f"{aggregator.reductions_directory}/question_reducer_workflow_{workflow_id}_V{version}.csv"

# Aggregation is only run when at least one of the two files is missing.
already_aggregated = os.path.exists(extraction_file) and os.path.exists(reduction_file)

if not already_aggregated:
    print("Aggregating...")
    aggregator.aggregateWorkflow(workflow_id=workflow_id, v=version)
    print("Aggregation complete.")
else:
    print("Aggregated files already exist, skipping aggregation.")

## Creating the Analyzer
The analyzer will be your all-inclusive tool for working with the panoptes_aggregation results, generated via the Aggregator.

In addition to the analyzer's own functionality, it contains a Classifier instance, which provides functionality for computing users' accuracies and for weighting classifications by user accuracy in the candidate selection process.


In [None]:
# Import
from DataToolkit.Analyzer import Analyzer

# Filepaths of the aggregated (extracted and reduced) files.
extracted_file = "Extractions/question_extractor_workflow_24299_V1.6.csv"
reduced_file = "Reductions/question_reducer_workflow_24299_V1.6.csv"

# The subject file is optional but highly recommended: it allows you to work with subjects offline,
# which is generally faster than the online version.
subject_file = "backyard-worlds-cool-neighbors-subjects.csv"

if subject_file is None:
    # Not providing a subjects_file will default the analyzer to being online.
    analyzer = Analyzer(extracted_file, reduced_file)
elif os.path.exists("analyzer.pickle"):
    # An offline analyzer that was already created and saved can be loaded instead of being
    # created again. You cannot load an online analyzer.
    print("Loading Analyzer...")
    analyzer = Analyzer.load()
else:
    # Providing a subjects_file will default the analyzer to being offline.
    print("Creating Analyzer...")
    analyzer = Analyzer(extracted_file, reduced_file, subject_file)

## Getting Analyzer Information

Many different types of information can be extracted from the classification data. Useful examples, grouped by category, are provided below:

### Getting Subjects and Users
Subject IDs, usernames, and user IDs can be retrieved via the Analyzer, as can their panoptes-client object equivalents.

In [None]:
# Import
from unWISE_verse.Spout import Spout

# Retrieve the valid subject IDs from the workflow classifications.
subject_ids = analyzer.getSubjectIDs()
print("Valid Subjects:", *subject_ids[:10], "...\n")

# Retrieve the usernames of the users who have classified.
usernames = analyzer.getUniqueUserIdentifiers(user_identifier="username")
print("Usernames:", *usernames[:10], "...\n")

# Retrieve the user IDs of the users who have classified.
# Logged-out users do not have user IDs, so include_logged_out_users must be False.
user_ids = analyzer.getUniqueUserIdentifiers(user_identifier="user id", include_logged_out_users=False)

# Retrieve the top usernames (two modes: percentile or classification threshold).
top_usernames = analyzer.getTopUsernames(percentile=98, classification_threshold=None)
print(f"Top usernames: {top_usernames}\n")

# Log in to Zooniverse with Spout to access the next two functions.
# You will need to log in to Spout to use these functions or use online mode.
login = Spout.requestLogin()
Spout.loginToZooniverse(login)
print()

# Subject object for a specific subject. Not disabled for offline mode, but you will need to be
# logged in to Spout to get the subject object.
example_subject_id = subject_ids[0]
subject_object = analyzer.getSubject(example_subject_id)
print(f"Subject object for subject {example_subject_id}: {subject_object}\n")

# User object for a specific user. Not disabled for offline mode, but you will need to be
# logged in to Spout to get the user object.
example_username = usernames[0]
user_object = analyzer.getUser(example_username)
print(f"User object for user {example_username}: {user_object}\n")

# User objects of the top users (two modes: percentile or classification threshold).
top_user_objects = analyzer.getTopUsers(percentile=98, classification_threshold=None)
print(f"Top user objects: {top_user_objects}\n")

### Number of Classifications

These functions pertain to getting the total number of classifications of all users, or some specific subset of users.

In [None]:
# Get the total number of classifications in the aggregated files.
print(f"Number of classifications: {analyzer.getTotalClassifications()}\n")

# Get the total number of classifications for the subjects which have at least n classifications.
# n is passed to the call itself so that the printed message always matches the threshold that
# was actually used (previously the call hard-coded 5 regardless of n).
n = 5
print(f"Number of classifications for subjects with at least {n} classifications: {analyzer.getSubsetOfTotalClassifications(minimum_subject_classification_count=n)}\n")

# Get the total number of classifications done by a specific user.
user_classification_count = analyzer.getTotalClassificationsByUser(usernames[0])
print(f"Total classifications by user {usernames[0]}: {user_classification_count}\n")

# Get the total number of classifications done by the top users (two modes: percentile or classification threshold).
top_users_classification_count = analyzer.getTotalClassificationsByTopUsers(classification_threshold=None, percentile=98)
print(f"Total classifications by top users: {top_users_classification_count}\n")

### Classifications
These functions allow you to obtain the classification information from specific subjects and specific users. Classifications are represented as a dictionary with keys: "yes", "no", and "total". These correspond to the binary choice users make in the task question on the Cool Neighbors' classification task.

In [None]:
# Classifications recorded for one specific subject.
example_subject_id = subject_ids[0]
subject_classifications = analyzer.getClassificationsForSubject(example_subject_id)
print(f"Classifications for subject {example_subject_id}: {subject_classifications}\n")

# Classifications made by one specific user.
# A user may have classified many subjects, so these are provided as a Pandas Dataframe.
example_username = usernames[0]
user_classifications = analyzer.getClassificationsByUser(example_username)
print(f"Classifications by user {example_username}: \n{user_classifications}\n")

### Plotting Classifications
Plot classification data in a variety of different and useful ways.

In [None]:
# Classification distribution across all subjects.
total_subject_count = 27800
print("Plotting classification distribution for all valid subjects...\n")
analyzer.plotClassificationDistribution(
    title="Classification Distribution",
    total_subject_count=total_subject_count,
)

# Classifications recorded for one specific subject.
example_subject_id = subject_ids[0]
print(f"Plotting classifications for subject {example_subject_id}...\n")
analyzer.plotClassificationsForSubject(example_subject_id)

# Classifications made by the top users (two modes: percentile or classification threshold).
print("Plotting classifications done by top users...\n")
analyzer.plotTotalClassificationsByTopUsers(percentile=98, classification_threshold=None)

# Timeline of the classifications.
print("Plotting classification timeline...\n")
analyzer.plotClassificationTimeline()

### Classification Times
Plotting information about classification times and statistics related to the classification times of users.

In [None]:
# Time histogram for all classifications/users. Users with too few consecutive classifications
# trigger a warning, so warnings are ignored for this particular call.
print("Plotting time histogram for all classifications...\n")
from DataToolkit.Decorators import ignore_warnings
# Wrapping the plotting method with this decorator ignores the warnings caused by users with
# insufficient consecutive classifications.
plot_all_times_quietly = ignore_warnings(analyzer.plotTimeHistogramForAllClassifications)
plot_all_times_quietly()

# Time histogram for one specific user.
example_username = usernames[0]
print(f"Plotting time histogram for user {example_username}...\n")
analyzer.plotTimeHistogramForUserClassifications(example_username)

# Time statistics (average, standard deviation, median) for that same user.
user_average_time, user_std_time, user_median_time = analyzer.computeTimeStatisticsForUser(example_username)
print(
    f"Average time for user {example_username}: {round(user_average_time, 2)} seconds\n"
    f"Standard deviation: {round(user_std_time, 2)} seconds\n"
    f"Median: {round(user_median_time, 2)} seconds\n"
)


### Subject Dataframes
All csv file information is saved in the Analyzer object as Pandas Dataframes. There are three different csv sources: the subjects file, the extracted file, and the reduced file. 

- The subjects file is the exported subject csv from the Data Exports tab of Zooniverse. It is provided when using the Analyzer in offline mode. However, you can still generate subject dataframes in the online version, as the Analyzer gathers all the metadata information from the subject objects it retrieved upon creation.

- The extracted file contains the individual classifications produced via the Aggregator. It holds information regarding every classification of every subject, along with the user who performed the classification.

- The reduced file contains the combined classifications produced via the Aggregator. It holds the combined information regarding the total number of "yes" classifications and "no" classifications by all the users for each individual subject.

This information is accessible as needed via the following functions:

In [None]:
example_subject_id = subject_ids[0]

# Dataframe for a specific subject, using the "default" dataframe type.
subject_dataframe_from_subject_file = analyzer.getSubjectDataframe(example_subject_id, dataframe_type="default")
print(f"Default Subject dataframe for subject {example_subject_id}: \n{subject_dataframe_from_subject_file}\n")

# Dataframe for the same subject, taken from the extracted file.
subject_dataframe_from_extracted_file = analyzer.getSubjectDataframe(example_subject_id, dataframe_type="extracted")
print(f"Extracted Subject dataframe for subject {example_subject_id}: \n{subject_dataframe_from_extracted_file}\n")

# Dataframe for the same subject, taken from the reduced file.
subject_dataframe_from_reduced_file = analyzer.getSubjectDataframe(example_subject_id, dataframe_type="reduced")
print(f"Reduced Subject dataframe for subject {example_subject_id}: \n{subject_dataframe_from_reduced_file}\n")

# Combine the default dataframes of the first two subjects into one.
subject_dataframe_0 = analyzer.getSubjectDataframe(subject_ids[0], dataframe_type="default")
subject_dataframe_1 = analyzer.getSubjectDataframe(subject_ids[1], dataframe_type="default")
dataframes_to_combine = [subject_dataframe_0, subject_dataframe_1]
combined_subject_dataframe = analyzer.combineSubjectDataframes(dataframes_to_combine)
print(f"Combined subject dataframe: \n{combined_subject_dataframe}\n")

# Write the combined dataframe out to a CSV file.
print("Saving subject dataframe to file...")
analyzer.saveSubjectDataframeToFile(combined_subject_dataframe, "combined_subject_dataframe.csv")
print("Subject dataframe saved. \n")

# Read the combined dataframe back in from that CSV file.
combined_subject_dataframe_from_file = analyzer.loadSubjectDataframeFromFile("combined_subject_dataframe.csv")
print(f"Combined subject dataframe from file: \n{combined_subject_dataframe_from_file}\n")

### Subject Information
Access subject metadata to perform actions on the subjects as needed.

In [None]:
example_subject_id = subject_ids[0]

# Check that the subject exists within the Analyzer.
subject_exists = analyzer.subjectExists(example_subject_id)
print(f"Subject {example_subject_id} exists within the Analyzer: {subject_exists} \n")

# Full metadata for the subject.
subject_metadata = analyzer.getSubjectMetadata(example_subject_id)
print(f"Subject metadata for subject {example_subject_id}: \n{subject_metadata}\n")

# A single metadata field (here "ID") for the subject.
subject_metadata_field = analyzer.getSubjectMetadataField(example_subject_id, "ID")
print(f"Subject metadata field for subject {example_subject_id}: {subject_metadata_field}\n")

# Show the subject in WiseView.
# open_in_browser=True enforces a delay of 10 seconds before finishing to avoid accidentally spamming WiseView.
print(f"Showing subject {example_subject_id} in wise-view...\n")
wise_view_link = analyzer.showSubjectInWiseView(example_subject_id, open_in_browser=True)
print(f"WiseView link for subject {example_subject_id}: {wise_view_link}\n")

# SIMBAD link for the subject.
simbad_link = analyzer.getSimbadLinkForSubject(example_subject_id)
print(f"SIMBAD link for subject {example_subject_id}: {simbad_link}\n")

## Running Queries

The Analyzer object also has built-in functionality, via the Searcher class, for querying both SIMBAD and Gaia using each subject's coordinates and field of view.

In [None]:
# Note: When these queries are run with plot=True in standard Python, the plot is interactive:
# hovering over a point tells you its designation in SIMBAD.

# Subject IDs to run the queries on.
subject_ids = analyzer.getSubjectIDs()
example_subject_id = subject_ids[0]

# SIMBAD query for the subject.
simbad_query = analyzer.getSimbadQueryForSubject(example_subject_id, plot=True)
print(f"SIMBAD query for subject {example_subject_id}: \n{simbad_query}\n")

# Conditional SIMBAD query for the subject.
conditional_simbad_query = analyzer.getConditionalSimbadQueryForSubject(example_subject_id, plot=True)
print(f"Conditional SIMBAD query for subject {example_subject_id}: \n{conditional_simbad_query}\n")

# Does a source exist in SIMBAD within the subject's FOV?
source_exists_in_simbad = analyzer.sourceExistsInSimbadForSubject(example_subject_id)
print(f"Source exists in SIMBAD for subject {example_subject_id}: {source_exists_in_simbad}\n")

# Gaia query for the subject.
gaia_query = analyzer.getGaiaQueryForSubject(example_subject_id, plot=True)
print(f"Gaia query for subject {example_subject_id}: \n{gaia_query}\n")

# Conditional Gaia query for the subject.
conditional_gaia_query = analyzer.getConditionalGaiaQueryForSubject(example_subject_id, plot=True)
print(f"Conditional Gaia query for subject {example_subject_id}: \n{conditional_gaia_query}\n")

# Does a source exist in Gaia within the subject's FOV?
source_exists_in_gaia = analyzer.sourceExistsInGaiaForSubject(example_subject_id)
print(f"Source exists in Gaia for subject {example_subject_id}: {source_exists_in_gaia}\n")

## Finding Candidates
The Analyzer can perform two distinct actions in regards to finding acceptable Cool Neighbors candidates.

Firstly, it can go through all the subjects and determine what subset of the subjects satisfy two conditions:
- Has an acceptance ratio (ratio of yes classifications to the total number of classifications) greater than or equal to the minimum acceptance ratio.
- Has an acceptance threshold (number of yes classifications) greater than or equal to the minimum acceptance threshold.

How these values are determined per subject can be modified such that a weighted version of a user's classifications is taken into account and is incorporated into the overall acceptance ratio and acceptance threshold of each subject. This is not active by default.

All candidates which satisfy these conditions will be put into an acceptable candidates list for examination or for further scrutinizing.

Secondly, it can go through each acceptable candidate and determine whether there is a source within either SIMBAD or Gaia which most likely matches up with the subject. If there is no such source, the subject will be added to a csv file; otherwise, the subject will be disregarded.
- For SIMBAD, the criteria for matching is that there needs to be a source within a field of view (centered on the subject's coordinates) of 120 arcseconds, plus some extra FOV to account for high proper motion objects, which has an otype of any of the following: BD*, BD?, BrownD*, BrownD?, BrownD*_Candidate, or PM*.
- For Gaia, the criteria for matching is that there needs to be a source within a field of view (centered on the subject's coordinates) of 120 arcseconds, plus some extra FOV to account for high proper motion objects, which has a total proper motion of 100 mas/yr or more.

The remaining acceptable candidates (those not found in SIMBAD, those not found in Gaia, and those not found in either database) will be placed into their respective csv files for manual examination.

In [None]:
# Get the subject IDs to examine as candidates.
subject_ids = analyzer.getSubjectIDs()

# Get the subject type for a specific subject.
subject_type = analyzer.getSubjectType(subject_ids[0])
print(f"Subject type for subject {subject_ids[0]}: {subject_type}\n")

# Check if a specific subject is an acceptable candidate.
# The call returns both the verdict and the subject's classifications.
is_acceptable_candidate, subject_classifications = analyzer.checkIfCandidateIsAcceptable(subject_ids[0], 0.5, acceptance_threshold=1, weighted=False)
print(f"Subject {subject_ids[0]} an acceptable candidate: {is_acceptable_candidate}")
print(f"Subject classifications for subject {subject_ids[0]}: {subject_classifications}\n")

# Find the acceptable candidates.
# Saves the acceptable candidates to a csv file.
acceptable_candidates = analyzer.findAcceptableCandidates(acceptance_ratio=0.5, save=True, weighted=False)
print(f"Acceptable candidates: {acceptable_candidates}\n")

# Sort and exclude the acceptable candidates by database.
print("Warning: This may take a while...\n")
generated_files = analyzer.sortAcceptableCandidatesByDatabase(acceptable_candidates)
print(f"Generated files: {generated_files}\n")

# To perform both the acceptable candidate finding and sorting in one step, uncomment and run the
# following three lines on their own. The status messages are commented out together with the call
# so that the cell does not report a sort that never ran.
#print("Warning: This may take a while...\n")
#analyzer.performCandidatesSort(acceptance_ratio=0.5)
#print("Acceptable candidates found and sorted!\n")

## Using the Classifier
The classifier handles user performance/accuracy in regards to classifications in the Analyzer object. The primary utility of the classifier is already built into the Analyzer object via the weighted keyword in the candidate functions, but the class itself has other functionality for detailing information about user accuracy.

In [None]:
# The classifier object lives inside the analyzer object.
classifier = analyzer.classifier

# Usernames known to the analyzer (logged-out users included).
usernames = analyzer.getUniqueUserIdentifiers(include_logged_out_users=True, user_identifier="username")
example_username = usernames[0]

# Accuracy of one specific user.
user_accuracy = classifier.getUserAccuracy(example_username, default_insufficient_classifications=True)
print(f"User accuracy for user {example_username}: {user_accuracy}\n")

# Verified classifications of one specific user.
verified_classifications_by_user = classifier.getUserVerifiedClassifications(example_username)
print(f"Verified classifications by user {example_username}: {verified_classifications_by_user}\n")

# User information of one specific user.
user_information = classifier.getUserInformation(example_username, default_insufficient_classifications=True)
print(f"User information for user {example_username}: {user_information}\n")

# Accuracies of all users.
user_accuracies = classifier.getAllUserAccuracies(
    include_logged_out_users=True,
    default_insufficient_classifications=True,
)
print("User accuracies:", *user_accuracies[:10], "...\n")

# Information of all users.
user_information = classifier.getAllUserInformation(
    include_logged_out_users=True,
    default_insufficient_classifications=True,
)
print("User information: too much to display...\n")

# Usernames of the most accurate users.
most_accurate_users = classifier.getMostAccurateUsernames(
    include_logged_out_users=True,
    default_insufficient_classifications=True,
    classification_threshold=0,
    verified_classifications_threshold=10,
    accuracy_threshold=0.0,
)
print("Most accurate users:", *most_accurate_users[:10], "...\n")

# Performance plot for one specific user.
classifier.plotUserPerformance(example_username)

# Histogram of all users' performance.
classifier.plotAllUsersPerformanceHistogram(
    include_logged_out_users=True,
    default_insufficient_classifications=True,
)

# Histogram of the top users' performance.
classifier.plotTopUsersPerformanceHistogram(
    classification_threshold=None,
    percentile=98,
    default_insufficient_classifications=True,
)

# Performance plots of the top users.
classifier.plotTopUsersPerformances(
    classification_threshold=None,
    percentile=98,
    default_insufficient_classifications=True,
)

# Performance plots of the most accurate users.
classifier.plotMostAccurateUsers(
    include_logged_out_users=True,
    default_insufficient_classifications=True,
    classification_threshold=0,
    verified_classifications_threshold=100,
    accuracy_threshold=0.0,
)

# Accuracy plotted against the number of classifications.
classifier.plotAccuracyVsClassificationTotals(
    include_logged_out_users=True,
    default_insufficient_classifications=True,
    log_plot=True,
    classification_threshold=0,
    verified_classifications_threshold=100,
    accuracy_threshold=0.0,
)