# Streamlining the Process of Exploring Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing Datasets

In [2]:
#can run this if working in the same folder as the data
def read_data(survey_section, discipline, dataset):
    ''' Reads one UCUES csv file from the current working directory and hands it back as a pandas dataframe

    The file name is built as <survey_section>_data_<discipline>_<dataset>.csv and the file is parsed as
    tab-separated text encoded in UTF-16 little-endian.

    Params:
        survey_section  str, Shortened section name, ex: Satis
        discipline    str, Shortened discipline name, ex: Hum
        dataset    int, the sub-dataset from the specified survey section

    Returns:
        pandas dataframe holding the contents of the file
    '''
    #Relative file name, so it is resolved against the folder the code is run from
    csv_name = f"{survey_section}_data_{discipline}_{dataset}.csv"
    return pd.read_csv(csv_name, sep='\t', encoding='utf-16le')

In [None]:
def read_data_from_file_path(file_path, UCUES_dataset, survey_section, discipline, dataset):
    ''' Reads one UCUES csv file stored under <file_path>/Data/UCUES-<UCUES_dataset>/ and hands it back as a
    pandas dataframe

    The file name inside that folder is <survey_section>_data_<discipline>_<dataset>.csv and the file is parsed
    as tab-separated text encoded in UTF-16 little-endian.

    Params:
        file_path     str, folder on your local environment that contains the Data folder ("/Data/..." is
                      appended to it), ex: /Users/omarramos/Documents/Data_Mural_Project
        UCUES_dataset   str, name of the UCUES survey section and sub-dataset number, ex: Satisfaction-1
        survey_section  str, Shortened section name, ex: Satis
        discipline    str, Shortened discipline name, ex: Hum
        dataset    int, the sub-dataset from the specified survey section

    Returns:
        pandas dataframe holding the contents of the file
    '''
    #Folder of the requested survey section, then the file name for the requested discipline
    section_folder = f"{file_path}/Data/UCUES-{UCUES_dataset}"
    csv_name = f"{survey_section}_data_{discipline}_{dataset}.csv"
    return pd.read_csv(f"{section_folder}/{csv_name}", sep='\t', encoding='utf-16le')

## Open and merge datasets

In [3]:
# Example of how to use the loader above to open the datasets of every discipline for one survey section
# and combine them, together with their scores, into a single new dataframe

def create_dataset(file_path, UCUES_dataset, survey_section, discipline, dataset):
    ''' Loads the same UCUES sub-dataset for every discipline and merges them into one one-hot encoded pandas
    dataframe

    Params:
        file_path     str, folder that contains the Data folder, handed to read_data_from_file_path
        UCUES_dataset   str, name of the UCUES survey section and sub-dataset number, ex: Satisfaction-1
        survey_section  str, Shortened section name, ex: Satis
        discipline    not used, every discipline listed in the function is always loaded; the parameter is
                      kept so that existing calls keep working
        dataset    int, the sub-dataset from the specified survey section

    Returns:
        pandas dataframe with
          - one 0/1 column per discipline marking which discipline file the row came from
          - one indicator column per distinct value of 'Pivot Field Values' (that column itself is dropped)
          - Label1, Calculation1 and Total renamed to Statement, Percent_pop and Pop_raw_count
          - Percent_pop converted from percent text to a fraction, ex: "25%" -> 0.25
          - an "index" column holding each row's position in the merged data before duplicates were dropped
    '''
    disciplines = ["Arts", "Hum", "Life", "Eng", "Health", "Phys", "Prof", "Social", "Undec"]

    #load the dataset of each discipline and tag its rows with 0/1 discipline columns
    frames = []
    for name in disciplines:
        df = read_data_from_file_path(file_path, UCUES_dataset, survey_section, name, dataset)
        for column in disciplines:
            df[column] = int(column == name)
        frames.append(df)

    #merge datasets for each discipline
    #ignore_index gives every row a unique label; with the per-file labels (0..n repeated for each discipline)
    #the join below would match each row with the score columns of every other row sharing its label
    concat_df = pd.concat(frames, ignore_index=True)

    #convert remaining NaNs to 0 (applies to every column, not only the discipline ones)
    concat_df = concat_df.fillna(0)

    # Get one hot encoding of column with scores
    one_hot = pd.get_dummies(concat_df['Pivot Field Values'])
    # Drop column with scores as it is now encoded, then join the encoded columns row by row
    one_hot_df = concat_df.drop('Pivot Field Values', axis=1).join(one_hot)

    #rename columns
    one_hot_df = one_hot_df.rename(columns={"Label1": "Statement", "Calculation1": "Percent_pop",
                                            "Total": "Pop_raw_count"})

    #drop duplicate rows, then move the old row labels into an "index" column
    new_df = one_hot_df.drop_duplicates().reset_index()

    #convert string percent column to actual percentages
    #done on the whole column at once: writing through new_df["Percent_pop"][x] is chained assignment, which
    #does not update new_df when pandas Copy-on-Write is enabled
    #astype(str) lets the zeros written by fillna above go through the same conversion
    percent_text = new_df["Percent_pop"].astype(str).str.replace("%", "", regex=False)
    new_df["Percent_pop"] = percent_text.astype(float) / 100

    return new_df
    