In [24]:

# Load the saved 'column_DBSCAN_dict' object through the load_object helper
# (not defined in this section). Descriptions uses it as the default for its
# column_DBSCAN_dict keyword, where the entry for a column name is unpacked
# as keyword arguments to DBSCAN.
column_DBSCAN_dict = load_object('column_DBSCAN_dict')

In [35]:

import random

class Descriptions(object):
    '''
    Sample

    The sample is from the first wave of the National Epidemiologic Survey
    on Alcohol and Related Conditions (NESARC), the largest nationwide
    longitudinal survey of alcohol and drug use and associated psychiatric
    and medical comorbidities. Participants (N=43,093) represented the
    civilian, non-institutionalized adult population of the United States,
    and included persons living in households, military personnel living off
    base, and persons residing in the following group quarters: boarding or
    rooming houses, non-transient hotels and motels, shelters, facilities
    for housing workers, college quarters, and group homes. The NESARC
    included over sampling of Blacks, Hispanics and young adults aged 18 to
    24 years. The data analytic sample for this study included participants
    18-25 years old who reported smoking at least 1 cigarette per day in the
    past 30 days (N=1,320).

    Procedure

    Data were collected by trained U.S. Census Bureau Field Representatives
    during 2001–2002 through computer-assisted personal interviews (CAPI).
    One adult was selected for interview in each household, and interviews
    were conducted in respondents’ homes following informed consent
    procedures.

    Measures

    Lifetime major depression (i.e. those experienced in the past 12 months
    and prior to the past 12 months) was assessed using the NIAAA, Alcohol
    Use Disorder and Associated Disabilities Interview Schedule – DSM-IV
    (AUDADIS-IV) (Grant et al., 2003; Grant, Harford, Dawson, & Chou, 1995).
    The tobacco module of the AUDADIS-IV contains detailed questions on the
    frequency, quantity, and patterning of tobacco use as well as symptom
    criteria for DSM-IV nicotine dependence. Current smoking was evaluated
    through both smoking frequency (“About how often did you usually smoke
    in the past year?”) coded dichotomously to represent presence or absence
    of daily smoking, and quantity (“On the days that you smoked in the last
    year, about how many cigarettes did you usually smoke?”), a quantitative
    variable that ranged from 1 cigarette per day to 98 cigarettes per day.
    
    '''

    def __init__(self, df, **kwargs):
        prop_defaults = {
            'random_column_count': 4,
            'categorical_labels_list': ['Very Low', 'Low', 'Moderate', 'High', 'Very High'],
            'number_column_list': number_column_list,
            'column_DBSCAN_dict': column_DBSCAN_dict,
            'informal_name_dict': informal_name_dict,
            'data_provider_dict': data_provider_dict,
            'data_procedures_dict': data_procedures_dict,
            'data_measures_dict': data_measures_dict,
            'random_column_list': None,
            'verbose_html': '',
        }

        for (prop, default) in prop_defaults.items():
            setattr(self, prop, kwargs.get(prop, default))
        
        if self.random_column_list is None:
            self.random_column_list = random.sample(self.number_column_list, self.random_column_count)
        self.df = df.dropna(how='any', subset=self.random_column_list).copy()
        self.row_count = self.df.shape[0]
        
        # Categorical Confounders?
        self.dbscan_description_dict = {}
        for column_name in self.random_column_list:
            category_column_name = column_name + '_group'
            if category_column_name not in self.df.columns:
                self.df[category_column_name] = DBSCAN(**self.column_DBSCAN_dict[column_name]).fit(self.df[column_name].as_matrix().reshape(-1,
                                                                                                                                       1)).labels_
            if category_column_name in self.df.columns:
                dbscan_labels_dict = {}
                groups_list = sorted(self.df[category_column_name].unique().tolist())
                if -1.0 in groups_list:
                    groups_list = groups_list[1:]
                for i, dbscan_category in enumerate(groups_list):
                    dbscan_category = int(dbscan_category)
                    dbscan_labels_dict[dbscan_category] = self.categorical_labels_list[i]
                dbscan_category_count = len(dbscan_labels_dict.keys())
                dbscan_description_list = []
                for group_label, group_count in self.df[category_column_name].value_counts(sort=False).iteritems():
                    group_label = int(group_label)
                    if group_label != -1:
                        
                        # African American (n = 414, 34.4%)
                        group_description = '{0} (n = {1:d}, {2:.2%})'.format(dbscan_labels_dict[group_label],
                                                                              group_count, group_count/self.row_count)

                        dbscan_description_list.append(group_description)
                self.dbscan_description_dict[column_name] = dbscan_description_list
    
    # Sample
    # Identify who or what was studied (people, animals, etc.). Identify the level of analysis studied
    # (individual, group, or aggregate). Describe observations vividly so your reader can distinguish
    # them clearly. If you group observations, use meaningful names (“Low-Income Women”) rather than
    # abbreviations (“PPM100”) or variable names (“INCOME_GRP”).
    def populate_sample_description(self):
        self.sample_description_html = ('<h3>Sample</h3><p>Gapminder contains data for all 192 UN members, ' +
                                        'aggregating data for Serbia and Montenegro, and includes data for 24 ' +
                                        'other areas (generating a total of 215 areas). All ' +
                                        str(self.row_count) + ' countries with a datapoint for each aggregate variable ' +
                                        'under consideration were drawn from this dataset. ')
        for column_name in self.random_column_list:
            self.sample_description_html += ('The ' +
                                             self.informal_name_dict[column_name] + ' composition, ' +
                                             self.data_provider_dict[column_name] + ', was ' + 
                                             conjunctify_list(self.dbscan_description_dict[column_name]) + '. ')
        self.sample_description_html += ('</p>')
    
    # Procedures
    # Explain what participants/observations experienced. Discuss whether data were collected by surveillance,
    # survey, experiment, or another method. Discuss where data were collected and the period over which they
    # were collected.
    def populate_procedures_description(self):
        self.procedures_description_html = ('<h3>Procedure</h3><p>')
        for column_name in self.random_column_list:
            self.procedures_description_html += self.data_procedures_dict[column_name] + ' '
        self.procedures_description_html += ('</p>')
    
    # Measures
    # Describe the questions or measures of your participants/observations and relate these
    # to the type of data you collected (quantitative or categorical). Again, provide meaningful descriptions
    # (“number of cigarettes smoked per day”), rather than variable names (“SQB8A”) that will have no meaning
    # to the reader. Discuss how you managed the measures for your analysis.
    def populate_measures_description(self):
        self.measures_description_html = ('<h3>Measures</h3><p>')
        for column_name in self.random_column_list:
            self.measures_description_html += ('The measure of ' +
                                               self.informal_name_dict[column_name] + ' was assessed using ' +
                                               self.data_measures_dict[column_name] + '. ')
        self.measures_description_html += ('For the current analysis, each variable was binned into five ' +
                                           'categories on a density-based spatial clustering of applications with ' +
                                           'noise (DBSCAN).')