In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw station and measurement tables from CSV for inspection.
# Both paths are relative, so the files must sit in the notebook's working directory.
df_station = pd.read_csv('hawaii_stations.csv')
df_measurement = pd.read_csv('hawaii_measurements.csv')

In [3]:
# Display the station frame in full to eyeball its columns and values.
df_station

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6
5,USC00519523,"WAIMANALO EXPERIMENTAL FARM, HI US",21.33556,-157.71139,19.5
6,USC00519281,"WAIHEE 837.5, HI US",21.45167,-157.84889,32.9
7,USC00511918,"HONOLULU OBSERVATORY 702.2, HI US",21.3152,-157.9992,0.9
8,USC00516128,"MANOA LYON ARBO 785.2, HI US",21.3331,-157.8025,152.4


In [4]:
# Preview only the first rows of the measurement frame (head() defaults to 5 rows).
df_measurement.head()

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [5]:


def fx_validate (v_dataframe , *args):
    '''
    Build a plain-text data-quality report for a DataFrame.

    The report contains, in this order:
      1. the total number of rows;
      2. for every candidate key passed in ``args``, whether that column
         combination is unique across the rows;
      3. how many rows contain at least one null value;
      4. for every column, its kind and a size figure: the maximum value for
         numeric columns, or the maximum UTF-8 byte length of the non-null
         values rendered as text for every other column (reported as 'String').

    Parameters
    ----------
    v_dataframe : pandas.DataFrame
        Frame to inspect. It is only read, never modified.
    *args : list
        Zero or more lists of column labels. Each list is checked as a
        candidate unique key.

    Returns
    -------
    str
        The report; every finding is terminated by a newline.

    Notes
    -----
    Key uniqueness is measured by counting ``groupby`` groups. With pandas'
    default settings rows holding a null in a key column are left out of the
    groups, so a key with nulls is reported as having duplicates. That is why
    the message says "(or probably some nulls)".

    Example
    -------
        x = fx_validate (df_station , ['latitude' , 'longitude'] , ['latitude', 'longitude', 'elevation'] , ['name'] , ['station'])
        print(x)

    Output:
        Total records in data frame = 9
        --------------------------------
        For [latitude, longitude] there are no duplicates.
        --------------------------------
        For [latitude, longitude, elevation] there are no duplicates.
        --------------------------------
        For [name] there are no duplicates.
        --------------------------------
        For [station] there are no duplicates.
        --------------------------------
        There are no Nulls.
        --------------------------------
        station is of String and max length/value is  11
        name is of String and max length/value is  38
        latitude is of Numeric and max length/value is  21.5213
        longitude is of Numeric and max length/value is  -157.71139
        elevation is of Numeric and max length/value is  306.6
    '''
    newline = chr(10)
    separator = '--------------------------------' + newline

    # --- row count ---------------------------------------------------------
    total_rows = len(v_dataframe)
    report = ['Total records in data frame = ' + str(total_rows) + newline]

    # --- candidate-key uniqueness -------------------------------------------
    for key_columns in args:
        # str() keeps the label builder working for non-string column labels.
        key_label = '[' + ', '.join(str(label) for label in key_columns) + ']'
        report.append(separator)

        distinct_keys = len(v_dataframe.groupby(key_columns))
        if distinct_keys == total_rows:
            report.append('For ' + key_label + ' there are no duplicates.' + newline)
        else:
            report.append('For ' + key_label + ' there are duplicates found (or probably some nulls).' + newline)
            report.append('Total distinct records in data frame for ' + key_label + ' key = ' + str(distinct_keys) + newline)

    # --- null check -----------------------------------------------------------
    # Count rows with at least one null directly instead of copying the whole
    # frame and dropping rows from the copy.
    report.append(separator)
    rows_with_nulls = int(v_dataframe.isnull().any(axis=1).sum())
    if rows_with_nulls == 0:
        report.append('There are no Nulls.' + newline)
    else:
        report.append('There are Nulls. Total records with some nulls values in columns are ' + str(rows_with_nulls) + newline)

    # --- per-column profile -----------------------------------------------------
    report.append(separator)
    numeric_columns = v_dataframe.select_dtypes(include=[np.number]).columns.tolist()

    for column_name in v_dataframe.columns:
        column = v_dataframe[column_name]
        if column_name in numeric_columns:
            column_kind = 'Numeric'
            column_size = str(column.max())
        else:
            column_kind = 'String'
            # Nulls are dropped first so the lengths stay integers (a null
            # would turn them into floats, e.g. '11.0'). astype(str) lets
            # bool/datetime/mixed columns be measured instead of raising on
            # the .str accessor. An all-null column yields 'nan'.
            text_values = column.dropna().astype(str)
            column_size = str(text_values.str.encode(encoding='utf-8').str.len().max())

        report.append(str(column_name) + ' is of ' + column_kind + ' and max length/value is  ' + column_size + newline)

    return ''.join(report)


In [6]:
def fx_validate_refrential_integrity(v_data_frame_parent, v_data_frame_child, v_join_column_list ):
    '''
    Report referential-integrity violations between a parent and a child frame.

    The key columns of both frames are outer-joined and every joined row is
    classified by the side it came from:
      * rows present only in the child are orphans (child key with no matching
        parent), which is a referential-integrity failure;
      * rows present only in the parent are parents without any child, which
        is reported for information.

    Parameters
    ----------
    v_data_frame_parent : pandas.DataFrame
        Frame holding the referenced keys. It is only read, never modified.
    v_data_frame_child : pandas.DataFrame
        Frame holding the referencing keys. It is only read, never modified.
    v_join_column_list : list
        Column labels, present in both frames, that form the join key.

    Returns
    -------
    str
        Two findings (child-vs-parent, then parent-vs-child), each terminated
        by a newline. When violations exist, the count is followed by a line
        listing up to five distinct offending key values.

    Example
    -------
        xx = fx_validate_refrential_integrity(df_station, df_measurement, ['station'] )
        print (xx)

    Output:
        There are no records in child for [station] that are not in parent. Integrity passed.
        There are no records in parent for [station] that are not in child.
    '''
    newline = chr(10)
    max_keys_listed = 5

    key_columns = list(v_join_column_list)
    # str() keeps the label builder working for non-string column labels.
    key_label = '[' + ', '.join(str(label) for label in key_columns) + ']'

    # Join only the key columns and let pandas tag each row's origin in the
    # '_merge' column. The caller's frames are never touched, so a failure in
    # the merge cannot leave temporary marker columns behind.
    joined = pd.merge(
        v_data_frame_parent[key_columns],
        v_data_frame_child[key_columns],
        on=key_columns,
        how='outer',
        indicator=True,
    )
    keys_only_in_child = joined.loc[joined['_merge'] == 'right_only', key_columns]
    keys_only_in_parent = joined.loc[joined['_merge'] == 'left_only', key_columns]

    def _sample_keys(frame):
        # Small, readable sample of the distinct offending key values.
        return str(frame.drop_duplicates().head(max_keys_listed).values.tolist())

    report = []

    # Child rows whose key has no parent: the actual integrity violation.
    if len(keys_only_in_child) > 0:
        report.append('Found ' + str(len(keys_only_in_child)) + ' record(s) in child but not in parent for key ' + key_label + '. Integrity failed.' + newline)
        report.append('Key values missing from parent (up to ' + str(max_keys_listed) + ' distinct shown): ' + _sample_keys(keys_only_in_child) + newline)
    else:
        report.append('There are no records in child for ' + key_label + ' that are not in parent. Integrity passed.' + newline)

    # Parent rows that no child refers to: informational only.
    if len(keys_only_in_parent) > 0:
        report.append('Found ' + str(len(keys_only_in_parent)) + ' record(s) in parent but not in child for key ' + key_label + newline)
        report.append('Key values missing from child (up to ' + str(max_keys_listed) + ' distinct shown): ' + _sample_keys(keys_only_in_parent) + newline)
    else:
        report.append('There are no records in parent for ' + key_label + ' that are not in child.' + newline)

    return ''.join(report)


In [7]:
# Profile the raw measurements: test (station, date) as a unique key,
# count rows containing nulls, and summarise each column.
x = fx_validate (df_measurement , ['station' , 'date'] )
print(x)

Total records in data frame = 19550
--------------------------------
For [station, date] there are no duplicates.
--------------------------------
There are Nulls. Total records with some nulls values in columns are 1447
--------------------------------
station is of String and max length/value is  11
date is of String and max length/value is  10
prcp is of Numeric and max length/value is  11.53
tobs is of Numeric and max length/value is  87



In [8]:
# Profile the station table, testing four candidate keys:
# (latitude, longitude), (latitude, longitude, elevation), name and station.
x = fx_validate (df_station , ['latitude' , 'longitude'] , ['latitude', 'longitude', 'elevation'] , ['name'] , ['station'])
print(x)

Total records in data frame = 9
--------------------------------
For [latitude, longitude] there are no duplicates.
--------------------------------
For [latitude, longitude, elevation] there are no duplicates.
--------------------------------
For [name] there are no duplicates.
--------------------------------
For [station] there are no duplicates.
--------------------------------
There are no Nulls.
--------------------------------
station is of String and max length/value is  11
name is of String and max length/value is  38
latitude is of Numeric and max length/value is  21.5213
longitude is of Numeric and max length/value is  -157.71139
elevation is of Numeric and max length/value is  306.6



In [9]:
# Check the station key between the two tables before cleaning:
# df_station is the parent, df_measurement the child.
xx = fx_validate_refrential_integrity(df_station, df_measurement, ['station'] )
print (xx)

There are no records in child for [station] that are not in parent. Integrity passed.
There are no records in parent for [station] that are not in child.



In [10]:
# Drop every measurement row that has a null in any column.
# inplace=True modifies df_measurement itself, so the raw rows are gone
# until the load cell is re-run.
df_measurement.dropna(how='any',inplace=True)

In [11]:
# Re-run the measurement profile after the null drop to confirm that no
# nulls remain and that (station, date) is still a unique key.
x = fx_validate (df_measurement , ['station' , 'date'] )
print(x)

Total records in data frame = 18103
--------------------------------
For [station, date] there are no duplicates.
--------------------------------
There are no Nulls.
--------------------------------
station is of String and max length/value is  11
date is of String and max length/value is  10
prcp is of Numeric and max length/value is  11.53
tobs is of Numeric and max length/value is  87



In [12]:
# Re-check the station key after the null drop: removing measurement rows
# could have left a station with no measurements at all.
xx = fx_validate_refrential_integrity(df_station, df_measurement, ['station'] )
print (xx)

There are no records in child for [station] that are not in parent. Integrity passed.
There are no records in parent for [station] that are not in child.



In [13]:
# Write the validated frames to new CSV files in the working directory.
# index=False leaves the DataFrame index out of the files.

df_station.to_csv("clean_hawaii_stations.csv", index=False, header=True)
df_measurement.to_csv("clean_hawaii_measurements.csv", index=False, header=True)