In [None]:
import pandas as pd
import numpy as np

# Show up to 30 columns when a wide frame is displayed. The fully-qualified
# option key is used because the short form 'max_columns' is ambiguous in
# newer pandas releases (it also matches 'styler.render.max_columns') and
# raises OptionError there.
pd.set_option('display.max_columns', 30)

In [None]:
# Load both CSVs used throughout the notebook: the extracted input data and
# the matching-features file it is compared against.
extracted_data = pd.read_csv('1-extracted_data_final.csv')
matching_features = pd.read_csv('2-matching_features_final.csv')


In [None]:
# (rows, columns) of the extracted data.
extracted_data.shape


In [None]:
# Column names that exist in matching_features but not in extracted_data.
set(matching_features.columns) - set(extracted_data.columns)


In [None]:
# Display the extracted data for visual inspection.
extracted_data


In [None]:
# Display the matching features for visual inspection.
matching_features


In [None]:
# Preview matching_features with a single row per '_id' (pandas keeps the first
# occurrence by default). The result is only displayed; matching_features
# itself is left unchanged.
matching_features.drop_duplicates('_id')

In [None]:
# Column names of matching_features.
matching_features.columns

In [None]:
# Inner-join the two frames on '_id', then discard every column whose name ends
# in '_y' (the right-hand copies of columns present in both frames).
joined = (
    extracted_data
    .merge(matching_features, on='_id')
    .pipe(lambda frame: frame.drop(
        columns=[name for name in frame.columns if name.endswith('_y')]))
)
joined


In [None]:
# Column dtypes of the joined frame.
joined.dtypes


In [None]:
# Flag rows where 'price' equals 'price_value_blended', then tally matches
# against mismatches. NaN never compares equal, so a row with both values
# missing is counted as False.
joined['helper'] = joined['price'].eq(joined['price_value_blended'])
joined['helper'].value_counts()


In [None]:
# Distinct (price, price_value_blended) pairs among the rows flagged as not equal.
joined.loc[~joined['helper'], ['price', 'price_value_blended']].drop_duplicates()

In [None]:
# Flag rows where 'package' equals 'package_value', then tally matches against
# mismatches. NaN never compares equal, so a row with both values missing is
# counted as False.
joined['helper'] = joined['package'].eq(joined['package_value'])
joined['helper'].value_counts()


In [None]:
# Distinct (package, package_value) pairs among the rows flagged as not equal.
joined.loc[~joined['helper'], ['package', 'package_value']].drop_duplicates()

In [None]:
# Frequency of each distinct 'thc_weight' value in the joined frame.
joined['thc_weight'].value_counts()

In [None]:
# Literal flavor renames applied, in this order, to 'flavor_ingestibles'.
# Renames that were tried and are currently disabled:
#   'watermelon' -> 'sweet watermelon', 'peach' -> 'sour peach',
#   'pineapple mango' -> 'spicy pineapple mango',
#   'mango | pineapple' -> 'spicy pineapple mango',
#   'blueberry' -> 'blueberry lemonade', 'strawberry' -> 'sativa strawberry'.
_FLAVOR_RENAMES = (
    ('ñ', 'n'),
    ('sour blue raspberry', 'blue raspberry'),
    ('root beer float', 'rootbeer float'),
    ('assorted', 'assorted flavors'),
    ('cocoa banana cream pie', 'banana cream pie'),
    ('tropical fruit', 'tropical'),
    ('sour blackberry', 'blackberry'),
    ('rainbow sorbet', 'rainbow sherbet'),
    ('plum', 'indica plum'),
    ('chile lime', 'mango & chile lime'),
)

# Columns (and their order) of the frame returned by develop_matching_features.
_MATCHING_COLUMNS = (
    '_id',
    'brand_name',
    'flavor_ingestibles',
    'thc_weight',
    'cbd_weight',
    'cannabis_strain_type',
    'price',
    'package',
    'day_report_date',
    'data_source_name',
    'product_slug',
    'id_product',
)


def _format_mg(values: pd.Series) -> pd.Series:
    """Render each number as '<truncated int>mg'; missing values stay missing."""
    return values.map(lambda value: str(int(value)) + 'mg', na_action='ignore')


def _rename_literal(series: pd.Series, old: str, new: str) -> pd.Series:
    """Replace the literal substring `old` with `new` without doubling up.

    When `new` itself contains `old` (e.g. 'plum' -> 'indica plum'), a value
    that is already in the target form is first folded back to `old`, so it
    ends up as `new` rather than e.g. 'indica indica plum'.
    """
    if old in new:
        series = series.str.replace(new, old, regex=False)
    return series.str.replace(old, new, regex=False)


def develop_matching_features(extracted_data_path: str) -> pd.DataFrame:
    """Build the matching-features frame from an extracted-data CSV file.

    Args:
        extracted_data_path: Path of the extracted_data CSV file.

    Returns:
        A DataFrame with the columns listed in ``_MATCHING_COLUMNS``, in that
        order, one row per row of the input file.

    Raises:
        KeyError: If the CSV lacks one of the columns read below.
    """
    extracted_data = pd.read_csv(extracted_data_path)

    # price and package are straight copies of their source columns.
    extracted_data['price'] = extracted_data['price_value_blended']
    extracted_data['package'] = extracted_data['package_value']

    # thc_weight / cbd_weight: cannabinoid values with an 'mg' unit appended.
    extracted_data['thc_weight'] = _format_mg(extracted_data['cannabinoid_thc_value'])
    extracted_data['cbd_weight'] = _format_mg(extracted_data['cannabinoid_cbd_value'])

    # id_product: '<brand_name>-<flavor_ingestibles>' with spaces turned into
    # hyphens and apostrophes removed. It is built from the values as read from
    # the file, i.e. before the brand/flavor clean-up below.
    # NOTE(review): confirm id_product is meant to use the un-normalized names.
    extracted_data['id_product'] = (
        extracted_data['brand_name'] + '-' + extracted_data['flavor_ingestibles']
    ).str.replace(' ', '-', regex=False).str.replace('\'', '', regex=False)

    # brand_name clean-up: drop hyphens, default missing brands to 'smokiez',
    # then normalize two known spelling variants.
    # TODO: the 'smokiez' default for missing brands is a stop-gap.
    brand = extracted_data['brand_name'].str.replace('-', '', regex=False)
    brand = brand.fillna('smokiez')
    brand = brand.str.replace('stiiizy', 'stiizy', regex=False)
    brand = brand.str.replace('smokiez edibles', 'smokiez', regex=False)
    extracted_data['brand_name'] = brand

    # flavor_ingestibles clean-up: apply the literal renames in table order.
    flavor = extracted_data['flavor_ingestibles']
    for old, new in _FLAVOR_RENAMES:
        flavor = _rename_literal(flavor, old, new)
    extracted_data['flavor_ingestibles'] = flavor

    # Keep only the matching-feature columns, in their canonical order.
    return extracted_data[list(_MATCHING_COLUMNS)]
    
    

In [None]:
# Run develop_matching_features on the extracted-data CSV and display the result.
test = develop_matching_features('1-extracted_data_final.csv')
test


In [None]:
# Inner-join the generated features with matching_features on '_id'. Columns
# present in both frames get pandas' default suffixes: '_x' for the generated
# side and '_y' for the matching_features side.
joined = test.merge(matching_features, on = '_id')
joined


In [None]:
# Flag rows where the generated flavor agrees with the flavor from
# matching_features. Both sides are taken from `joined` (the merge suffixes
# them '_x' / '_y'), so the two values compared always belong to the same
# '_id'. Comparing against the un-merged matching_features column would pair
# rows by index label instead, and raises when the two indexes differ.
joined['helper'] = joined['flavor_ingestibles_x'] == joined['flavor_ingestibles_y']
# Show the '_id' and flavor columns of the rows that disagree.
joined.loc[~joined['helper'],
           [col for col in joined.columns if 'flavor_ingestibles' in col
            or '_id' in col]]

In [None]:
# Among rows flagged False whose generated brand ('brand_name_x') is missing,
# count how often each brand from matching_features ('brand_name_y') occurs.
joined.loc[
    ~joined['helper'] & joined['brand_name_x'].isna(),
    'brand_name_y',
].value_counts()

In [None]:
# Column dtypes of the joined frame.
joined.dtypes

In [None]:
def test_develop_matching_features():
    """Check develop_matching_features against the stored matching-features CSV.

    The columns 'id_product', 'brand_name' and 'flavor_ingestibles' are left
    out of the comparison. Both frames are sorted by '_id' and re-indexed
    before being compared, so row order does not matter.

    Raises:
        AssertionError: If the two frames differ.
    """
    excluded = ['id_product', 'brand_name', 'flavor_ingestibles']

    def comparable(frame):
        # Same normalization for both sides of the comparison.
        trimmed = frame.drop(columns = excluded)
        return trimmed.sort_values('_id').reset_index(drop = True)

    expected = comparable(pd.read_csv('2-matching_features_final.csv'))
    actual = comparable(develop_matching_features('1-extracted_data_final.csv'))

    pd.testing.assert_frame_equal(expected, actual)
    

In [None]:
# Run the check; pd.testing.assert_frame_equal raises AssertionError on any difference.
test_develop_matching_features()