## Extraction

In [277]:
# %matplotlib inline

# Write column names

In [278]:
def listToString(s):
    """Join the items of ``s`` into a single string, one item per line.

    Parameters
    ----------
    s : iterable
        Items to join. Items that are not already strings are converted
        with ``str()`` so that, for example, integer column labels work.

    Returns
    -------
    str
        The items separated by a newline; an empty string for empty input.
    """
    # The separator is a newline character (not an empty string).
    return "\n".join(map(str, s))

# Library Imports

In [279]:
import sys
import numpy as np
import pandas as pd
from datetime import datetime

# For import export of model
import pickle

# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [280]:
# sys.argv[1]

# Load the pickled input template; the payload must unpack into exactly three
# values (fragrance, profile, weather).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('longevity_template.pickle', 'rb') as f:
    fragrance, profile, weather = pickle.load(f)

In [281]:
# fragrance_df = pd.read_json(fragrance)

# Evaluate each payload and wrap the result in a single-row frame (index 0).
# NOTE(review): eval() executes arbitrary code. Presumably these payloads are
# dict-literal strings, in which case ast.literal_eval would be a safer
# parser -- confirm the payload format before switching.
fragrance_df    =  pd.DataFrame(data=eval(fragrance), index=[0])
profile_df      =  pd.DataFrame(data=eval(profile),   index=[0])
weather_df      =  pd.DataFrame(data=eval(weather),   index=[0])

In [282]:
# fragrance_df
# profile_df
# weather_df

#### Fixing Weather Keys

In [283]:
def fix_weather_keys(df):
    """Append ``_avg`` to every column name of ``df`` except the last two.

    The columns are renamed in place on the frame that was passed in, and
    that same frame object is returned.
    """
    labels = list(df.columns)
    # All labels before the final two get the suffix; the final two stay as-is.
    suffixed = [label + '_avg' for label in labels[:-2]]
    df.columns = suffixed + labels[-2:]
    return df

In [284]:
# Rename the weather columns: all but the last two get an '_avg' suffix.
weather_df = fix_weather_keys(weather_df)
# weather_df

In [285]:
# Place the three single-row frames side by side (column-wise concat) to form
# one feature row.
df = pd.concat([fragrance_df, profile_df, weather_df], axis=1)

In [286]:
# Preview the combined frame before cleaning.
df.head()

Unnamed: 0,fragrance,fragrance_gender,fragrance_type,brand,brand_tier,fp_id,gender,dob,sweat,height,...,uv_index_avg,temp_feels_like_avg,atm_pressure_avg,clouds_avg,visibility_avg,wind_speed_avg,rain_avg,snow_avg,weather_main,weather_desc
0,Ein Parfüm,Female,Eau de Toilette,Luxury Brand,High,1,Male,2019-11-12,82,54,...,12.2475,86.71875,1013,56.875,60000,12.035,1.515,0,Rain,light rain


## Cleaning

In [287]:
# df = df[df.columns.difference(['fba_country_name', 'fba_time_zone', 'suitability', 'sustainability', 'sillage', 'like', 'users_id', 'users_check', 'fba_location_country', 'fba_location_zone'], sort=False)]
# Drop the weather description column. drop() returns a new frame, so the
# column assignments in later cells do not operate on a slice of the original
# (which triggers SettingWithCopyWarning). errors='ignore' keeps the original
# tolerance for the column being absent.
df = df.drop(columns=['weather_desc'], errors='ignore')

In [288]:
# df

In [289]:
# df.dtypes

In [290]:
# Missing Values
# Assign the filled column back to df. Calling fillna(inplace=True) on
# df[...] is chained assignment: it acts on an intermediate Series and is not
# guaranteed to update df (it is a no-op under pandas copy-on-write).
df['rain_avg'] = df['rain_avg'].fillna(0)
df['snow_avg'] = df['snow_avg'].fillna(0)

In [291]:
# Fixed inputs for this prediction: the number of sprays is hard-coded to 7.
df['number_of_sprays'] = 7
# Only the date comes from the clock; the time of day is hard-coded to
# 12:00:00 by the format string.
df['apply_time'] = datetime.now().strftime('%Y-%m-%d 12:00:00')

In [292]:
# df = df.convert_dtypes()

# Datetime: cast both columns to datetime64[ns] so that the date arithmetic
# and the .dt accessor used in the following cells work on them.
df['dob']               = df['dob'].astype('datetime64[ns]')
df['apply_time']        = df['apply_time'].astype('datetime64[ns]')

# df.dtypes

In [293]:
# Calculating Age

# Age in fractional years: elapsed seconds since dob divided by the number of
# seconds in a 365.25-day year.
now = pd.to_datetime('now')
df['age'] = (now - df['dob']).dt.total_seconds() / (60*60*24*365.25)
# dob is removed once age has been derived from it.
df.drop(['dob'],axis=1, inplace=True)

In [294]:
# Sorting out Dates

# Apply Time: split the timestamp into separate calendar/time columns.
df['apply_time_year']          = df['apply_time'].dt.year
df['apply_time_month']         = df['apply_time'].dt.month
df['apply_time_day']           = df['apply_time'].dt.day
df['apply_time_hour']          = df['apply_time'].dt.hour
df['apply_time_minute']        = df['apply_time'].dt.minute
df['apply_time_weekday_name']  = df['apply_time'].dt.day_name()

# Type Cast
df['age']               = df['age'].astype('float')

# Drop apply_time now that its components have been extracted
# (no wear-off time column is handled in this notebook).
df.drop(['apply_time'],axis=1, inplace=True)

In [295]:
# Record the names of the object-dtype columns; they are encoded further below.
# NOTE(review): this is captured before convert_dtypes(), presumably because
# the conversion changes the dtype of these columns so that
# include=['object'] would no longer match them -- confirm before reordering.
categorical_columns = df.select_dtypes(include=['object']).columns.values
df = df.convert_dtypes()
# categorical_columns

In [296]:
def resolve_categorical_variables(df, column_names_arr):
    """Encode the listed columns of ``df`` with transformers loaded from disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose listed columns are overwritten in place.
    column_names_arr : iterable of str
        Columns to encode. Each name is also used as a key into the object
        stored in ``longevity_dummies.pickle``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns replaced.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    with open('longevity_dummies.pickle', 'rb') as handle:
        encoders = pickle.load(handle)

    for name in column_names_arr:
        # Two-column frame: the column's distinct values next to their position.
        distinct = pd.DataFrame(df[name].unique())
        distinct.insert(1, 'index', distinct.index)

        # NOTE(review): unique() yields fewer rows than df when values repeat,
        # so this presumably relies on df holding a single row -- confirm.
        df[name] = encoders[name].transform(distinct.to_numpy())

    return df

In [297]:
# with open('longevity_categorical_variables.pickle', 'rb') as f:
#     print(pickle.load(f)[0][1])

# with open('longevity_dummies.pickle', 'rb') as f:
#     print(pickle.load(f)['weather_main'])

In [298]:
# new = resolve_categorical_variables(df, np.append(categorical_columns, ('fp_id')))
# new

In [299]:
# Encode every object-dtype column captured earlier, plus 'fp_id'.
# Note: ('fp_id') is a plain string, not a tuple; np.append adds it as a
# single extra element.
df = resolve_categorical_variables(df, np.append(categorical_columns, ('fp_id')))

In [300]:
# print(df.shape)
# print(df.duplicated(keep='first').sum())

In [301]:
# Preview the frame after cleaning and categorical encoding.
df.head()

Unnamed: 0,fragrance,fragrance_gender,fragrance_type,brand,brand_tier,fp_id,gender,sweat,height,weight,...,snow_avg,weather_main,number_of_sprays,age,apply_time_year,apply_time_month,apply_time_day,apply_time_hour,apply_time_minute,apply_time_weekday_name
0,-1.0,0.0,1.0,-1.0,-1.0,0.0,1.0,82,54,54,...,0,2.0,7,1.492783,2021,5,10,12,0,1.0


In [302]:
# Columns with any null values
# df.fillna(0)
# df.columns[df.isna().any()].tolist()

In [303]:
# df.dtypes

In [304]:
# Save column names
# file = open('longevity_column_names_after_cleaning_drop_prediction.txt', 'w')

# dump information to that file
# file.write(listToString(df.columns))

# close the file
# file.close()

# df.columns

#### Comparing current columns with training columns

In [305]:
# with open('longevity_column_names_check.pickle', 'rb') as f:
#     train_columns = pickle.load(f)
    
# print(len(train_columns), len(df.columns))

In [306]:
# np.setdiff1d(df.columns, train_columns, assume_unique = True)

In [307]:
# np.setdiff1d(train_columns, df.columns, assume_unique = True)

In [308]:
# set([x for x in df.columns if list(df.columns).count(x) > 1])

# Model

In [309]:
# Load the pickled model used for the prediction below.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('longevity_model.pickle', 'rb') as f:
    longevity_model = pickle.load(f)

In [310]:
# longevity_model

In [311]:
# Predict from the prepared feature row and print the result.
# NOTE(review): presumably df's columns must match the columns the model was
# trained on; the commented-out comparison cells above appear to have been
# used for that check -- confirm.
y_pred = longevity_model.predict(df)
print(y_pred)

[60.]


# END