## Extraction

In [1]:
%matplotlib inline

# Write column names

In [2]:
def listToString(s):
    """Join the items of ``s`` into a single newline-separated string.

    Parameters
    ----------
    s : iterable of str
        Items to join; each one ends up on its own line.

    Returns
    -------
    str
        The items separated by "\\n" (no trailing newline); an empty
        iterable gives an empty string.
    """
    return "\n".join(s)

# Library Imports

In [3]:
import numpy as np
import pandas as pd
from datetime import datetime

# For import export of model
import pickle

# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the prediction template: the pickle holds three items, unpacked in
# order as the fragrance, profile and weather payloads.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('longevity_template.pickle', 'rb') as template_file:
    fragrance, profile, weather = pickle.load(template_file)

In [5]:
# Turn each payload into its own single-row DataFrame (row label 0).
# NOTE(review): eval() runs whatever expression the payload string contains;
# if the payloads are plain dict literals, ast.literal_eval would be a safer
# parser -- confirm the payload format before switching.
fragrance_df, profile_df, weather_df = (
    pd.DataFrame(data=eval(payload), index=[0])
    for payload in (fragrance, profile, weather)
)

In [6]:
# fragrance_df
# profile_df
# weather_df

#### Fixing Weather Keys

In [7]:
def fix_weather_keys(df):
    """Append ``'_avg'`` to every weather column except the last two.

    The final two columns are kept under their original names; every
    column before them gets the ``'_avg'`` suffix. A column that already
    ends in ``'_avg'`` is left alone, so running the function (or the
    notebook cell that calls it) a second time does not produce names
    such as ``temp_avg_avg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns renamed.
    """
    columns = list(df.columns)
    # With fewer than three columns there is nothing to suffix: the first
    # slice is empty and the second one holds every column.
    to_suffix, kept_as_is = columns[:-2], columns[-2:]

    renamed = [
        name if str(name).endswith('_avg') else name + '_avg'
        for name in to_suffix
    ]
    df.columns = renamed + kept_as_is
    return df

In [8]:
# Rename the weather columns with fix_weather_keys, then display the frame.
weather_df = fix_weather_keys(weather_df)
weather_df

Unnamed: 0,temp_avg,hum_avg,dew_point_avg,uv_index_avg,temp_feels_like_avg,atm_pressure_avg,clouds_avg,visibility_avg,wind_speed_avg,rain_avg,snow_avg,weather_main,weather_desc
0,81.835,74.625,73.16875,12.2475,87.1925,1012.875,36.125,60000,11.80125,1.59375,0,Rain,light rain


In [9]:
# Place the fragrance, profile and weather frames side by side (column-wise).
input_frames = [fragrance_df, profile_df, weather_df]
df = pd.concat(input_frames, axis='columns')

In [10]:
# Preview the combined frame right after concatenation.
df.head()

Unnamed: 0,fragrance,fragrance_gender,fragrance_type,brand,brand_tier,fp_id,gender,dob,sweat,height,...,uv_index_avg,temp_feels_like_avg,atm_pressure_avg,clouds_avg,visibility_avg,wind_speed_avg,rain_avg,snow_avg,weather_main,weather_desc
0,Swag20,Unisex,Eau de Cologne,Venom,High,1,Male,2019-11-12,82,54,...,12.2475,87.1925,1012.875,36.125,60000,11.80125,1.59375,0,Rain,light rain


## Cleaning

In [11]:
# Remove the weather description column. drop() returns a new frame with
# the remaining columns in their original order; errors='ignore' keeps the
# cell from failing when the column is already gone.
df = df.drop(columns=['weather_desc'], errors='ignore')

In [12]:
# Display the frame after weather_desc has been removed.
df

Unnamed: 0,fragrance,fragrance_gender,fragrance_type,brand,brand_tier,fp_id,gender,dob,sweat,height,...,dew_point_avg,uv_index_avg,temp_feels_like_avg,atm_pressure_avg,clouds_avg,visibility_avg,wind_speed_avg,rain_avg,snow_avg,weather_main
0,Swag20,Unisex,Eau de Cologne,Venom,High,1,Male,2019-11-12,82,54,...,73.16875,12.2475,87.1925,1012.875,36.125,60000,11.80125,1.59375,0,Rain


In [13]:
# Inspect the column dtypes before any type conversion.
df.dtypes

fragrance               object
fragrance_gender        object
fragrance_type          object
brand                   object
brand_tier              object
fp_id                    int64
gender                  object
dob                     object
sweat                    int64
height                   int64
weight                   int64
profession              object
skin_type               object
season                  object
fp_country              object
temp_avg               float64
hum_avg                float64
dew_point_avg          float64
uv_index_avg           float64
temp_feels_like_avg    float64
atm_pressure_avg       float64
clouds_avg             float64
visibility_avg           int64
wind_speed_avg         float64
rain_avg               float64
snow_avg                 int64
weather_main            object
dtype: object

In [14]:
# Missing values: treat absent rain/snow readings as 0.
# The fill is done on the frame and the result is assigned back, because
# calling fillna(..., inplace=True) on a selected column (df[col]) acts on an
# intermediate object and is not guaranteed to update df itself.
df = df.fillna({'rain_avg': 0, 'snow_avg': 0})

In [15]:
# Add the fixed prediction inputs: 7 sprays, applied today at 12:00:00
# (today's date comes from the local clock; the time of day is hard-coded).
df = df.assign(
    number_of_sprays=7,
    apply_time=datetime.now().strftime('%Y-%m-%d 12:00:00'),
)

In [16]:
# Convert both date columns to datetime64[ns] in a single cast, then show
# the resulting dtypes.
df = df.astype({'dob': 'datetime64[ns]', 'apply_time': 'datetime64[ns]'})

df.dtypes

fragrance                      object
fragrance_gender               object
fragrance_type                 object
brand                          object
brand_tier                     object
fp_id                           int64
gender                         object
dob                    datetime64[ns]
sweat                           int64
height                          int64
weight                          int64
profession                     object
skin_type                      object
season                         object
fp_country                     object
temp_avg                      float64
hum_avg                       float64
dew_point_avg                 float64
uv_index_avg                  float64
temp_feels_like_avg           float64
atm_pressure_avg              float64
clouds_avg                    float64
visibility_avg                  int64
wind_speed_avg                float64
rain_avg                      float64
snow_avg                        int64
weather_main

In [17]:
# Calculating age in fractional years (365.25-day years) from date of birth.
# pd.Timestamp.now() is used for the reference instant: it is always the
# local wall-clock time, whereas the meaning of pd.to_datetime('now') has
# differed between pandas versions.
now = pd.Timestamp.now()
df['age'] = (now - df['dob']).dt.total_seconds() / (60 * 60 * 24 * 365.25)

# dob has served its purpose once age exists.
df = df.drop(columns=['dob'])

In [18]:
# Split the application timestamp into separate calendar features.
apply_time_parts = df['apply_time'].dt
for part in ('year', 'month', 'day', 'hour', 'minute'):
    df['apply_time_' + part] = getattr(apply_time_parts, part)
df['apply_time_weekday_name'] = apply_time_parts.day_name()

# Make sure age is stored as a float.
df['age'] = df['age'].astype('float')

# The raw timestamp is dropped now that its parts are individual columns.
df.drop(columns=['apply_time'], inplace=True)

In [19]:
# Record the names of all object-dtype columns as a NumPy array.
# This must run BEFORE convert_dtypes(): that call may change the dtypes
# (presumably object text columns become pandas 'string' -- TODO confirm),
# after which select_dtypes(include=['object']) would no longer match them.
categorical_columns = df.select_dtypes(include=['object']).columns.values
df = df.convert_dtypes()
# Display the collected column names.
categorical_columns

array(['fragrance', 'fragrance_gender', 'fragrance_type', 'brand',
       'brand_tier', 'gender', 'profession', 'skin_type', 'season',
       'fp_country', 'weather_main', 'apply_time_weekday_name'],
      dtype=object)

In [20]:
def resolve_categorical_variables(df, column_names_arr):
    """One-hot encode ``column_names_arr`` against the training indicator set.

    The indicator column names are read from the columns of the frame
    pickled in ``longevity_categorical_variables.pickle``. Every such
    column is first created in the result with value 0. Then, for each
    column in ``column_names_arr``, the indicators produced by
    ``pd.get_dummies`` that also exist in the pickled frame replace their
    zero placeholders (they are re-appended at the end of the frame), and
    the source column is removed. Indicators for categories that are not
    in the pickled frame are discarded, so such rows keep 0 everywhere for
    that variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features. The caller's frame is not modified.
    column_names_arr : iterable of str
        Names of the columns in ``df`` to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the source columns replaced by indicator columns.

    Raises
    ------
    KeyError
        If a name in ``column_names_arr`` is not a column of ``df``.
    """
    # Load dummies from training.
    # NOTE(review): pickle.load can execute arbitrary code -- only open
    # trusted files.
    with open('longevity_categorical_variables.pickle', 'rb') as f:
        cat_df = pickle.load(f)

    # Work on a copy so the caller's frame is never left half-modified.
    df = df.copy()

    # Start every known indicator at 0.
    for indicator in cat_df.columns:
        df[indicator] = 0

    for column_name in column_names_arr:
        # dtype=int keeps the indicators as 0/1 numbers, like the zero
        # placeholders above, whatever get_dummies' default dtype is.
        pred_dummies = pd.get_dummies(df[column_name], prefix=column_name, dtype=int)

        # Keep only indicators the training data knows about. A plain list
        # is used so the emptiness check below is unambiguous (testing the
        # truth value of a NumPy array is not).
        known = [name for name in pred_dummies.columns if name in cat_df.columns]

        if known:
            # Remove every matching placeholder before concatenating, so no
            # indicator name ends up duplicated.
            df = df.drop(columns=known, errors='ignore')
            df = pd.concat([df, pred_dummies[known]], axis=1)

        df = df.drop(columns=[column_name])

    return df

In [21]:
# Encode every categorical column plus fp_id, which is appended by name to
# the array of categorical column names.
columns_to_encode = np.append(categorical_columns, 'fp_id')
df = resolve_categorical_variables(df, columns_to_encode)

  if not unavailable:


In [22]:
# Print the frame's shape, then the number of rows that repeat an earlier row.
print(df.shape, df.duplicated().sum(), sep='\n')

(1, 88)
0


In [23]:
# Columns removed from the frame before prediction.
dropped_features = [
    'uv_index_avg',
    'visibility_avg',
    'apply_time_minute',
    'apply_time_hour',
    'atm_pressure_avg',
    'clouds_avg',
    'temp_feels_like_avg',
    'wind_speed_avg',
]
df.drop(columns=dropped_features, inplace=True)
df.head()

Unnamed: 0,sweat,height,weight,temp_avg,hum_avg,dew_point_avg,rain_avg,snow_avg,number_of_sprays,age,...,fp_id_20,fp_id_21,fragrance_gender_Unisex,brand_tier_High,gender_Male,skin_type_Very Oily,season_Spring,weather_main_Rain,apply_time_weekday_name_Monday,fp_id_1
0,82,54,54,81.835,74.625,73.16875,1.59375,0,7,1.511461,...,0,0,1,1,1,1,1,1,1,1


In [24]:
# Names of the columns that still contain at least one missing value.
has_missing = df.isna().any()
df.columns[has_missing].tolist()

[]

In [25]:
# df.dtypes

In [26]:
# Save column names, one per line. The with-block closes the file even if
# the write fails.
with open('longevity_column_names_after_cleaning_drop_prediction.txt', 'w') as column_names_file:
    column_names_file.write(listToString(df.columns))

#### Comparing current columns with training columns

In [27]:
# Load the column names saved for comparison with the current frame.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('longevity_column_names_check.pickle', 'rb') as columns_file:
    train_columns = pickle.load(columns_file)

In [28]:
# Columns present in df but absent from train_columns; an empty array means
# df has nothing extra. assume_unique=True tells NumPy that neither input
# contains repeated names.
np.setdiff1d(df.columns, train_columns, assume_unique = True)

array([], dtype=object)

In [29]:
# Columns present in train_columns but absent from df; an empty array means
# df is not missing any of them. assume_unique=True tells NumPy that neither
# input contains repeated names.
np.setdiff1d(train_columns, df.columns, assume_unique = True)

array([], dtype=object)

In [30]:
# Set of column names that occur more than once. Index.duplicated() flags
# each repeat occurrence in one pass, instead of counting every name against
# the full column list.
set(df.columns[df.columns.duplicated()])

set()

# Model

In [31]:
# Load the pickled longevity model.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('longevity_model.pickle', 'rb') as model_file:
    longevity_model = pickle.load(model_file)

In [32]:
# Display the loaded model's configuration.
longevity_model

StackingRegressor(estimators=[('br', BaggingRegressor(random_state=0)),
                              ('lnr', LinearRegression()),
                              ('en',
                               ElasticNet(random_state=0, selection='random')),
                              ('en2',
                               ElasticNet(random_state=0, selection='random')),
                              ('llar', LassoLars(random_state=0)),
                              ('knr', KNeighborsRegressor(weights='distance'))],
                  final_estimator=DecisionTreeRegressor(random_state=0))

In [33]:
# new_test is a second name for the same frame object as df (no copy is made).
new_test = df

# Predict for the prepared row(s) and print the result.
# NOTE(review): the columns are passed in df's current order -- confirm this
# matches the order the model was fitted with.
y_pred = longevity_model.predict(new_test)
print(y_pred)

[169.89795918]


# END