In [1]:
import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestRegressor 
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning) 
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error 
from sklearn.ensemble import *
from sklearn import ensemble
from sklearn.linear_model import *
import pickle
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, train_test_split
import gc
import joblib



def mdape(y_true,y_pred, **kwargs):
    mdape = abs((y_pred-y_true)/y_true)
    return mdape
my_scorer = make_scorer(mdape, greater_is_better=False)

def reduce_memory_usage(df):
    """ The function will reduce memory of dataframe
    Note: Apply this function after removing missing value"""
    intial_memory = df.memory_usage().sum()/1024**2
    print('Intial memory usage:',intial_memory,'MB')
    for col in df.columns:
        mn = df[col].min()
        mx = df[col].max()
        if df[col].dtype != object:            
            if df[col].dtype == int:
                if mn >=0:
                    if mx < np.iinfo(np.uint8).max:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < np.iinfo(np.uint16).max:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < np.iinfo(np.uint32).max:
                        df[col] = df[col].astype(np.uint32)
                    elif mx < np.iinfo(np.uint64).max:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
            if df[col].dtype == float:
                df[col] =df[col].astype(np.float32)
    
    red_memory = df.memory_usage().sum()/1024**2
    print('Memory usage after complition: ',red_memory,'MB')


SEED = 2020

usecols = ['DateUpdateListing','Price','Latitude','Longitude','PostalCode',	
                'StateCode','City',	'Neighborhood',	'PropertyType',	'YearBuilt','Beds',	'Baths']

# dtypes = {
#         'DateUpdateListing'            : 'category',
#         'Price'           : 'int32',
#         'Latitude'        : 'float16',
#         'Longitude'            : 'float16',
#         'PostalCode'       : 'int32',
#         'StateCode' : 'category',
#         'City'            : 'category',
#         'Neighborhood'           : 'object',
#         'PropertyType'        : 'category',
#         'YearBuilt'            : 'category',
#         'Beds'       : 'int8',
#         'Baths' : 'int8',
#         }


df= pd.read_csv(r'rent_new.csv', sep=';', usecols=usecols)

numcol = df.select_dtypes(include=[np.number]).columns.drop('Price')
catcol = df.select_dtypes(include=[np.object]).columns


def imputer(df):
    '''NaN values imputainon to mode for cats and 0 for nums'''
    df.Neighborhood = df.Neighborhood.fillna(df.City)
    df.YearBuilt = df.YearBuilt.replace(np.nan, df.YearBuilt.mode()[0]) 
    df.PropertyType = df.PropertyType.replace(np.nan, df.PropertyType.mode()[0]) 
    df = df.dropna(subset=['Price', 'Latitude', 'Longitude', 'StateCode', 'DateUpdateListing', 'City'])
    for col in df:
        if col in catcol:
            df.loc[:, col]=df.loc[:, col].fillna(df.loc[:, col].mode()[0])
        else: df.loc[:, col].fillna(0, inplace=True)
    return df

def encoder(df):

    '''value encoder by dictionaries'''
    state = pd.read_csv('StateCodeDict.csv', header=None, index_col=0, squeeze=True).to_dict()
    df.StateCode = df.StateCode.map(state)
    post = pd.read_csv('PostalCodeDict.csv', header=None, index_col=0, squeeze=True).to_dict()
    df.PostalCode = df.PostalCode.map(post)
    city = pd.read_csv('CityDict.csv', header=None, index_col=0, squeeze=True).to_dict()
    df.City = df.City.map(city)
    neighborhood = pd.read_csv('NeighborhoodDict.csv', header=None, index_col=0, squeeze=True).to_dict()
    df.Neighborhood = df.Neighborhood.map(neighborhood)
    prop = pd.read_csv('PropertyTypeDict.csv', header=None, index_col=0, squeeze=True).to_dict()
    df.PropertyType = df.PropertyType.map(prop)
    year = pd.read_csv('YearBuiltDict.csv', header=None, index_col=0, squeeze=True).to_dict()
    df.YearBuilt = df.YearBuilt.map(year)
    

    return(df)

def feature_transformator(df):
    df.City = df.City+'_'+df.StateCode
    df['BedBath'] = df.Beds + df.Baths
    df.DateUpdateListing = pd.to_datetime(df.DateUpdateListing)
    df['YearRent'] = df['DateUpdateListing'].dt.year
    # df['MonthRent'] = df['DateUpdateListing'].dt.month

    return df

def drop_outliers(df):

    df = df[(df.Price<df.Price.quantile(0.96)) & (df.Price>df.Price.quantile(0.02))]
    df = df[((df.Longitude<-50)&(df.Longitude>-140))]
    df.Longitude = abs(df.Longitude)
    df = df[(df.Latitude<50)]
    df = df[df.BedBath<=16]
    df = df[df.YearRent>=2021]
    cols_to_drop = ['DateUpdateListing', 'YearRent', 'BedBath']
    df = df.drop(cols_to_drop, axis=1) 

    return df

def feature_extractor(df):
    df = imputer(df)
    df = feature_transformator(df)
    df = encoder(df)
    df = drop_outliers(df)
    df = df.fillna(-1)
    reduce_memory_usage(df)
    return df



def main(df):
    df = feature_extractor(df)
    X = df.drop('Price', axis=1).reset_index(drop=True)
    y = df.Price
    del df
    gc.collect()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED)
    print(X_train.head(2))
    rf = RandomForestRegressor(max_depth=35, n_estimators=100, random_state=2020, n_jobs=-1)
    rf.fit(X_train, y_train)
    scores = (-1 * cross_val_score(rf, X_train, y_train,
                              cv=5,
                            scoring=my_scorer))
    score = mdape(y_test, rf.predict(X_test))
    print('Median Percentage Error: %.3f' % score.median())
    del X_train, X_test, y_train, y_test
    gc.collect()
    # save the model to disk
    filename = 'finalized_model.pkl'
    joblib.dump(rf, open(filename,'wb'), compress=3, protocol=-1)
    return print ('Model Saved')

In [2]:
main(df)

Intial memory usage: 48.03387451171875 MB
Memory usage after complition:  44.03105163574219 MB
         Latitude  Longitude  PostalCode  StateCode  City  Neighborhood  \
462411  34.390980  84.950264        4526         10    32           137   
458818  29.786694  95.201752       10832         44  4523         24784   

        PropertyType  YearBuilt  Beds  Baths  
462411             9        215     3      3  
458818            11        215     1      1  


exception calling callback for <Future at 0x1b490509610 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Users\Dmitry\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Users\Dmitry\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 340, in __call__
    self.parallel.dispatch_next()
  File "C:\Users\Dmitry\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 768, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Users\Dmitry\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 834, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Dmitry\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 753, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\

Median Percentage Error: 0.079
Model Saved
