### READ DATA

In [None]:
import pandas as pd
import numpy as np

In [None]:
### Load the training and test tables from the shared input directory.
train = pd.read_csv(filepath_or_buffer='../input/train.csv')
test = pd.read_csv(filepath_or_buffer='../input/test.csv')

In [None]:
### Show (rows, columns) of the training frame, then of the test frame.
print(train.shape, test.shape, sep='\n')

### TARGET

In [None]:
### Response variable: the price_doc column of the training frame.
target = train["price_doc"]

In [None]:
### Histogram of the raw (untransformed) target values.
target.hist()

In [None]:
from math import log
### Natural log of the target, computed with one vectorised NumPy call instead
### of a Python-level loop over math.log, then plotted as a 100-bin histogram.
### Unlike math.log, np.log does not raise on non-positive values; it yields
### -inf / nan with a RuntimeWarning.
log_target = np.log(np.asarray(target, dtype=float))
pd.DataFrame(log_target).hist(bins=100)

### EXPLORE VARIABLES

### CATEGORICAL VARIABLES

In [None]:
### Keep handles on the timestamp and target columns, then split the feature
### columns (everything except the first two columns and the last one) into
### object-typed names (cat_cols) and all other names (num_cols).
time = train["timestamp"]
target = train["price_doc"]
dtypes = train.iloc[:, 2:-1].dtypes
object_mask = dtypes == object
cat_cols = dtypes[object_mask].index
num_cols = dtypes[~object_mask].index

In [None]:
def categorical_summary(data, col):
    """Print a short profile of one column of a DataFrame.

    The report contains, in order: the upper-cased column name, a separator
    line, the number of non-null values, the number of null values, the
    number of distinct values (a missing value counts as one distinct value),
    the ``value_counts()`` table, and a trailing blank block.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column.
    col : str
        Name of the column to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    column = data[col]
    # Vectorised reductions: the builtin sum() would iterate the Series
    # element by element in Python.
    filled_values = int(column.notnull().sum())
    missing = int(column.isnull().sum())
    # dropna=False keeps the count identical to len(column.unique()).
    cardinality = column.nunique(dropna=False)
    print(col.upper())
    print('-------------------------------')
    print('filled count: %s' % filled_values)
    print('missing count: %s' % missing)
    print('cardinality: %s' % cardinality)
    print(column.value_counts())
    print('\n')

In [None]:
### inspect categorical variables in training set
### (prints one categorical_summary report per column name in cat_cols)
for col in cat_cols:
    categorical_summary(train, col)

In [None]:
### inspect categorical variables in test set
### (prints one categorical_summary report per column name in cat_cols)
for col in cat_cols:
    categorical_summary(test, col)

#### CATEGORICAL VS RESPONSE

In [None]:
### E.G. ECOLOGY HAS AN ORDINAL EFFECT ON PRICE RANGE
### BOTH ONE-HOT AND LABEL ENCODING SHOULD BE APPLIED TO ECOLOGY
### THE OTHER CATEGORICALS ARE ONLY ONE-HOT ENCODED
### Summary statistics of price_doc for each level of the last column in
### cat_cols (presumably ecology, given the note above -- confirm).
train.groupby(cat_cols[-1])["price_doc"].describe().unstack()

In [None]:
### Area codes can be used to generate per-area features such as median house
### price, min/max house price, price range, etc.
### Median price_doc per level of the second column in cat_cols; only the
### first five groups are shown.
train.groupby(cat_cols[1])["price_doc"].median().iloc[:5]

### NUMERICAL VARIABLES

In [None]:
### CHECK FOR OTHER CAT FEATURES INSIDE NUMERICAL
### For every numerical column print its name, the number of distinct values
### and the distinct values themselves.
for col in num_cols:
    # Compute the distinct values once and reuse them for the count and the listing.
    unique_values = train[col].unique()
    print(col)
    print(len(unique_values))
    print(unique_values)
    print("-------------------------------------")

In [None]:
### The male/female split of the population in each age band is probably
### unnecessary as a feature, since it is roughly 50% by nature :)
### What may matter is the total population of the area, plus socio-economic
### features that are affected by gender, such as labor participation rates.
### Beyond that, more general features can be generated as XXX_ALL / FULL_ALL,
### after which the raw counts can be dropped to reduce the dimension a little :)

### Columns at positions 28..54 of num_cols (presumably the population counts
### discussed above -- confirm against the column listing).
population_cols = num_cols[28:55]
train[population_cols].head()

Notes on num_cols features:

 1. full_sq - life_sq  = not_life_sq
 2. life_sq / full_sq = life_sq_ratio
 3. floor ordinal 
 4. max_floor ordinal
 5. material ordinal,  one-hot encoding also
 6. timestamp - build_year = build_age
 7. state, one-hot encoding and use as ordinal
 8. get ratios for xxx_all / full_all
 9. get ratios of buildings by year build_count_before_yyyy / raion_build_count_with_builddate_info
 10. get ratios of buildings by material: build_count_xxx / raion_build_count_with_material_info 




THERE ARE A LOT OF RADIAL COUNTS VARYING FROM 500m to 5000m

### ENCODE + NEW FEATURES

In [None]:
### MERGE TRAIN TEST FOR ENCODING
### Work on copies so the original frames stay untouched, and give the test
### rows index labels that continue right after the training rows.
train_copy = train.copy()
test_copy = test.copy()
test_copy.index = range(len(train_copy), len(train_copy) + len(test_copy))
### pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
### and removed in pandas 2.0. Columns missing from one frame are filled with
### NaN for that frame's rows.
merged = pd.concat([train_copy, test_copy], sort=False)
### Keep exactly the training columns, in the training column order.
merged = merged[train.columns]

In [None]:
### DO ONE HOT ENCODE
### DO NOT INCLUDE SUB AREA
### Treat the "no data" ecology level as a missing value. mask() does this in
### one vectorised step; np.nan is used because the np.NaN alias was removed
### in NumPy 2.0.
merged["ecology"] = merged["ecology"].mask(merged["ecology"] == "no data", np.nan)
### One-hot encode every categorical column except sub_area, plus the numeric
### material and state columns.
encoding_cols = [col for col in cat_cols if col != "sub_area"] + ["material", "state"]
merged_encoded = pd.get_dummies(merged, columns=encoding_cols)

def label_encode_ecology(x):
    """Map an ecology level to its integer code; pass anything else through.

    Codes: "poor" -> 0, "good" -> 1, "satisfactory" -> 2, "excellent" -> 3.
    A value equal to none of these levels (for example a missing value) is
    returned unchanged.

    NOTE(review): "good" is coded below "satisfactory". Confirm this order is
    intentional before relying on the codes as an ordinal scale.
    """
    level_codes = (
        ("poor", 0),
        ("good", 1),
        ("satisfactory", 2),
        ("excellent", 3),
    )
    for level, code in level_codes:
        if x == level:
            return code
    return x
    
### Integer-coded ecology next to its one-hot columns; the raw material and
### state columns are copied back because get_dummies replaced them.
merged_encoded["ecology_labels"] = merged["ecology"].apply(label_encode_ecology)
merged_encoded["material"] = merged["material"]
merged_encoded["state"] = merged["state"]

In [None]:
"""
Notes on num_cols features:

 1. full_sq - life_sq  = not_life_sq
 2. life_sq / full_sq = life_sq_ratio
 3. floor ordinal 
 4. max_floor ordinal
 5. material ordinal,  one-hot encoding also
 6. timestamp - build_year = build_age
 7. state, one-hot encoding and use as ordinal
 8. get ratios for xxx_all / full_all
 9. get ratios of buildings by year build_count_before_yyyy / raion_build_count_with_builddate_info
 10. get ratios of buildingsbuild_count_xxx / raion_build_count_with_material_info 

"""
### Area features: the part of the full area that is not living area, and the
### living area as a share of the full area.
### NOTE(review): rows with full_sq == 0 would produce inf/NaN ratios --
### confirm they are handled downstream.
merged_encoded["non_life_sq"] = merged_encoded["full_sq"].sub(merged_encoded["life_sq"])
merged_encoded["life_sq_ratio"] = merged_encoded["life_sq"].div(merged_encoded["full_sq"])



from datetime import datetime
def build_year(x):
    """Convert a numeric year into a ``datetime`` for 1 January of that year.

    The value is truncated to an integer and parsed with the ``%Y`` format.
    When the value cannot be converted (missing or non-numeric input,
    infinities, or an integer that is not a valid four-digit year) it is
    returned unchanged, so callers can receive a mix of ``datetime`` objects
    and raw values.

    Only the conversion errors are handled. The previous bare ``except`` also
    swallowed unrelated exceptions such as ``KeyboardInterrupt``.
    """
    try:
        return datetime.strptime(str(int(x)), "%Y")
    except (TypeError, ValueError, OverflowError):
        # TypeError: int(None); ValueError: int(nan), int("abc") or a year
        # that strptime rejects; OverflowError: int(inf).
        return x
def timestamp(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``.

    Input that is not a string, or a string that does not match the format,
    is returned unchanged. Only the parsing errors are handled. The previous
    bare ``except`` also swallowed unrelated exceptions such as
    ``KeyboardInterrupt``.
    """
    try:
        return datetime.strptime(x, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: x is not a string; ValueError: the string does not match the format.
        return x
### Run both date columns through their element-wise converters; values that
### cannot be converted are left as they were.
merged_encoded["build_year"] = merged_encoded["build_year"].apply(build_year)
merged_encoded["timestamp"] = merged_encoded["timestamp"].apply(timestamp)




### Building age in years at the time of the row's timestamp: timestamp year
### minus construction year. Rows where either value has no usable .year
### get NaN.
build_age = []
for sold_on, built_on in zip(merged_encoded.timestamp, merged_encoded.build_year):
    try:
        build_age.append(sold_on.year - built_on.year)
    except (AttributeError, TypeError):
        # Only the "no usable year" cases are handled, instead of a bare
        # except. np.nan replaces the np.NaN alias removed in NumPy 2.0.
        build_age.append(np.nan)
merged_encoded["build_age"] = np.array(build_age)


def _add_percent_columns(frame, columns):
    """Add ``<name>_percent`` columns to ``frame`` in place.

    ``columns[0]`` names the denominator column; every other name in
    ``columns`` is divided by it and stored under ``<name>_percent``.
    """
    total = frame[columns[0]]
    for col in columns[1:]:
        frame[col + "_percent"] = frame[col] / total


### Population counts as a share of the total population (full_all).
### NOTE(review): 'ekder_all' is kept exactly as written; presumably it matches
### the column name in the data -- confirm before "correcting" the spelling.
population_cols = ['full_all', 'young_all', 'work_all', 'ekder_all', '0_6_all', '7_14_all',
                   '0_17_all',  '16_29_all',  '0_13_all']
_add_percent_columns(merged_encoded, population_cols)

### Building counts per material as a share of the buildings with material info.
building_type_cols = ['raion_build_count_with_material_info', 'build_count_block', 'build_count_wood',
 'build_count_frame', 'build_count_brick', 'build_count_monolith', 'build_count_panel', 'build_count_foam',
 'build_count_slag', 'build_count_mix']
_add_percent_columns(merged_encoded, building_type_cols)

### Building counts per construction period as a share of the buildings with
### build-date info.
building_year_cols = ['raion_build_count_with_builddate_info', 'build_count_before_1920',
 'build_count_1921-1945', 'build_count_1946-1970', 'build_count_1971-1995', 'build_count_after_1995']
_add_percent_columns(merged_encoded, building_year_cols)

In [None]:
### WE WILL DEAL WITH NEARBY CAFE, FACILITY AND ETC FEATURES LATER IN DIMENSIONALITY REDUCTION
### MACRO FEATURES WILL BE DEALT SEPARATELY AS TIME SERIES