In [1]:
#imports
from zipfile import ZipFile


import numpy as np
import pandas as pd
import math
import datetime
from scipy.stats import uniform, randint

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV

In [2]:
# Constants shared by the preprocessing functions and the pipeline cells below.

# CSV files loaded with pd.read_csv in the pipeline cell (relative paths).
train_data_path =  "ads_train.csv"
test_data_path =  "ads_test.csv"
# Reference point that downtown_distance() measures every listing's lat/lng against.
# NOTE(review): the coordinates look like central Ankara - confirm.
downtown_lat = 39.920966
downtown_lng = 32.854116


# Columns that adjust_features() drops from the TRAINING frame.
# NOTE(review): presumably one-hot columns whose category never occurs in the
# test file; the list is hard-coded, so it must be revisited if the data changes.
train_remove_list = [
    'quarter_name_Altınova', 'quarter_name_Beyceğiz', 'quarter_name_Dutluk',
    'quarter_name_Erzurum', 'quarter_name_Harman', 'quarter_name_Hüseyingazi',
    'quarter_name_Kemalpaşa', 'quarter_name_Menderes', 'quarter_name_Ostim',
    'quarter_name_Yeşilöz', 'type_Benzin İstasyonu', 'type_Dağ Evi', 'type_Eczane',
    'type_Köşk', 'type_Sanal & Hazır Ofis', 'type_Taksi Durağı', 'type_Villa Katı',
    'type_Yalı Dairesi', 'type_Çiftlik Evi'
]


# Columns that adjust_features() drops from the TEST frame.
# NOTE(review): presumably one-hot columns whose category never occurs in the
# training file - confirm.
test_remove_list = [
    
    'quarter_name_Bacı', 'quarter_name_Derbent', 'quarter_name_Kıbrısköy',
    'quarter_name_Oyaca Yeşilçam', 'quarter_name_Subaşı', 'quarter_name_Çoğlu',
    'type_Yazlık'
]

# Columns dropped from both frames by remove_correlated_features().
# Per the original note these come from feature pairs with correlation above 0.7.
# The commented-out names below are disabled candidates: they stay in the data.

corr_list = [
            'quarter_name_Çalış',
            'quarter_name_Ortabereket',
#             'quarter_name_Hacıkara',
#             'lng'
            # 'quarter_name_Hasanoğlan Bahçelievler',
            # 'quarter_name_Saray',
            # 'quarter_name_Saracalar',
            # 'district_Polatlı',
            # 'type_Daire',
            # 'quarter_name_Merkez',
            # 'quarter_name_Gülpınar',
            # 'quarter_name_Fatih',
            # 'lat',
            # 'type_Çiftlik'
             
]


In [3]:
#Functions

def remove_extra_features(data, train):
    """Drop the columns that are not used as model features.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw listings; must contain ``city``, ``currency`` and ``ad_title``
        (plus ``ad_id`` and ``m2`` when ``train`` is true).
    train : bool
        When true, the extra training-only clean-up below is applied.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is left untouched.
    """
    trimmed = data.drop(columns=['city', 'currency', 'ad_title'])
    if not train:
        # Test frames keep `ad_id` (needed for the submission file) and every row.
        return trimmed

    # Training-only clean-up: remove the row labelled 35588, renumber the rows,
    # store m2 as 32-bit integers and discard the identifier column.
    trimmed = trimmed.drop(index=35588).reset_index(drop=True)
    trimmed['m2'] = trimmed['m2'].astype(np.int32)
    return trimmed.drop(columns='ad_id')


def dummy_category(data, train):
    """One-hot encode the categorical columns and drop the originals.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``type``, ``quarter_name`` and ``district`` columns.
    train : bool
        Accepted for call compatibility; training and test frames are
        processed identically.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``data`` followed by the indicator columns
        produced by ``pd.get_dummies``.
    """
    categorical = ['type', 'quarter_name', 'district']
    indicators = pd.get_dummies(data.filter(items=categorical))
    combined = pd.concat([data, indicators], axis=1)
    return combined.drop(columns=categorical)


def downtown_distance(data, center_lat=None, center_lng=None):
    """Append a ``distance`` column: straight-line distance to a reference point.

    The distance is the plain Euclidean norm of the (lat, lng) offsets, i.e. it
    is expressed in the same units as the coordinate columns, not in metres.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with numeric ``lat`` and ``lng`` columns. Any index is accepted.
    center_lat, center_lng : float, optional
        Reference point. Default to the module-level ``downtown_lat`` /
        ``downtown_lng`` constants.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with a float ``distance`` column appended last.
    """
    if center_lat is None:
        center_lat = downtown_lat
    if center_lng is None:
        center_lng = downtown_lng

    # Vectorised on the frame's own index: no per-row Python loop and no
    # positional concat that would misalign on a non-default index.
    lat_offset = data['lat'] - center_lat
    lng_offset = data['lng'] - center_lng
    distance = np.sqrt(lat_offset ** 2 + lng_offset ** 2).astype(float)
    return data.assign(distance=distance)


def setdiff_sorted(array1,array2,assume_unique=False):
    """Return the values of ``array1`` that are absent from ``array2`` as a list.

    ``assume_unique`` is forwarded to ``np.setdiff1d``. When it is true the
    result is additionally passed through ``sorted``; otherwise the list is
    returned exactly as ``np.setdiff1d`` produced it.
    """
    difference = np.setdiff1d(array1, array2, assume_unique=assume_unique).tolist()
    return sorted(difference) if assume_unique else difference

def adjust_features(train_data, test_data, train_remove=None, test_remove=None):
    """Drop a fixed set of columns from the training and the test frame.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames to trim; neither is modified in place.
    train_remove, test_remove : list of str, optional
        Column names to drop from the respective frame. Default to the
        module-level ``train_remove_list`` / ``test_remove_list``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)`` without the listed columns.

    Raises
    ------
    KeyError
        If a listed column is missing from the corresponding frame.
    """
    # The previous version also built four unused intermediates (copies of both
    # frames without the numeric columns); they only cost memory and made the
    # function fail when e.g. 'population' was absent, so they are gone.
    if train_remove is None:
        train_remove = train_remove_list
    if test_remove is None:
        test_remove = test_remove_list

    train_data = train_data.drop(columns=list(train_remove))
    test_data = test_data.drop(columns=list(test_remove))
    return train_data, test_data


def convert_posted(data):
    """Replace the ``posted`` column (``YYYYMMDD``) with ``year``/``month``/``day``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``posted`` column whose string form is ``YYYYMMDD``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``posted`` (and without any ``date`` column, as
        before) and with integer ``year``, ``month`` and ``day`` columns
        appended. ``data`` itself is not modified.

    Raises
    ------
    ValueError
        If a value cannot be parsed with the ``%Y%m%d`` format.
    """
    # Parse each value once instead of running strptime four times per row.
    posted = pd.to_datetime(data['posted'].astype(str), format='%Y%m%d')
    return (
        data
        .drop(columns=['posted', 'date'], errors='ignore')
        .assign(year=posted.dt.year, month=posted.dt.month, day=posted.dt.day)
    )


def local_submission(clf, train_data, test_data):
    """Fit ``clf`` on 80% of the training data and print the hold-out MSE.

    ``train_data`` must contain the ``price`` target column. ``test_data`` is
    accepted for signature parity with ``make_submission`` but is not used.
    The split is random, so the printed value changes from run to run.
    """
    target = train_data.price
    features = train_data.drop(columns='price')
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features, target, test_size=0.2
    )
    fitted = clf.fit(X_fit, y_fit)
    holdout_pred = fitted.predict(X_holdout)
    loss = mean_squared_error(y_holdout, holdout_pred)
    print("Loss is : {}".format(loss))
    
def make_submission(clf, train_data, test_data, path='submission.csv'):
    """Fit ``clf`` on the full training set and write the submission CSV.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``.
    train_data : pandas.DataFrame
        Training features plus the ``price`` target column.
    test_data : pandas.DataFrame
        Test features plus the ``ad_id`` identifier column.
    path : str, optional
        Destination of the CSV file (default ``'submission.csv'``).

    The file has two columns: ``Id`` (``ad_id`` cast to int32) and
    ``Predicted`` (the model output, untouched).
    """
    y_train = train_data['price']
    X_train = train_data.drop(columns='price')
    X_test = test_data.drop(columns='ad_id')

    model = clf.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Build the frame from plain arrays so ids and predictions are paired by
    # position, whatever index test_data carries. Only the id column is cast:
    # the old `tempDf.Id = tempDf.astype(np.int32)` assigned a whole frame to
    # one column instead of casting `Id`.
    submission = pd.DataFrame({
        'Id': test_data['ad_id'].to_numpy().astype(np.int32),
        'Predicted': np.asarray(y_pred),
    })
    submission.to_csv(path, index=False)


def remove_correlated_features(train_data, test_data):
    """Drop every column named in the module-level ``corr_list`` from both frames.

    Returns the trimmed ``(train_data, test_data)`` pair; the inputs are not
    modified. A column missing from either frame raises ``KeyError``.
    """
    pruned_train, pruned_test = (
        frame.drop(columns=corr_list) for frame in (train_data, test_data)
    )
    return pruned_train, pruned_test
def show_correlations(data=None):
    """Rank every pair of columns by absolute correlation.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``train_data``.

    Returns
    -------
    pandas.DataFrame
        One row per unordered column pair (MultiIndex of the two names), a
        single column holding ``|corr|``, sorted from highest to lowest.
        Pairs whose correlation is undefined (NaN) are omitted.
    """
    if data is None:
        data = train_data

    corr_matrix = data.corr().abs()
    # The matrix is symmetric, so keep only the strict upper triangle (k=1
    # excludes the diagonal). The builtin `bool` replaces the `np.bool` alias,
    # which no longer exists in NumPy >= 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    ranked = (corr_matrix.where(upper_mask)
                         .stack()
                         .dropna()  # masked cells must go regardless of how stack treats NaN
                         .sort_values(ascending=False))
    # The first row is the pair with the biggest correlation.
    return pd.DataFrame(ranked)
        

In [4]:
# Per-frame preparation: load the CSV, drop unused columns, one-hot encode the
# categorical columns and add the distance-to-downtown feature.
train_data = (
    pd.read_csv(train_data_path)
    .pipe(remove_extra_features, True)
    .pipe(dummy_category, True)
    .pipe(downtown_distance)
)
test_data = (
    pd.read_csv(test_data_path)
    .pipe(remove_extra_features, False)
    .pipe(dummy_category, False)
    .pipe(downtown_distance)
)

# Steps that need both frames: drop the columns listed for each frame.
train_data, test_data = adjust_features(train_data, test_data)

# Split the `posted` date into year / month / day columns.
train_data = convert_posted(train_data)
test_data = convert_posted(test_data)

# Finally drop the columns listed in corr_list from both frames.
train_data, test_data = remove_correlated_features(train_data, test_data)

In [5]:
# Bagging draws random bootstrap samples; fixing the seed makes a
# Restart & Run All reproduce the same submission file.
RANDOM_STATE = 42
clf = BaggingRegressor(XGBRegressor(), random_state=RANDOM_STATE)
make_submission(clf, train_data, test_data)

