In [None]:
import pandas as pd
import plotly
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime
plt.style.use('ggplot')
%matplotlib inline
%pylab inline
from sklearn.preprocessing import MinMaxScaler

# First and last calendar month ("YYYY-MM") of the analysed period.
# Both are passed to get_months_list() to build months_list; first_date is also
# the origin used by transform_date().
first_date = "2012-04"
last_date = "2017-05"

# Auxiliary functions

Binarizing boolean values

In [None]:
def to_bin(logical):
    """Map a truthy value to 1 and a falsy value to 0."""
    return 1 if logical else 0

Calculating list of all months

In [None]:
def get_months_list(start_date, finish_date):
    """
    Return every calendar month from start_date to finish_date, inclusive.

    Parameters
    ----------
    start_date, finish_date : str
        Dates whose first seven characters are "YYYY-MM".

    Returns
    -------
    list of str
        ``start_date`` itself followed by the subsequent months formatted as
        zero-padded "YYYY-MM" strings, up to and including the finish month.

    Raises
    ------
    ValueError
        If a month is outside 1..12 or finish_date lies before start_date.
        (Previously such inputs made the loop run forever.)
    """
    start_year, start_month = int(start_date[0:4]), int(start_date[5:7])
    finish_year, finish_month = int(finish_date[0:4]), int(finish_date[5:7])

    for label, month in (("start_date", start_month), ("finish_date", finish_month)):
        if not 1 <= month <= 12:
            raise ValueError("%s has an invalid month: %d" % (label, month))
    if (finish_year, finish_month) < (start_year, start_month):
        raise ValueError("finish_date %r is earlier than start_date %r"
                         % (finish_date, start_date))

    months_list = [start_date]
    year, month = start_year, start_month
    # Compare parsed (year, month) pairs rather than strings so that the loop is
    # guaranteed to terminate regardless of how the inputs are padded.
    while (year, month) != (finish_year, finish_month):
        month += 1
        if month == 13:
            month = 1
            year += 1
        months_list.append("%d-%02d" % (year, month))
    return months_list
        
# Every month from first_date to last_date inclusive, as "YYYY-MM" strings.
# The per-type lead lists built later are aligned with this list by position.
months_list = get_months_list(first_date, last_date)


Initializing the key-date dictionary with an empty list for every flat

In [None]:
def init_flat_dates(data):
    """
    Build a dictionary that maps every value of data['flat_name'] to its own
    fresh empty list.
    """
    return {name: [] for name in data['flat_name']}

Returns, for each flat type, the list of flats of that type

In [None]:
def devide_flat_types(data, type_names):
    """
    Group flat names by type.

    For every column name in type_names, collect the 'flat_name' of each row
    whose value in that column equals 1. Rows are addressed by the labels
    0..n-1, so ``data`` is expected to carry a default integer index.

    Returns a dict {type column name: list of flat names}.
    """
    n_rows = data.shape[0]
    grouped = {}
    for type_name in type_names:
        members = []
        for row in range(n_rows):
            if data[type_name][row] == 1:
                members.append(data["flat_name"][row])
        grouped[type_name] = members
    return grouped

# Functions for visual analysis

In [None]:
def discrete_distribution_vis(data, column):
    """
    Show a bar chart with the number of rows of ``data`` per distinct value
    of ``column``.

    Bars are positioned at the values themselves (in order of first
    appearance) and the chart is titled "<column> distribution".
    """
    column_values = data[column]
    # Compute the distinct values once and reuse them for positions and counts.
    x_values = column_values.unique()
    y_values = [(column_values == val).sum() for val in x_values]
    plt.bar(x_values, y_values)
    plt.xticks(x_values, x_values)
    plt.title(column + " distribution")
    plt.show()

def сontinious_distribution_vis(data, column):
    """Draw the seaborn distribution plot of the values in data[column]."""
    column_values = data[column]
    sns.distplot(column_values)
    
def dict_bar_vis(dictionary, name):
    """
    Show a bar chart of a {label: value} dictionary, ordered by label.

    Parameters
    ----------
    dictionary : dict
        Maps tick labels to bar heights.
    name : str
        Chart title.
    """
    # Sort the labels first and look the heights up in that same order, so that
    # every bar sits above its own label. (Previously the bars were drawn in
    # dictionary order while the tick labels were sorted afterwards.)
    x_labels = sorted(dictionary)
    y_values = [dictionary[label] for label in x_labels]
    x_values = np.arange(len(x_labels))
    plt.bar(x_values, y_values)
    plt.xticks(x_values, x_labels)
    plt.title(name)
    plt.show()
    
    
def lead_visualize_bar(dictionary):
    """
    Show one bar chart per key of ``dictionary``.

    Each value is a sequence with one entry per element of months_list; bars
    are placed at, and labelled with, the month positions 0..len(months_list)-1.
    The key is used as the chart title.
    """
    positions = np.arange(len(months_list))
    for flat_type, counts in dictionary.items():
        heights = list(counts)
        plt.bar(positions, heights)
        plt.xticks(positions, positions)
        plt.title(str(flat_type))
        plt.show()
    
    
def lead_visualize_plot(dictionary):
    """
    Draw every value of ``dictionary`` as a line on one shared chart titled
    "Lead tendency"; the x axis is the month position within months_list.
    """
    positions = np.arange(len(months_list))
    for series in dictionary.values():
        plt.plot(positions, list(series))
    plt.title("Lead tendency")
    plt.show()
    
   

# Extraction of time-independent features

In [None]:
def transform_flat_area(data):
    """
    Replace the flat area with a standardized area-per-room value.

    "wholeprojectarea" is divided by "roomquantity" and then standardized
    (mean subtracted, divided by the sample standard deviation).

    The computation is done on a copy: the caller's frame is left untouched,
    so calling this function repeatedly on the same raw frame always gives the
    same result. (The notebook calls extract_basic_attributes(initial_data)
    more than once; mutating in place made the second call divide twice.)

    NOTE(review): a "roomquantity" of 0 would produce infinite values here —
    confirm how studios are encoded in the source data.

    Returns the transformed copy.
    """
    data = data.copy()
    area_per_room = data["wholeprojectarea"] / data["roomquantity"]
    data["wholeprojectarea"] = (area_per_room - area_per_room.mean()) / area_per_room.std()
    return data

def transform_room_quantity(data):
    """
    One-hot encode the room quantity.

    Adds one "room_<value>" indicator column per distinct "roomquantity" value
    and removes the "roomquantity" and "number_flat" columns.

    Returns a new frame; the input is not modified.
    """
    rooms = pd.get_dummies(data["roomquantity"], prefix="room")
    data = pd.concat([data, rooms], axis=1)
    # The axis must be given by keyword: the positional form drop(label, 1)
    # was removed in pandas 2.0.
    return data.drop(columns=["roomquantity", "number_flat"])

def transform_floor(data):
    """
    Replace the "floor" column with indicator columns for floor groups.

    The groups are the half-open intervals between consecutive entries of
    ``limits``; the column for [low, high) is named "floors_<low>-<high-1>"
    and holds 1 where low <= floor < high, otherwise 0. Floors outside every
    interval get 0 in all group columns.

    Returns a new frame without the "floor" column; the input is not modified.
    """
    limits = [1, 2, 5, 9, 13, 19, 20]
    floor = data["floor"]
    # Drop by keyword (the positional axis argument was removed in pandas 2.0)
    # and add the new columns to the resulting frame, not to the caller's.
    result = data.drop(columns=["floor"])
    for low, high in zip(limits[:-1], limits[1:]):
        name = "floors_" + str(low) + "-" + str(high - 1)
        result[name] = ((floor >= low) & (floor < high)).astype(int)
    return result
    
def transform_categorical(data, column, regime="dum"):
    """
    Remove a categorical column, optionally replacing it with dummy columns.

    Parameters
    ----------
    data : pandas.DataFrame
    column : str
        Name of the column to process.
    regime : str
        "dum" adds one "<column>_<value>" indicator column per distinct value
        before the original column is removed; any other value just removes
        the column.

    Returns a new frame; the input is not modified.
    """
    if regime == "dum":
        new_columns = pd.get_dummies(data[column], prefix=column)
        data = pd.concat([data, new_columns], axis=1)
    # Keyword form: the positional axis argument was removed in pandas 2.0.
    return data.drop(columns=[column])
    
def extract_basic_attributes(basic_data):
    """
    Build the time-independent feature set of all flats.

    Applies, in order: area transformation, room-quantity encoding, floor
    grouping, dummy encoding of "sector" and removal of "corpus".
    """
    with_area = transform_flat_area(basic_data)
    with_rooms = transform_room_quantity(with_area)
    with_floors = transform_floor(with_rooms)
    with_sector = transform_categorical(with_floors, "sector", regime="dum")
    return transform_categorical(with_sector, "corpus", regime="del")

# Extraction of time-dependent features

In [None]:
def add_demand_month(data, lead_table, months=None):
    """
    Add a "demand_month" column: the min-max scaled number of leads for the
    row's flat type in the row's month.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the indicator columns "studio", "room_1" ... "room_4"
        and a "date" column whose text starts with "YYYY-MM".
    lead_table : dict
        Maps "studio", "1k", "2k", "3k", "4k" to a sequence of lead counts
        aligned with ``months`` by position.
    months : list of str, optional
        Month labels ("YYYY-MM"). Defaults to the notebook-level months_list.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the new column added. A column whose raw values
        are all equal is set to 0.0 instead of the NaN that 0/0 would give.

    Raises
    ------
    ValueError
        If a row has none of the type indicators set, or its month is not in
        ``months``.
    """
    if months is None:
        months = months_list

    # Position of the first occurrence of each month (same as list.index, but
    # computed once instead of once per row).
    month_position = {}
    for position, month in enumerate(months):
        month_position.setdefault(month, position)

    # Checked in this order; when several indicators are set the last one wins.
    type_by_column = [("studio", "studio"), ("room_1", "1k"), ("room_2", "2k"),
                      ("room_3", "3k"), ("room_4", "4k")]
    flags = [data[column].tolist() for column, _ in type_by_column]

    new_feature = []
    for row, date in enumerate(data["date"].tolist()):
        # Reset per row so a row without any indicator cannot silently inherit
        # the type of the previous row.
        type_flat = None
        for (_, lead_type), column_flags in zip(type_by_column, flags):
            if column_flags[row] == 1:
                type_flat = lead_type
        if type_flat is None:
            raise ValueError("row %d has no flat type indicator set" % row)
        month = str(date)[0:7]
        if month not in month_position:
            raise ValueError("month %r of row %d is outside the known months" % (month, row))
        new_feature.append(lead_table[type_flat][month_position[month]])

    demand = pd.Series(new_feature, index=data.index, dtype=float)
    if len(demand) > 0:
        low, high = demand.min(), demand.max()
        if high > low:
            demand = (demand - low) / (high - low)
        else:
            demand = demand * 0.0
    data["demand_month"] = demand
    return data

In [None]:
def add_time_depend_features(data, lead_table, gis_data, history_data):
    """
    Add the time-dependent features to ``data``.

    Currently this is only the monthly demand computed by add_demand_month;
    gis_data and history_data are accepted but not used.
    """
    return add_demand_month(data, lead_table)

## Calculating popularity of flats

Calculate the monthly popularity of every flat type for the months that appear in the lead table

In [None]:
def make_lead_table(data):
    """
    Count leads per month, per flat type and overall.

    Returns a list of six dictionaries {"YYYY-MM": number of leads}: one for
    each of the indicator columns "room_studio", "room_1k", "room_2k",
    "room_3k", "room_4k" (rows where the column equals 1), followed by one
    that counts every row. The month is the first seven characters of
    "CreatedOn". Months without leads are absent from the dictionaries.
    """
    type_suffixes = ["studio", "1k", "2k", "3k", "4k"]
    row_numbers = range(data.shape[0])

    def tally_months(timestamps):
        # Number of timestamps falling into each "YYYY-MM" month.
        tally = {}
        for stamp in timestamps:
            month = stamp[0:7]
            tally[month] = tally.get(month, 0) + 1
        return tally

    # Creation dates of the leads of each type.
    leads_by_type = [
        [data["CreatedOn"][row] for row in row_numbers if data["room_" + suffix][row] == 1]
        for suffix in type_suffixes
    ]

    lead_table = [tally_months(stamps) for stamps in leads_by_type]
    # Demand over all flats, regardless of type.
    lead_table.append(tally_months(data["CreatedOn"][row] for row in row_numbers))
    return lead_table

Calculate the monthly popularity of every flat type for all months of the period (months without leads count as 0)

In [None]:
def make_full_lead_table(data):
    """
    Expand the sparse lead counts to one value per month of months_list.

    Returns a dict with the keys "studio", "1k", "2k", "3k", "4k" and "all";
    each value is a list aligned with months_list holding the number of leads
    in that month, with 0 for months that have none.
    """
    flat_types = ["studio", "1k", "2k", "3k", "4k", "all"]
    sparse_tables = make_lead_table(data)
    return {
        flat_type: [monthly_counts.get(month, 0) for month in months_list]
        for flat_type, monthly_counts in zip(flat_types, sparse_tables)
    }

# Calculating key-dates

Calculating key-dates connected with lead information

In [None]:
def extract_lead_dates(data, flat_dates, flat_type_names, lead_table, months=None):
    """
    Add lead-based key dates to every flat.

    For each flat type three sets of month-start dates ("YYYY-MM-01") are
    derived from the lead counts:

    * months where the type's share of all leads exceeds 2.1 times its
      average share,
    * months where the type's lead count exceeds 5 times its mean,
    * "not popular" months where the type has no leads although other types
      have; of these only a random subset of size int(min(3, count / 4)) is
      used, drawn separately for every flat.

    Parameters
    ----------
    data : unused
    flat_dates : dict
        {flat name: list of dates}; extended in place.
    flat_type_names : dict
        {"studio" / "room_1" ... "room_4": list of flat names}.
    lead_table : dict
        {"studio", "1k" ... "4k", "all": per-month lead counts}.
    months : list of str, optional
        Month labels aligned with the lead counts. Defaults to the
        notebook-level months_list.

    Returns ``flat_dates``.
    """
    if months is None:
        months = months_list
    flat_lead_types = ["studio", "1k", "2k", "3k", "4k"]
    flat_types = ["studio", "room_1", "room_2", "room_3", "room_4"]

    month_starts = [month + "-01" for month in months]
    month_numbers = range(len(months))
    totals = lead_table["all"]
    mean_total = np.mean(totals)

    for lead_type, flat_type in zip(flat_lead_types, flat_types):
        counts = lead_table[lead_type]
        mean_count = np.mean(counts)

        # Tendency of this particular type. Months without any lead at all are
        # skipped: their share is undefined and used to raise ZeroDivisionError.
        if mean_total > 0:
            share_threshold = 2.1 * mean_count / mean_total
            key_dates_part = [month_starts[ind] for ind in month_numbers
                              if totals[ind] > 0 and counts[ind] / totals[ind] > share_threshold]
        else:
            key_dates_part = []
        # Exceptionally high absolute demand.
        key_dates_all = [month_starts[ind] for ind in month_numbers
                         if counts[ind] > 5 * mean_count]
        # Not popular dates.
        key_dates_not = [month_starts[ind] for ind in month_numbers
                         if counts[ind] == 0 and totals[ind] != 0]

        not_len = len(key_dates_not)
        for flat_name in flat_type_names[flat_type]:
            flat_dates[flat_name].extend(key_dates_part)
            flat_dates[flat_name].extend(key_dates_all)
            if not_len > 0:
                # Reshuffle for every flat so each gets its own random subset.
                np.random.shuffle(key_dates_not)
                flat_dates[flat_name].extend(key_dates_not[0:int(min(3, not_len * 1.0 / 4))])
    return flat_dates

Extracting dates of key-dates:

In [None]:
def extract_dates(data, flat_dates, column_name):
    """
    Append the value of data[column_name] of every row to the date list of
    the flat named in that row's 'flat_name'.

    Rows whose flat is not a key of ``flat_dates`` are skipped. ``flat_dates``
    is modified in place and returned. Rows are addressed by the labels
    0..n-1, so ``data`` is expected to carry a default integer index.
    """
    n_rows = data.shape[0]
    for row in range(n_rows):
        flat_name = data['flat_name'][row]
        if flat_name in flat_dates:
            flat_dates[flat_name].append(data[column_name][row])
    return flat_dates

Adding dates to data

In [None]:
def add_dates_to_data(data, dates):
    """
    Expand ``data`` to one row per (flat, date) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'flat_name' column.
    dates : dict
        {flat name: list of dates}; must have an entry for every flat name
        occurring in ``data``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index in which every input row is
        repeated once per date of its flat, with that date stored in the
        'date' column (appended as the last column if it did not exist,
        overwritten otherwise). Flats with no dates produce no rows. Column
        dtypes of the input are preserved.
    """
    columns = list(data.columns)
    if 'date' not in columns:
        columns.append('date')

    # Collect the positions of the rows to repeat and build the result in one
    # step. Appending frame by frame was quadratic and relied on
    # DataFrame.append, which was removed in pandas 2.0.
    row_positions = []
    row_dates = []
    for position, flat_name in enumerate(data['flat_name']):
        for date in dates[flat_name]:
            row_positions.append(position)
            row_dates.append(date)

    new_data = data.iloc[row_positions].reset_index(drop=True)
    new_data['date'] = row_dates
    return new_data[columns]

# Reading data

In [None]:
# All source tables are semicolon-separated CSV files in the data/ directory.
initial_data = pd.read_csv("data/flat.csv", sep=';')
lead_data = pd.read_csv("data/lead.csv", sep=';')
deal_data = pd.read_csv("data/opportunity.csv", sep=';')
gis_data = pd.read_csv("data/gis.csv", sep=';')
history_data = pd.read_csv("data/history.csv", sep=';')

# Visual analysis

In [None]:
"""
discrete_distribution_vis(basic_data, "floor")
discrete_distribution_vis(basic_data, "corpus")
discrete_distribution_vis(basic_data, "sector")
сontinious_distribution_vis(basic_data, "wholeprojectarea")
flat_types = ["studio", "1k", "2k", "3k", "4k", "all"]
lead_table = make_lead_table(lead_data)
for i in range(len(flat_types)):
    dict_bar_vis(lead_table[i], flat_types[i])
"""
# Per-type lead counts for every month of the period, drawn as one line per
# flat type (plus the "all" total) on a single chart.
full_lead_table = make_full_lead_table(lead_data)
#lead_visualize_bar(full_lead_table)
lead_visualize_plot(full_lead_table)

Transform dates

In [None]:
import datetime
def transform_date(data, origin=None):
    """
    Replace the 'date' column with a standardized day offset.

    Each date (text starting with "YYYY-MM") is converted to an approximate
    number of days since ``origin`` using 365 days per year and 30 days per
    month — the day of the month is ignored — and the offsets are then
    standardized (mean 0, population standard deviation 1).

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place.
    origin : str, optional
        Reference month "YYYY-MM". Defaults to the notebook-level first_date.

    Returns
    -------
    pandas.DataFrame
        ``data``. If all dates fall into the same month (or there are no rows)
        the column is filled with 0.0 instead of the NaN that 0/0 would give.
    """
    if origin is None:
        origin = first_date
    origin_days = int(origin[0:4]) * 365 + int(origin[5:7]) * 30
    delta = np.array(
        [int(x[0:4]) * 365 + int(x[5:7]) * 30 - origin_days for x in data['date']],
        dtype=float,
    )
    spread = delta.std() if delta.size else 0.0
    if spread > 0:
        data['date'] = (delta - delta.mean()) / spread
    else:
        data['date'] = np.zeros(delta.shape)
    return data

Create labels

In [None]:
def make_labels(flat_dates_positive, flat_dates_negative):
    """
    Return a list of integer labels: a 1 for every element of
    flat_dates_positive followed by a 0 for every element of
    flat_dates_negative (only the lengths of the arguments are used).
    """
    positive_part = [1] * len(flat_dates_positive)
    negative_part = [0] * len(flat_dates_negative)
    return positive_part + negative_part

# Making train set

In [None]:
# Time-independent attributes: one row per flat.
basic_data = extract_basic_attributes(initial_data)
lead_table = make_full_lead_table(lead_data)
flat_type_names = devide_flat_types(basic_data, ["studio", "room_1", "room_2", "room_3", "room_4"])

# Key dates per flat. Positive examples come from deal contract dates;
# negative examples from lead statistics and the gis / history tables.
flat_dates_negative = init_flat_dates(basic_data)
flat_dates_positive = init_flat_dates(basic_data)
flat_dates_negative = extract_lead_dates(basic_data, flat_dates_negative, flat_type_names, lead_table)
flat_dates_positive = extract_dates(deal_data, flat_dates_positive, "contractdate")
flat_dates_negative = extract_dates(gis_data, flat_dates_negative, "datefrom")
flat_dates_negative = extract_dates(history_data, flat_dates_negative, "datefrom")

# Expand the per-flat table separately for the positive and the negative dates
# and stack the two parts. Both expansions must start from basic_data:
# expanding the already-expanded positive frame would replace the positive
# rows instead of adding the negative ones, leaving labels and rows out of step.
data_positive = add_dates_to_data(basic_data, flat_dates_positive)
data_negative = add_dates_to_data(basic_data, flat_dates_negative)
size_positive = data_positive.shape[0]
size_negative = data_negative.shape[0]
data = pd.concat([data_positive, data_negative], ignore_index=True)

# Row i of `data` is labelled by labels[i]: positives first, then negatives.
labels = np.append(np.ones(size_positive), np.zeros(size_negative))

data = add_time_depend_features(data, lead_table, gis_data, history_data)

print(list(data.columns))
print(data.shape[0])
#data = transform_date(data)
# Identifier and raw date are not features. Keyword form: the positional axis
# argument of drop() was removed in pandas 2.0.
data = data.drop(columns=["flat_name", "date"])

# Feature selection

Dimensionality reduction

In [None]:
from sklearn.decomposition import PCA
# Project the feature table onto its first two principal components.
data_for_pca = data
pca = PCA(n_components=2)
data_pca = pca.fit(data_for_pca).transform(data_for_pca)

# Training

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
# GridSearchCV lives in sklearn.model_selection, like the classes above; the
# old sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

kNN, k-Fold

In [None]:
from sklearn.neighbors import KNeighborsClassifier

X = np.array(data, dtype=float)
y = np.array(labels)
# Show only the shape; printing the whole matrix floods the output.
print(X.shape)

n_neighbors_array = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15]
best_n_neighbors_vals = []
fold_scores = []

# The rows are ordered positives first, negatives last, so the folds have to be
# shuffled and stratified; otherwise a fold can consist of a single class.
kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    knn_clf = KNeighborsClassifier()
    grid = GridSearchCV(knn_clf, param_grid={'n_neighbors': n_neighbors_array})
    grid.fit(X_train, y_train)
    best_n_neighbors_vals.append(grid.best_estimator_.n_neighbors)
    # Score the model tuned on the training fold against the held-out fold,
    # and keep the score of every fold (not only the last one).
    fold_scores.append(grid.score(X_test, y_test))
scores = np.array(fold_scores)
print("Values of best parameters:")
print(best_n_neighbors_vals)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Final model for the prediction cells below: tuned and fitted on all data
# rather than on the training half of the last fold.
grid = GridSearchCV(KNeighborsClassifier(), param_grid={'n_neighbors': n_neighbors_array})
grid.fit(X, y)


In [None]:

# Build one feature row per flat for the prediction date, following the same
# steps as for the training set.
data_test = extract_basic_attributes(initial_data)
lead_table = make_full_lead_table(lead_data)
flat_type_names = devide_flat_types(data_test, ["studio", "room_1", "room_2", "room_3", "room_4"])
test_date = '2017-05-30'
test_dates = {flat_name: [test_date] for flat_name in data_test['flat_name']}
data_test = add_dates_to_data(data_test, test_dates)
data_test = add_time_depend_features(data_test, lead_table, gis_data, history_data)
# transform_date is intentionally not applied: the training set does not use it,
# the "date" column is dropped right below, and with a single prediction date
# its standardization would divide by a zero standard deviation.
flat_names = data_test['flat_name']
# Keyword form: the positional axis argument of drop() was removed in pandas 2.0.
data_test = data_test.drop(columns=["flat_name", "date"])

data_test_processed = np.array(data_test, dtype=float)
result = grid.predict_proba(data_test_processed)


In [None]:
# Keep the second probability column (index 1) as a flat 1-D array.
result_pos = result[:, 1]
print(result_pos)

In [None]:
# Turn probabilities into hard 0/1 predictions with a 0.5 threshold.
result_pos = [to_bin(probability > 0.5) for probability in result_pos]

In [None]:
# One row per flat: its name and the binary prediction.
result_to_csv = pd.DataFrame({'flat_name': flat_names, 'prediction': result_pos})
result_to_csv

In [None]:
# Write the predictions to predictions.csv in the working directory.
result_to_csv.to_csv("predictions.csv")