In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Shell out to `ls` and print the names found in ../input (output decoded as UTF-8).
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# read data

In [None]:
# Load the training listings. The context manager closes the file handle as
# soon as pandas has finished parsing, instead of leaking it for the rest of
# the kernel session as a bare open() call would.
with open("../input/train.json", "r") as train_file:
    df = pd.read_json(train_file)

In [None]:
# Show the (rows, columns) shape of the training frame.
print(df.shape)

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Shape of the training frame again; nothing has modified df since the previous check.
print(df.shape)

## Classifying Building ID

The text information is very worthwhile but not the easiest to address. Ideally, we'd parse the information in the displayed address, building description, and the building ID (which is stored as a string). We'll start with the easiest one first (building ID) because it's just one element. After we incorporate that, we can cross the other bridges as we come to them.

Building ID could be interesting to analyze, but as a string it's not in the right format for Sklearn, so let's first look at how it is distributed and then encode it as integers.

In [None]:
# First, let's get a sense of the most popular building IDs: how many listings does each one have?
# This chains two methods, separated onto different lines for clarity:
# value_counts() tallies the listings per building ID, nlargest(50) keeps the 50 biggest tallies.
print(
    df['building_id']
          .value_counts()
          .nlargest(50)
     )


In [None]:
# This is a little more than I care to look at! I'll plot it - a picture is worth at least 1000 words.

# Histogram of the per-building listing counts, i.e. the same 'building_id'
# tallies that the previous cell printed, so the picture matches the table.
ax = df['building_id'].value_counts().plot(kind='hist', bins=50)
# Label the figure so it can be read on its own; the trailing ';' hides the repr.
ax.set(
    title='Listings per building ID',
    xlabel='Number of listings sharing a building ID',
    ylabel='Number of building IDs',
);

Let's try the preprocessing tool! Note: this example commits a cardinal sin — it fits the label encoder on the full data set, before that data is split for training and validation. If we think this is a good approach, we'll have to move some things around and fix them in the future!

In [None]:
from sklearn import preprocessing

# LabelEncoder assigns an integer code to each distinct building ID value.
le = preprocessing.LabelEncoder()
# Learn the set of building IDs present in the training frame.
le.fit(df['building_id'])



We'll have to transform our data so that the system can use it - 

In [None]:
# Overwrite the building ID column with its integer codes.
# fit_transform re-fits the encoder on the column before encoding it.
df['building_id'] = le.fit_transform(df['building_id']) 

# Peek at the first few encoded values.
df['building_id'].head()

## Text analysis

In [None]:
##make models / stuff for each interest level? 
import re
import nltk

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Shared English Snowball stemmer; description_to_wordlist uses it to stem each word.
english_stemmer=nltk.stem.SnowballStemmer('english')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
## first we need to split the data and get the features so that we can train and validate this process


def _split_descriptions(level):
    """Split the descriptions of one interest level into train and validation parts.

    Parameters
    ----------
    level : str
        Value of the "interest_level" column to keep ("low", "medium" or "high").

    Returns
    -------
    list
        [X_train, X_val, y_train, y_val] as returned by train_test_split, where
        X is the "description" column and y is the "interest_level" column, both
        restricted to the rows whose interest level equals `level`.
        60% of those rows go to the validation part (test_size=0.60).
    """
    is_level = df["interest_level"] == level
    return train_test_split(
        df.loc[is_level, "description"],
        df.loc[is_level, "interest_level"],
        test_size=0.60,
    )


# Each split selects its own interest level, so the medium and high sets
# really contain medium- and high-interest listings.
X_train_low, X_val_low, y_train_low, y_val_low = _split_descriptions("low")
X_train_medium, X_val_medium, y_train_medium, y_val_medium = _split_descriptions("medium")
X_train_high, X_val_high, y_train_high, y_val_high = _split_descriptions("high")


In [None]:
def description_to_wordlist(description, remove_stopwords=True):
    """Convert a listing description into a list of stemmed, lower-case words.

    Parameters
    ----------
    description : str
        Raw description text.
    remove_stopwords : bool, default True
        When True, drop English stop words (nltk's "english" list) before stemming.

    Returns
    -------
    list of str
        The stemmed words, in their original order.
    """
    # 1. Replace every non-letter character with a space
    description_text = re.sub("[^a-zA-Z]", " ", description)

    # 2. Convert words to lower case and split them on whitespace
    words = description_text.lower().split()

    # 3. Optionally remove stop words (True by default); a set gives fast lookups
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    # 4. Stem each remaining word with the shared Snowball stemmer
    # 5. Return a list of words
    return [english_stemmer.stem(word) for word in words]

In [None]:
def _clean_descriptions(descriptions):
    """Return one space-joined string of stemmed words per description.

    `descriptions` is any iterable of description strings; each one is passed
    through description_to_wordlist and its words are re-joined with spaces.
    """
    return [
        " ".join(description_to_wordlist(description))
        for description in descriptions
    ]


# X_train_low / X_train_medium / X_train_high are already Series of description
# strings (the "description" column was selected before the split), so they are
# iterated directly rather than indexed by a column name.
description_low = _clean_descriptions(X_train_low)

description_med = _clean_descriptions(X_train_medium)

description_high = _clean_descriptions(X_train_high)


# naive feature engineering

In [None]:
# Size features: length of the photos / features values and a rough word count
# (splitting on single spaces, so repeated spaces add empty "words").
df["num_photos"] = df["photos"].map(len)
df["num_features"] = df["features"].map(len)
df["num_description_words"] = df["description"].map(lambda text: len(text.split(" ")))

# Parse the creation timestamp once, then pull out its calendar parts.
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day

# NOTE(review): despite their names these two columns hold rooms divided by
# price, not price divided by rooms - confirm whether that is intended.
df["price_per_bedroom"] = df["bedrooms"].div(df["price"])
df["price_per_bathroom"] = df["bathrooms"].div(df["price"])

In [None]:
# List the columns now present on df, including the engineered ones added above.
df.columns

In [None]:
# Columns fed to the models; every one of them must exist on any frame that is
# later indexed with num_feats.
num_feats = [
    "bathrooms",
    "bedrooms",
    "latitude",
    "longitude",
    "price",
    "num_photos",
    "num_features",
    "num_description_words",
    "created_year",
    "created_month",
    "created_day",
    "building_id",
    "price_per_bedroom",
    "price_per_bathroom",
]

# Feature matrix and target labels.
y = df["interest_level"]
X = df[num_feats]
X.head()

# train model

In [None]:
# Hold out 45% of the rows for validation. No random_state is passed, so the
# split (and every score below) changes from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.45)

In [None]:
# Random forest with 1500 trees; fit() returns the estimator, so it can be chained.
clf = RandomForestClassifier(n_estimators=1500).fit(X_train, y_train)

# Score the validation set by log loss on the predicted class probabilities.
y_val_pred = clf.predict_proba(X_val)
log_loss(y_val, y_val_pred)

# Train another model

Let's try a bagging approach!

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 2000 base estimators (the library default base estimator is used).
b1 = BaggingClassifier(n_estimators=2000).fit(X_train, y_train)

# Validation log loss on the predicted class probabilities.
y_val_pred = b1.predict_proba(X_val)
log_loss(y_val, y_val_pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with the library's default hyper-parameters.
gbc = GradientBoostingClassifier().fit(X_train, y_train)

# Validation log loss on the predicted class probabilities.
y_val_pred = gbc.predict_proba(X_val)
log_loss(y_val, y_val_pred)

In [None]:
from sklearn import svm

# SVC only provides predict_proba when it is built with probability=True
# (the default is False), so enable it explicitly.
# The model gets its own name so that `clf` keeps referring to the random
# forest, which is the model the "submission_rf.csv" cells rely on.
svc = svm.SVC(probability=True)
svc.fit(X_train, y_train)

# Validation log loss on the predicted class probabilities.
y_val_pred = svc.predict_proba(X_val)
log_loss(y_val, y_val_pred)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 3 closest training rows.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Validation log loss on the predicted class probabilities.
y_val_pred = neigh.predict_proba(X_val)
log_loss(y_val, y_val_pred)

# make prediction

In [None]:
# Load the test listings; the context manager closes the file once parsed.
# NOTE: this rebinds `df` (and `X`, `y` below) from the training data to the test data.
with open("../input/test.json", "r") as test_file:
    df = pd.read_json(test_file)
print(df.shape)

# Rebuild every engineered column listed in num_feats, using the same formulas
# as the training feature-engineering cell.
df["num_photos"] = df["photos"].apply(len)
df["num_features"] = df["features"].apply(len)
df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))
df["created"] = pd.to_datetime(df["created"])
df["created_year"] = df["created"].dt.year
df["created_month"] = df["created"].dt.month
df["created_day"] = df["created"].dt.day
# These two are part of num_feats; without them df[num_feats] raises KeyError.
df["price_per_bedroom"] = df["bedrooms"] / df["price"]
df["price_per_bathroom"] = df["bathrooms"] / df["price"]

# Encode building_id with the codes learned on the training data. A plain
# le.transform() would raise on building IDs that never appeared in training,
# so unseen IDs are mapped to -1 instead.
# Assumes `le` still holds the training building-ID strings - TODO confirm the
# encoding cell has been run exactly once.
building_codes = {building: code for code, building in enumerate(le.classes_)}
df["building_id"] = df["building_id"].map(building_codes).fillna(-1).astype(int)

X = df[num_feats]

# `clf` is whichever estimator was most recently bound to that name.
y = clf.predict_proba(X)

In [None]:
# Map each class label to its column position in the predict_proba output,
# which follows the order of clf.classes_.
labels2idx = dict(zip(clf.classes_, range(len(clf.classes_))))
labels2idx

In [None]:
# Start the submission from the listing ids, then add one probability column
# per class in the order high, medium, low.
sub = df[["listing_id"]].copy()
for label in ("high", "medium", "low"):
    column = labels2idx[label]
    sub[label] = y[:, column]

# Write without the index so the file holds only listing_id and the three classes.
sub.to_csv("submission_rf.csv", index=False)