In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import date, datetime
from datetime import timedelta
from pandas.plotting import register_matplotlib_converters
from sklearn.manifold import TSNE

# Register pandas' date/time converters with matplotlib so datetime data can be plotted.
register_matplotlib_converters()
# Switch matplotlib figures to seaborn's default styling.
sns.set()

In [2]:
# Load the cleaned measurements into a DataFrame.
# Alternative input kept for reference: '../Data/Final_data.cleaned.csv'
file_name = 'Final_data_cleaned_missing_houses.csv'

# Use the "ID-nummer" column (renamed to "Index") as the row index.
df = (
    pd.read_csv(file_name)
    .rename(columns={"ID-nummer": "Index"})
    .set_index('Index')
)
# Parse the index labels into timestamps so the .hour accessor below is available.
df.index = pd.to_datetime(df.index)

# Engineered feature: hour of the day, taken from the timestamp index.
df['Hours'] = df.index.hour

# Engineered feature: Season, encoded as an integer code 1-4.
Y = 2000  # leap year, so that Feb 29 can be mapped into it as well
seasons = [
    (1, (date(Y, 1, 1), date(Y, 3, 20))),
    (2, (date(Y, 3, 21), date(Y, 6, 20))),
    (3, (date(Y, 6, 21), date(Y, 9, 22))),
    (4, (date(Y, 9, 23), date(Y, 12, 20))),
    (1, (date(Y, 12, 21), date(Y, 12, 31))),
]

def get_season(now):
    """Return the season code whose date range in ``seasons`` contains ``now``.

    ``now`` may be a ``date`` or a ``datetime``. Only month and day matter:
    the value is moved into year ``Y`` before it is compared with the ranges.
    """
    day = now.date() if isinstance(now, datetime) else now
    day = day.replace(year=Y)
    matching_codes = (code for code, (first, last) in seasons
                      if first <= day <= last)
    return next(matching_codes)

# Season code for every timestamp in the index.
index_timestamps = df.index.to_series()
df['Season'] = index_timestamps.apply(lambda ts: get_season(ts.to_pydatetime()))

# Engineered feature: part of the day, encoded as an integer code 1-4.
def partofday(hours):
    """Map an hour value to a six-hour block of the day.

    Returns 1 for [0, 6), 2 for [6, 12), 3 for [12, 18) and 4 for every
    other value.
    """
    if 0 <= hours < 6:
        return 1
    if 6 <= hours < 12:
        return 2
    if 12 <= hours < 18:
        return 3
    return 4

# Six-hour block code for every row, derived from the Hours column.
df['Part_of_the_day'] = df['Hours'].apply(partofday)

# Engineered feature: day of the week
def get_day_of_week(date):
    """Return the weekday number of ``date`` with Monday=1 ... Sunday=7.

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Calendar value to inspect. Note that this parameter name hides the
        ``date`` class imported at the top of the notebook inside this function.

    Returns
    -------
    int or float
        ``date.weekday() + 1`` for date/datetime inputs; ``np.nan`` for any
        other input (for example ``None`` or a string).
    """
    # Local alias, because the parameter shadows the imported ``date`` class.
    import datetime as _dt

    # Check the argument itself (not the datetime class); otherwise the NaN
    # fallback below can never be reached. datetime.datetime is a subclass of
    # datetime.date, so this accepts both.
    if isinstance(date, _dt.date):
        return date.weekday() + 1
    # np.nan is the spelling available in every NumPy release (np.NaN is gone in 2.0).
    return np.nan

# Weekday number for every timestamp in the index.
df['Day_of_the_week'] = df.index.to_series().apply(
    lambda ts: get_day_of_week(ts.to_pydatetime())
)

# Preview the first rows, then the summary statistics.
display(df.head(), df.describe())

Unnamed: 0_level_0,H01_prod,H02_prod,H03_prod,H04_prod,H06_prod,H07_prod,H08_prod,H09_prod,H11_prod,H13_prod,...,H27_cons,H28_cons,H29_cons,H31_cons,H32_cons,H33_cons,Hours,Season,Part_of_the_day,Day_of_the_week
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-09-12 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.096,0.025,0.024,0.299,0.027,0.022,0,3,1,2
2017-09-12 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.152,0.018,0.028,0.325,0.021,0.042,0,3,1,2
2017-09-12 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.265,0.022,0.015,0.341,0.029,0.035,0,3,1,2
2017-09-12 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.062,0.019,0.031,0.35,0.029,0.038,1,3,1,2
2017-09-12 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.219,0.021,0.279,0.397,0.022,0.026,1,3,1,2


Unnamed: 0,H01_prod,H02_prod,H03_prod,H04_prod,H06_prod,H07_prod,H08_prod,H09_prod,H11_prod,H13_prod,...,H27_cons,H28_cons,H29_cons,H31_cons,H32_cons,H33_cons,Hours,Season,Part_of_the_day,Day_of_the_week
count,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,...,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0,54745.0
mean,0.095002,0.063986,0.044213,0.057322,0.058406,0.081739,0.037795,0.085831,0.035831,0.055902,...,0.097359,0.128774,0.122767,0.091781,0.123045,0.064269,11.460024,2.494237,2.493598,4.013097
std,0.189129,0.143698,0.096173,0.119551,0.12583,0.158919,0.086679,0.163675,0.084409,0.122652,...,0.117275,0.394303,0.269185,0.123843,0.340717,0.105511,6.926316,1.189704,1.11854,1.995417
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.004,5.0,1.0,1.0,2.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.042,0.014,0.016,0.043,0.025,0.031,11.0,2.0,2.0,4.0
75%,0.088,0.036,0.024,0.043,0.041,0.08,0.017,0.09,0.014,0.037,...,0.149,0.023,0.041,0.115,0.049,0.048,17.0,4.0,3.0,6.0
max,0.911,0.77,0.503,0.64,0.768,0.787,0.467,0.805,0.486,0.826,...,1.129,2.509,2.24,1.163,2.501,1.093,23.0,4.0,4.0,7.0


## TSNE

In [3]:
# (rows, columns) of the feature table, including the four engineered time columns.
df.shape

(54745, 60)

In [4]:
# Two-dimensional t-SNE embedding of every row of df; all columns are used as
# features, the engineered time columns included.
# NOTE(review): the time columns (e.g. Hours, 0-23) span a much wider range
# than the measurement columns — consider scaling the features; confirm intent.
tsne_model = TSNE(
    n_components=2,
    n_iter=500,
    random_state=32,
    verbose=1,
)
tsne_tfidf = tsne_model.fit_transform(df)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 54745 samples in 0.204s...
[t-SNE] Computed neighbors for 54745 samples in 13.077s...
[t-SNE] Computed conditional probabilities for sample 1000 / 54745
[t-SNE] Computed conditional probabilities for sample 2000 / 54745
[t-SNE] Computed conditional probabilities for sample 3000 / 54745
[t-SNE] Computed conditional probabilities for sample 4000 / 54745
[t-SNE] Computed conditional probabilities for sample 5000 / 54745
[t-SNE] Computed conditional probabilities for sample 6000 / 54745
[t-SNE] Computed conditional probabilities for sample 7000 / 54745
[t-SNE] Computed conditional probabilities for sample 8000 / 54745
[t-SNE] Computed conditional probabilities for sample 9000 / 54745
[t-SNE] Computed conditional probabilities for sample 10000 / 54745
[t-SNE] Computed conditional probabilities for sample 11000 / 54745
[t-SNE] Computed conditional probabilities for sample 12000 / 54745
[t-SNE] Computed conditional probabilities for sa

In [5]:
# One 2-D coordinate pair per input row is expected here.
tsne_tfidf.shape

(54745, 2)

In [6]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook, reset_output
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.io import save, output_file

In [7]:
# Put the t-SNE coordinates and the engineered time columns of df side by side.
# `.values` drops df's datetime index so rows are aligned by position.
tsne_tfidf_df = pd.DataFrame(tsne_tfidf, columns=["x", "y"]).assign(
    hours=df["Hours"].values,
    season=df["Season"].values,
    part_of_the_day=df["Part_of_the_day"].values,
    day_of_the_week=df["Day_of_the_week"].values,
)

In [8]:
# Peek at the first rows of the embedding table.
tsne_tfidf_df.head()

Unnamed: 0,x,y,hours,season,part_of_the_day,day_of_the_week
0,22.988554,12.728756,0,3,1,2
1,22.98844,12.728777,0,3,1,2
2,22.988676,12.728676,0,3,1,2
3,-18.768335,22.270998,1,3,1,2
4,-18.768057,22.271063,1,3,1,2


In [9]:
# Render Bokeh output inline in the notebook.
output_notebook()

# Interactive scatter of the t-SNE embedding computed from df.
# NOTE(review): plot_width/plot_height and the "previewsave" tool look like
# legacy Bokeh spellings (newer releases use width/height and "save") —
# confirm against the installed Bokeh version before changing them.
plot_tfidf = bp.figure(plot_width = 700, plot_height = 600,
                       title = "t-SNE embedding of the production/consumption data",
                       tools = "pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                       x_axis_type = None, y_axis_type = None, min_border = 1)

# The DataFrame is handed over directly as the data source, so its columns
# ("x", "y", "hours", "season", ...) can be referenced by name.
plot_tfidf.scatter(x = "x", y = "y",
                   source = tsne_tfidf_df,
                   alpha = 0.7)

# Show the time features of the sample under the cursor.
hover = plot_tfidf.select(dict(type = HoverTool))
hover.tooltips = {"hours": "@hours", "season": "@season"}

show(plot_tfidf)

## K-Means

In [10]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch k-means with 50 clusters; the cluster count was picked without
# tuning. The plotting cell indexes a 50-entry colormap by cluster id, so
# keep n_clusters within the number of colours defined there.
# random_state is fixed (same seed as the t-SNE model) so that the cluster
# assignments are reproducible after a kernel restart.
kmeans_model = MiniBatchKMeans(n_clusters = 50,
                               init = "k-means++",
                               n_init = 1,
                               init_size = 1000,
                               batch_size = 1000,
                               verbose = 0,
                               max_iter = 1000,
                               random_state = 32)

In [11]:
# Fit the clustering on the full feature table.
kmeans = kmeans_model.fit(df)

# Distance of every sample to each cluster centre, and the hard cluster
# assignment of every sample.
kmeans_distances = kmeans.transform(df)
kmeans_clusters = kmeans.predict(df)

In [12]:
# Score of the fitted model on df. scikit-learn reports the negated k-means
# objective here, so the value is negative and closer to 0 is better.
kmeans.score(df)

-152908.81955649864

In [13]:
# Re-fit the same t-SNE model, this time on the sample-to-centre distance matrix.
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 54745 samples in 0.092s...
[t-SNE] Computed neighbors for 54745 samples in 4.648s...
[t-SNE] Computed conditional probabilities for sample 1000 / 54745
[t-SNE] Computed conditional probabilities for sample 2000 / 54745
[t-SNE] Computed conditional probabilities for sample 3000 / 54745
[t-SNE] Computed conditional probabilities for sample 4000 / 54745
[t-SNE] Computed conditional probabilities for sample 5000 / 54745
[t-SNE] Computed conditional probabilities for sample 6000 / 54745
[t-SNE] Computed conditional probabilities for sample 7000 / 54745
[t-SNE] Computed conditional probabilities for sample 8000 / 54745
[t-SNE] Computed conditional probabilities for sample 9000 / 54745
[t-SNE] Computed conditional probabilities for sample 10000 / 54745
[t-SNE] Computed conditional probabilities for sample 11000 / 54745
[t-SNE] Computed conditional probabilities for sample 12000 / 54745
[t-SNE] Computed conditional probabilities for sam

In [14]:
# Put the t-SNE coordinates of the k-means distance space next to the cluster
# labels and the engineered time columns of df (aligned by position).
tsne_kmeans_df = pd.DataFrame(tsne_kmeans, columns=["x", "y"]).assign(
    cluster=kmeans_clusters,
    hours=df["Hours"].values,
    season=df["Season"].values,
    part_of_the_day=df["Part_of_the_day"].values,
    day_of_the_week=df["Day_of_the_week"].values,
)

In [15]:
# Lookup table of 50 hex colours; it is indexed with the integer cluster
# labels when plotting, so it needs at least as many entries as clusters.
colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5", "#e3be38", 
                     "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", "#5e9981",
                     "#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b", "#bdd8a3", "#6e3326", "#cfbdce", 
                     "#d07d3c", "#52697d", "#194196", "#d27c88", "#36422b", "#b68f79", "#00ffff", "#33ff33",
                     "#ffff99", "#99ff33", "#ff6666", "#666600", "#99004c", "#808080", "#a80a0a", "#a4924c",
                     "#4a8e92", "#92734a", "#7d4097", "#4b4097", "#c0c0c0", "#409794", "#1a709b", "#a7dcf6",
                     "#b1a7f6", "#eea7f6"])

In [16]:
# Interactive scatter of the t-SNE projection of the k-means distance space,
# with every point coloured by its cluster.
# NOTE(review): plot_width/plot_height and the "previewsave" tool look like
# legacy Bokeh spellings — confirm against the installed Bokeh version.
plot_kmeans = bp.figure(plot_width = 700, plot_height = 600,
                        title = "k-means clusters of the production/consumption data (t-SNE projection)",
                        tools = "pan, wheel_zoom, box_zoom, reset, hover, previewsave",
                        x_axis_type = None, y_axis_type = None, min_border = 1)

# Every column referenced by a glyph property or by a hover "@field" has to
# exist in the data source, so cluster/hours/season are included alongside
# the coordinates and the per-point colour.
source = ColumnDataSource(data = dict(x = tsne_kmeans_df["x"], y = tsne_kmeans_df["y"],
                                      color = colormap[kmeans_clusters],
                                      cluster = tsne_kmeans_df["cluster"],
                                      hours = tsne_kmeans_df["hours"],
                                      season = tsne_kmeans_df["season"]))

plot_kmeans.scatter(x = "x", y = "y", color = "color", source = source)

# Show the cluster id and the time features of the sample under the cursor.
hover = plot_kmeans.select(dict(type = HoverTool))
hover.tooltips = {"cluster": "@cluster", "hours": "@hours", "season": "@season"}
show(plot_kmeans)