In [19]:
#Creating classes for time series analysis. 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
class WeatherReader:

    """This class will provide access to weather data (temperature, humidity etc).  Data will be retrieved as a 
    CSV file, but will be provided as a dataframe"""
    
    def __init__(self, path, weathertype):
        'Get CSV file and transform into dataframe.  This assumes that only a .csv file will be given'
        'path = directory path where data can be found'
        'weathertype = choice of temp, humidity, or temp/humidity'
        self.path = path
        self.df_file = pd.read_csv(path, index_col='datetime', parse_dates=['datetime'])
        self.weathertype = weathertype

    def validations(self):
        'Pre-processing steps go here'
        
        
    def get_data(self, starttime, endtime):
        """Extract data from the weather dataframe using specific start and stop times.  Weathertype is temp,
        humidity, etc"""
        if self.weathertype == 'temp':
            data_set = self.df_file[starttime:endtime].temp
        elif self.weathertype == 'temp/humidity':
            data_set = self.df_file[starttime:endtime].temp.append(self.df_file[starttime:endtime].humidity*100) 
        elif self.weathertype == 'humidity':
            data_set = self.df_file[starttime:endtime].humidity*100
        return data_set
    
    
    def get_daily_data(self, date):
        pd_date = pd.to_datetime(date)
        'Returns 24 hours of weather data'
        data =  self.get_data(date, date + pd.DateOffset(hours=23)).values
        if len(data) != 24:
                print('This date does not have 24 hours: ', date)
                print('It has', len(data), 'hours')
                print(" ")
        if len(data) > 24:
            print("Removing extra hours")
            hours = date + pd.DateOffset(hours=23)
            print(hours)
        return data
    
    
    

In [31]:
wr = WeatherReader('/home/KOSUWeather.csv', 'temp')

In [34]:
dates = pd.date_range('2015-11-01', '2015-11-02', freq='D')
wr.get_daily_data(dates[0])
#wr.get_data('2015-11-01 00:00:00', '2015-11-06 00:00:00')

This date does not have 24 hours:  2015-11-01 00:00:00
It has 25 hours
 
Removing extra hours
2015-11-01 23:00:00


array([52.77, 52.61, 52.87, 52.68, 52.87, 52.75, 52.68, 53.19, 53.24,
       53.31, 54.92, 57.34, 60.41, 62.8 , 64.07, 64.73, 65.28, 64.42,
       62.61, 59.14, 55.82, 54.24, 53.64, 51.98, 50.73])

In [15]:
import pandas as pd
import numpy as np
import random

class DictionaryBuilder:
    """Builds a dictionary of centroids and date values using a k-means algorithm"""
    
    def __init__(self, data_source, startdate, enddate, num_clust, num_iters):
    
        self.data_source = data_source            #weather data and dates from WeatherReader class
        self.startdate = startdate                #start date for k-means processing
        self.enddate = enddate                    #end date for k-means processing
        self.num_clust = num_clust                #number of clusters wanted
        self.num_iters = num_iters                #number of interations wanted            
    
    
    def get_dates(self):
        dates = pd.date_range(self.startdate, self.enddate, freq='D')  
        np.random.shuffle(dates.values)
        
        tds = []
        kept_dates = []
        for d in dates:
             # get the temperature time series for this date
             ts = self.data_source.get_daily_data(d)
             if len(ts) == 24:
                 tds.append(ts)
                 kept_dates.append(d)
    

    def euclid_dist(ts1, ts2):
        euclid = np.sqrt(sum((ts1 - ts2)**2))
        return euclid


    def k_means(self, data, centroids):
        import random
    #     centroids = random.sample(data, num_clust)
        counter = 0
        for n in range(num_iter):
            counter += 1
            print(counter)
            assignments = {}
            for ind,i in enumerate(data):
                min_dist = float('inf')
                closest_clust = None
                for c_ind,j in enumerate(centroids):
                    cur_dist = euclid_dist(i,j)
                    if cur_dist < min_dist:
                        min_dist = cur_dist
                        closest_clust = c_ind
                if closest_clust in assignments:
                    assignments[closest_clust].append(ind)
                else:
                    assignments[closest_clust] = [ind]  
                        
        # recalculate centroids
        for key in assignments:
            clust_sum = 0
            for k in assignments[key]:
                clust_sum = clust_sum + data[k]
                print(centroids)
            centroids[key] = [m/len(assignments[key]) for m in clust_sum]
            
        return centroids, assignments

    
    def create_dictionary(self):
        init_clusts = random.sample(tds, num_clust)    #find data points for number of clusters
        centroids, assignments = k_means(tds, self.num_clust, self.num_iters, init_clusts)


In [8]:
ds = WeatherReader('/home/KOSUWeather.csv', 'temp')

In [16]:
db = DictionaryBuilder(ds, '2016-01-15', '2016-02-15', 10, 20)

In [11]:
db.startdate


'2016-01-15'

In [12]:
db.num_clust

10