In [1]:
import numpy as np
import pandas as pd

In [2]:
class DataManager:
    """Load the air-quality CSV files and shape them into regression examples.

    Training flow: ``read_training_data`` -> ``select_feature_to_mdfs`` ->
    ``chunk_examples``.  Testing flow: ``read_testing_data`` ->
    ``select_testing_feature``.
    """

    def __init__(self):
        self.feature = None  # sorted feature names found in the training file
        self.rdf = None  # raw training df
        self.data = None  # {month: {feature: 1-D array of that month's values}}
        self.X = None  # not assigned by any method of this class
        self.y = None  # not assigned by any method of this class
        self.tdata = None  # testing df as loaded by read_testing_data

    def read_training_data(self, file_name):
        """Read the training CSV and populate ``feature``, ``rdf`` and ``data``.

        The file is decoded as big5, 'NR' cells are treated as missing and every
        missing cell is replaced with 0.0.  The first three columns are renamed
        to date / location / feature and the location column is dropped.

        Parameters
        ----------
        file_name : path of the training CSV, passed straight to ``pd.read_csv``.
        """
        df = pd.read_csv(file_name, encoding='big5', na_values=['NR'])
        df = df.fillna(0.0)
        df = df.rename(columns={
            df.columns[0]: 'date',
            df.columns[1]: 'location',
            df.columns[2]: 'feature',
        })
        df['date'] = pd.to_datetime(df['date'])
        df = df.drop(columns=['location'])
        self.feature = sorted(df['feature'].unique())
        self.rdf = df
        self.rdf_to_data()

    def rdf_to_data(self):
        """Rebuild ``self.data`` from ``self.rdf``.

        Rows are grouped by calendar month and then by feature; the remaining
        value columns of each group are flattened row by row into one 1-D
        array, so the result is ``{month: {feature: array}}``.
        """
        rdf = self.rdf
        data = {}
        for month, mdf in rdf.groupby(rdf['date'].dt.month):
            data[month] = {
                feature: fdf.drop(columns=['date', 'feature']).values.flatten()
                for feature, fdf in mdf.groupby(mdf['feature'])
            }
        self.data = data

    def select_feature_to_mdfs(self, feature_list=None):
        """Build one DataFrame per month holding the requested features.

        Parameters
        ----------
        feature_list : feature names to keep, in the column order wanted.
            Defaults to every feature found in the training file.

        Returns
        -------
        dict mapping month 1..12 to a DataFrame with one column per feature.
        Months absent from ``self.data`` map to an empty frame.
        """
        if feature_list is None:
            feature_list = self.feature
        mdfs = {month: pd.DataFrame(columns=feature_list) for month in range(1, 13)}

        for month, fdata in self.data.items():
            for feature in feature_list:
                mdfs[month][feature] = fdata[feature]

        return mdfs

    def chunk_examples(self, mdfs, chunk_size, target='PM2.5'):
        """Slide a window over every monthly frame to build (X, y) examples.

        Windows are built month by month, so no example spans two months.

        Parameters
        ----------
        mdfs : dict of month -> DataFrame, as returned by
            ``select_feature_to_mdfs``.
        chunk_size : number of consecutive rows that make up one example.
        target : column whose value in the row right after the window is the
            label.  Defaults to 'PM2.5'.

        Returns
        -------
        X : array with one row per example.  Each row is the window flattened
            feature by feature (all rows of the first column, then the next),
            i.e. ``chunk_size * n_columns`` values.
        y : column vector of shape (n_examples, 1).

        Raises
        ------
        KeyError if ``target`` is not a column of a frame that is long enough
        to produce at least one example.
        """
        X = []
        y = []
        for month, mdf in mdfs.items():
            nrows = len(mdf)
            if nrows <= chunk_size:
                # Too short to yield a window plus a following label row.
                continue

            # Pull the arrays out once per month; rows are then addressed by
            # position for both the window and the label, so the result does
            # not depend on the frame's index labels.
            values = mdf.values
            target_values = mdf[target].values

            for i in range(nrows - chunk_size):
                X.append(values[i:i + chunk_size].T.flatten())
                y.append(target_values[i + chunk_size])

        X = np.array(X)
        y = np.array(y).reshape((-1, 1))
        return X, y

    def read_testing_data(self, file_name):
        """Read the header-less testing CSV into ``self.tdata``.

        Columns 0 and 1 are renamed to id / feature, 'NR' cells are treated as
        missing and every missing cell is replaced with 0.0.
        """
        df = pd.read_csv(file_name, header=None, na_values=['NR'])
        df = df.rename(columns={0: 'id', 1: 'feature'})
        df = df.fillna(0.0)
        self.tdata = df

    def select_testing_feature(self, feature_list=None):
        """Build one feature-column DataFrame per test id.

        Parameters
        ----------
        feature_list : feature names to keep, in the column order wanted.
            Defaults to every feature found in the training file.

        Returns
        -------
        dict mapping each id to a DataFrame whose columns are the requested
        features and whose rows are the remaining value columns of the test
        file.  Keys follow the sorted order produced by ``groupby``.
        """
        if feature_list is None:
            feature_list = self.feature

        iddfs = {}
        for i, df in self.tdata.groupby(self.tdata['id']):
            columns = df['feature']
            # Transpose so that features become columns, mirroring the
            # monthly training frames.
            df = df.drop(columns=['id', 'feature']).T
            df.columns = columns
            iddfs[i] = df[feature_list]
        return iddfs
            

In [3]:
def normalize_data(X):
    """Standardize each column of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : array of shape (n_examples, n_columns).

    Returns
    -------
    X_n : the standardized array.
    mu : per-column mean.
    sigma : per-column standard deviation used for the division.  Columns
        whose standard deviation is 0 report 1.0 instead, so reusing the
        returned ``sigma`` on new data never divides by zero.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # A constant column has sigma == 0; dividing by it would give NaN/inf.
    # Such a column is left centred at 0 rather than scaled.
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_n = (X - mu) / sigma
    return X_n, mu, sigma

In [4]:
def norm_eq(X, y):
    """Solve linear least squares with the normal equation.

    Computes ``theta = pinv(X^T X) X^T y``; the pseudo-inverse is used for
    the Gram matrix ``X^T X``.
    """
    gram = np.dot(X.T, X)
    gram_pinv_xt = np.dot(np.linalg.pinv(gram), X.T)
    return np.dot(gram_pinv_xt, y)

In [5]:
def predict(theta, instance):
    """Return the linear-model output, the dot product of ``instance`` and ``theta``."""
    return np.dot(instance, theta)

In [6]:
# Single DataManager instance shared by the training and testing cells below.
dm = DataManager()

In [7]:
# Load the training CSV; this also fills dm.feature, dm.rdf and dm.data.
dm.read_training_data('./train.csv')

In [8]:
# Per-month frames restricted to the two model features. The column order
# given here fixes the feature order inside each flattened example.
mdfs = dm.select_feature_to_mdfs(['PM2.5', 'PM10'])

In [9]:
# Each example is a 9-row window of both features; the label is the PM2.5
# value in the row immediately after the window.
X, y = dm.chunk_examples(mdfs, chunk_size=9)

In [10]:
# Standardize every column; mu and sigma are kept so the test windows can be
# scaled with the same statistics later.
X_n, mu, sigma = normalize_data(X)

In [11]:
# Prepend a column of ones (bias term) before solving the normal equation.
norm_theta = norm_eq(np.insert(X_n, obj=0, values=1, axis=1), y)

In [12]:
# Load the test CSV into dm.tdata.
dm.read_testing_data('./test.csv')

In [13]:
# Same features in the same order as the training call, so the flattened
# test rows line up with the training columns.
tX = dm.select_testing_feature(['PM2.5', 'PM10'])

In [14]:
# Predict one value per test id, reusing the training mu/sigma so the test
# windows are scaled exactly like the training examples.
results = []
for i, tdf in tX.items():
    # Flatten feature by feature, the same layout chunk_examples produces.
    instance_n = (tdf.T.values.flatten() - mu) / sigma
    # Prepend the bias term so the row lines up with norm_theta.
    instance_b = np.insert(instance_n.reshape((1, -1)), obj=0, values=1, axis=1)
    results.append((i, float(predict(norm_theta, instance_b)[0, 0])))
# dtype=object keeps ids as str and predictions as float; a plain np.array()
# over (str, float) tuples would coerce the whole table to strings.
results = np.array(results, dtype=object)

In [18]:
# Write the id/prediction table to reproduce.csv without the DataFrame index.
pd.DataFrame(results, columns=['id', 'value']).to_csv('reproduce.csv', index=False)

In [17]:
# NOTE(review): `r` is not defined in any cell shown here, so this cell relies
# on hidden kernel state. The output looks like the saved predictions table;
# confirm where `r` came from or replace it with an explicit read.
r

Unnamed: 0,id,value
0,id_0,35.39705691920548
1,id_1,13.717200901715366
2,id_10,27.51830842022988
3,id_100,23.54911400851201
4,id_101,51.14323288143199
5,id_102,25.490830295143553
6,id_103,36.42587945163847
7,id_104,29.97024579255351
8,id_105,24.710147116082467
9,id_106,18.116450975911278


In [88]:
# Inspect the raw PM2.5 rows of the training frame.
dm.rdf[dm.rdf.feature=='PM2.5']

Unnamed: 0,date,feature,0,1,2,3,4,5,6,7,...,14,15,16,17,18,19,20,21,22,23
9,2014-01-01,PM2.5,58.0,59.0,57.0,59.0,64.0,64.0,60.0,55.0,...,36.0,38.0,40.0,42.0,46.0,57.0,76.0,74.0,71.0,50.0
27,2014-01-02,PM2.5,41.0,45.0,48.0,43.0,19.0,9.0,10.0,14.0,...,48.0,44.0,38.0,34.0,40.0,54.0,65.0,68.0,75.0,76.0
45,2014-01-03,PM2.5,82.0,69.0,62.0,55.0,66.0,63.0,48.0,24.0,...,44.0,38.0,38.0,35.0,38.0,38.0,42.0,41.0,45.0,46.0
63,2014-01-04,PM2.5,53.0,55.0,51.0,45.0,43.0,44.0,43.0,39.0,...,73.0,71.0,65.0,65.0,70.0,82.0,91.0,98.0,104.0,105.0
81,2014-01-05,PM2.5,107.0,103.0,93.0,83.0,77.0,74.0,69.0,65.0,...,75.0,72.0,68.0,65.0,63.0,70.0,76.0,85.0,83.0,84.0
99,2014-01-06,PM2.5,75.0,70.0,68.0,67.0,65.0,60.0,58.0,65.0,...,90.0,98.0,86.0,65.0,44.0,43.0,43.0,52.0,63.0,73.0
117,2014-01-07,PM2.5,65.0,58.0,49.0,51.0,45.0,46.0,38.0,39.0,...,55.0,57.0,54.0,54.0,53.0,58.0,75.0,85.0,85.0,69.0
135,2014-01-08,PM2.5,56.0,51.0,54.0,52.0,54.0,53.0,60.0,65.0,...,33.0,36.0,35.0,45.0,45.0,42.0,44.0,41.0,40.0,28.0
153,2014-01-09,PM2.5,24.0,23.0,15.0,12.0,8.0,14.0,14.0,20.0,...,17.0,22.0,21.0,15.0,8.0,13.0,17.0,18.0,11.0,7.0
171,2014-01-10,PM2.5,3.0,2.0,4.0,8.0,14.0,15.0,7.0,4.0,...,32.0,30.0,24.0,16.0,14.0,22.0,31.0,29.0,21.0,16.0


In [82]:
# Sanity check: (number of examples, 9 rows x 2 features = 18 columns).
X.shape

(5652, 18)

In [48]:
# NOTE(review): neither `b` nor `c` is defined in any cell shown here; this
# scratch cell depends on hidden kernel state and will fail on a fresh kernel.
b.columns = c

In [49]:
# Display `b` after its columns were relabelled in the previous cell.
b

feature,AMB_TEMP,CH4,CO,NMHC,NO,NO2,NOx,O3,PM10,PM2.5,RAINFALL,RH,SO2,THC,WD_HR,WIND_DIREC,WIND_SPEED,WS_HR
2,35.0,1.6,0.17,0.02,1.6,4.4,6.0,61.0,42.0,39.0,0.0,50.0,3.0,1.7,269.0,260.0,3.0,2.3
3,35.0,1.7,0.15,0.01,1.7,4.7,6.5,60.0,42.0,43.0,0.0,51.0,2.9,1.7,261.0,265.0,3.5,2.9
4,35.0,1.7,0.14,0.01,2.0,4.6,6.6,54.0,35.0,29.0,0.0,51.0,2.4,1.7,269.0,276.0,3.2,3.3
5,34.0,1.6,0.14,0.01,2.1,4.5,6.6,53.0,37.0,23.0,0.0,54.0,2.4,1.6,267.0,267.0,3.3,3.6
6,33.0,1.6,0.17,0.03,1.9,5.6,7.5,51.0,34.0,25.0,0.0,58.0,2.1,1.6,270.0,278.0,3.1,3.0
7,31.0,1.6,0.22,0.04,1.8,7.6,9.5,51.0,41.0,27.0,0.0,63.0,2.6,1.7,278.0,278.0,2.0,2.0
8,30.0,1.7,0.3,0.08,1.8,11.0,13.0,46.0,41.0,32.0,0.0,66.0,2.9,1.7,276.0,278.0,1.9,2.0
9,29.0,1.7,0.37,0.13,1.6,12.0,14.0,45.0,49.0,26.0,0.0,67.0,2.8,1.8,281.0,283.0,1.6,1.8
10,29.0,1.7,0.42,0.11,1.7,15.0,17.0,42.0,51.0,40.0,0.0,65.0,2.9,1.8,243.0,220.0,1.2,0.9


In [60]:
# Scratch: a 1x1 array used below to compare ways of extracting its value.
a = np.array([[7.7]])

In [73]:
# reshape gives a length-1 array, not a bare scalar.
a.reshape((1,))

array([7.7])

In [75]:
# Indexing with [0, 0] gives the bare scalar; the prediction loop uses this form.
a[0, 0]

7.7

In [20]:
# Spot-check one flattened example: the first 9 values are the PM2.5 window,
# the last 9 the PM10 window.
X[147, :]

array([51., 45., 46., 38., 39., 36., 47., 55., 60., 73., 77., 62., 63.,
       70., 79., 93., 87., 96.])

In [98]:
# Membership check only: confirms each hand-copied value occurs somewhere in
# example 100, not that it sits at the matching position.
for i in [ 77.,  74.,  69.,  65.,  66.,  75.,  79.,  91.,  92., 116., 112.,97., 107., 112., 127., 133., 122., 122.]:
    print(i in X[100, :])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [18]:
# Inspect the frame stored under month key 1.
mdfs[1]

Unnamed: 0,PM2.5,PM10
0,58.0,91.0
1,59.0,104.0
2,57.0,87.0
3,59.0,91.0
4,64.0,83.0
5,64.0,76.0
6,60.0,73.0
7,55.0,66.0
8,54.0,88.0
9,61.0,97.0


In [21]:
# Inspect the label column vector.
y

array([[61.],
       [67.],
       [72.],
       ...,
       [51.],
       [48.],
       [49.]])

In [22]:
# Fitted weights: the bias first, then one weight per flattened column
# (19 = 1 + 18).
norm_theta

array([[28.39755839],
       [-4.45330872],
       [ 2.12965279],
       [-3.32096469],
       [ 2.43711245],
       [ 9.89636874],
       [-1.27055567],
       [-0.03792362],
       [-3.23876269],
       [25.09174963],
       [ 2.41332511],
       [-0.51619257],
       [-1.28303912],
       [ 0.88566275],
       [-1.82602789],
       [-0.25125377],
       [-1.35480615],
       [ 3.62710105],
       [ 3.06946685]])

In [None]:
# Unexecuted scratch cell: a pasted list literal that appears to be a copy of
# the norm_theta values displayed above. It is not assigned to any name.
[[28.39755839],
       [-4.45330872],
       [ 2.12965279],
       [-3.32096469],
       [ 2.43711245],
       [ 9.89636874],
       [-1.27055567],
       [-0.03792362],
       [-3.23876269],
       [25.09174963],
       [ 2.41332511],
       [-0.51619257],
       [-1.28303912],
       [ 0.88566275],
       [-1.82602789],
       [-0.25125377],
       [-1.35480615],
       [ 3.62710105],
       [ 3.06946685]]