In [1]:
import numpy as np
import pandas as pd

In [2]:
class DataManager:
    def __init__(self):
        self.feature = None
        self.rdf = None # raw df
        self.data = None # dictionary of processed dfs data
        self.X = None
        self.y = None
        self.tdata = None
        
    def read_training_data(self, file_name):
        df = pd.read_csv(file_name, encoding='big5', na_values=['NR'])
        df = df.fillna(0.0)
        df = df.rename(columns={
            df.columns[0]: 'date',
            df.columns[1]: 'location',
            df.columns[2]: 'feature'
        })
        df.date = pd.to_datetime(df.date)
        df = df.drop(columns=['location'])
        self.feature = sorted(df.feature.unique())
        self.rdf = df
        self.rdf_to_data()
        return 
    
    def rdf_to_data(self):
        rdf = self.rdf
        data = {month: {feature: df.drop(columns=['date', 'feature']).values.flatten() for feature, df in mdf.groupby(mdf.feature)} for month, mdf in rdf.groupby(rdf.date.dt.month)}
        self.data = data
        return 
    
    def select_feature_to_mdfs(self, feature_list=None):
        if feature_list is None:
            feature_list = self.feature
#         mdfs = {month: pd.DataFrame(columns=feature_list) for month in range(1, 13)}
        mdfs = {month: pd.DataFrame(columns=sorted(feature_list)) for month in range(1, 13)}

        for month, fdata in self.data.items():
            for feature in feature_list:
                mdfs[month][feature] = fdata[feature]
            
        return mdfs
    
    
    def chunk_examples(self, mdfs, chunk_size):
        X = []
        y = []
        for month, mdf in mdfs.items():
            nrows, ncols = mdf.shape

            for i in range(nrows-chunk_size):
#                 X.append(mdf.iloc[i:i+chunk_size].T.values.flatten())
                X.append(mdf.iloc[i:i+chunk_size].values.flatten())
                y.append(mdf.loc[i+chunk_size, 'PM2.5'])
                
        X = np.array(X)
        y = np.array(y).reshape((-1, 1))
        return X, y
    
    def read_testing_data(self, file_name):
        df = pd.read_csv(file_name, header=None, na_values=['NR'])
        df = df.rename(columns={0: 'id', 1: 'feature'})
        df = df.fillna(0.0)
        self.tdata = df
        return
    
    def select_testing_feature(self, feature_list=None):
        if feature_list is None:
            feature_list = self.feature
            
        iddfs = {i: df for i, df in self.tdata.groupby(self.tdata.id)}
        
        for i, df in iddfs.items():
            columns = df.feature
            df = df.drop(columns=['id', 'feature']).T
            df.columns = columns
            df = df[sorted(feature_list)]
#             df = df[feature_list]
            iddfs[i] = df
        return iddfs
            

In [3]:
def normalize_data(X):
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    X_n = (X - mu) / sigma
    return X_n, mu, sigma

In [4]:
def norm_eq(X, y):
    theta = np.dot(np.dot(np.linalg.pinv(np.dot(X.T, X)), X.T), y)
    return theta

In [5]:
def predict(theta, instance):
    h_theta = np.dot(instance, theta)
    return h_theta

In [6]:
dm = DataManager()

In [7]:
dm.read_training_data('./train.csv')

In [8]:
mdfs = dm.select_feature_to_mdfs(['PM2.5', 'PM10'])

In [9]:
X, y = dm.chunk_examples(mdfs, chunk_size=9)

In [10]:
X_n, mu, sigma = normalize_data(X)

In [11]:
norm_theta = norm_eq(np.insert(X_n, obj=0, values=1, axis=1), y)

In [12]:
dm.read_testing_data('./test.csv')

In [13]:
tX = dm.select_testing_feature(['PM2.5', 'PM10'])

In [14]:
results = []
for i, tdf in tX.items():
    instance_n = (tdf.values.flatten() - mu) / sigma
#     instance_n = (tdf.T.values.flatten() - mu) / sigma

#     print(tdf.T.values.flatten())
    results.append((i, predict(norm_theta, np.insert(instance_n.reshape((1, -1)), obj=0, values=1, axis=1))[0, 0]))
results = np.array(results)

In [16]:
pd.DataFrame(results, columns=['id', 'value'])

Unnamed: 0,id,value
0,id_0,35.397056919205596
1,id_1,13.717200901715609
2,id_10,27.518308420229843
3,id_100,23.549114008512184
4,id_101,51.143232881431636
5,id_102,25.490830295143706
6,id_103,36.42587945163841
7,id_104,29.97024579255344
8,id_105,24.710147116082457
9,id_106,18.1164509759114


In [16]:
mdfs[1]

Unnamed: 0,PM10,PM2.5
0,91.0,58.0
1,104.0,59.0
2,87.0,57.0
3,91.0,59.0
4,83.0,64.0
5,76.0,64.0
6,73.0,60.0
7,66.0,55.0
8,88.0,54.0
9,97.0,61.0


In [17]:
dm.tdata

Unnamed: 0,id,feature,2,3,4,5,6,7,8,9,10
0,id_0,AMB_TEMP,35.00,35.00,35.00,34.00,33.00,31.00,30.00,29.00,29.00
1,id_0,CH4,1.60,1.70,1.70,1.60,1.60,1.60,1.70,1.70,1.70
2,id_0,CO,0.17,0.15,0.14,0.14,0.17,0.22,0.30,0.37,0.42
3,id_0,NMHC,0.02,0.01,0.01,0.01,0.03,0.04,0.08,0.13,0.11
4,id_0,NO,1.60,1.70,2.00,2.10,1.90,1.80,1.80,1.60,1.70
5,id_0,NO2,4.40,4.70,4.60,4.50,5.60,7.60,11.00,12.00,15.00
6,id_0,NOx,6.00,6.50,6.60,6.60,7.50,9.50,13.00,14.00,17.00
7,id_0,O3,61.00,60.00,54.00,53.00,51.00,51.00,46.00,45.00,42.00
8,id_0,PM10,42.00,42.00,35.00,37.00,34.00,41.00,41.00,49.00,51.00
9,id_0,PM2.5,39.00,43.00,29.00,23.00,25.00,27.00,32.00,26.00,40.00


In [18]:
iddfs = dm.select_testing_feature(['PM2.5', 'PM10'])

In [20]:
iddfs['id_0']

feature,PM10,PM2.5
2,42.0,39.0
3,42.0,43.0
4,35.0,29.0
5,37.0,23.0
6,34.0,25.0
7,41.0,27.0
8,41.0,32.0
9,49.0,26.0
10,51.0,40.0


In [22]:
mdfs[1]

Unnamed: 0,PM10,PM2.5
0,91.0,58.0
1,104.0,59.0
2,87.0,57.0
3,91.0,59.0
4,83.0,64.0
5,76.0,64.0
6,73.0,60.0
7,66.0,55.0
8,88.0,54.0
9,97.0,61.0


In [60]:
a = np.array([[7.7]])

In [73]:
a.reshape((1,))

array([7.7])

In [75]:
a[0, 0]

7.7

In [20]:
X[147, :]

array([51., 45., 46., 38., 39., 36., 47., 55., 60., 73., 77., 62., 63.,
       70., 79., 93., 87., 96.])

In [98]:
for i in [ 77.,  74.,  69.,  65.,  66.,  75.,  79.,  91.,  92., 116., 112.,97., 107., 112., 127., 133., 122., 122.]:
    print(i in X[100, :])

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [18]:
mdfs[1]

Unnamed: 0,PM2.5,PM10
0,58.0,91.0
1,59.0,104.0
2,57.0,87.0
3,59.0,91.0
4,64.0,83.0
5,64.0,76.0
6,60.0,73.0
7,55.0,66.0
8,54.0,88.0
9,61.0,97.0


In [21]:
y

array([[61.],
       [67.],
       [72.],
       ...,
       [51.],
       [48.],
       [49.]])

In [22]:
norm_theta

array([[28.39755839],
       [-4.45330872],
       [ 2.12965279],
       [-3.32096469],
       [ 2.43711245],
       [ 9.89636874],
       [-1.27055567],
       [-0.03792362],
       [-3.23876269],
       [25.09174963],
       [ 2.41332511],
       [-0.51619257],
       [-1.28303912],
       [ 0.88566275],
       [-1.82602789],
       [-0.25125377],
       [-1.35480615],
       [ 3.62710105],
       [ 3.06946685]])

In [18]:
test_theta = np.array([[28.39755839],
       [-4.45330872],
       [ 2.12965279],
       [-3.32096469],
       [ 2.43711245],
       [ 9.89636874],
       [-1.27055567],
       [-0.03792362],
       [-3.23876269],
       [25.09174963],
       [ 2.41332511],
       [-0.51619257],
       [-1.28303912],
       [ 0.88566275],
       [-1.82602789],
       [-0.25125377],
       [-1.35480615],
       [ 3.62710105],
       [ 3.06946685]]).flatten()

In [21]:
norm_theta

array([[28.39755839],
       [ 2.41332511],
       [-4.45330872],
       [-0.51619257],
       [ 2.12965279],
       [-1.28303912],
       [-3.32096469],
       [ 0.88566275],
       [ 2.43711245],
       [-1.82602789],
       [ 9.89636874],
       [-0.25125377],
       [-1.27055567],
       [-1.35480615],
       [-0.03792362],
       [ 3.62710105],
       [-3.23876269],
       [ 3.06946685],
       [25.09174963]])

In [20]:
for i in test_theta:
    print(i in norm_theta)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
