In [1]:
import numpy as np
import pandas as pd

In [2]:
class DataManager:
    """Load the training CSV and reshape it step by step into training examples.

    Intended call order: ``read_training_data`` -> ``rdf_to_data`` ->
    ``select_feature_to_mdfs`` -> ``chunk_examples``.
    """

    def __init__(self):
        self.rdf = None  # raw df
        self.data = None  # {month: {feature: flat array of that feature's values}}
        self.X = None  # example matrix produced by chunk_examples
        self.y = None  # target column vector produced by chunk_examples

    def read_training_data(self, file_name):
        """Read the training CSV into ``self.rdf``.

        The file is decoded as big5; 'NR' cells and other missing values become 0.0.
        The first three columns are renamed to date/location/feature, ``date`` is
        parsed to datetimes and ``location`` is dropped.
        """
        df = pd.read_csv(file_name, encoding='big5', na_values=['NR'])
        df = df.fillna(0.0)
        df = df.rename(columns={
            df.columns[0]: 'date',
            df.columns[1]: 'location',
            df.columns[2]: 'feature'
        })
        df['date'] = pd.to_datetime(df['date'])
        self.rdf = df.drop(columns=['location'])

    def rdf_to_data(self):
        """Build ``self.data`` as ``{month: {feature: flat array}}`` from ``self.rdf``.

        For every (month, feature) pair the non-key columns of the matching rows are
        flattened row by row into one 1-D array.

        Raises:
            ValueError: if ``read_training_data`` has not populated ``self.rdf``.
        """
        if self.rdf is None:
            raise ValueError('read_training_data() must be called before rdf_to_data()')
        rdf = self.rdf
        data = {}
        for month, mdf in rdf.groupby(rdf['date'].dt.month):
            data[month] = {
                feature: fdf.drop(columns=['date', 'feature']).values.flatten()
                for feature, fdf in mdf.groupby(mdf['feature'])
            }
        self.data = data

    def select_feature_to_mdfs(self, feature_list):
        """Return ``{month: DataFrame}`` holding only the requested features.

        Columns are ordered by ``sorted(feature_list)``. Every month 1..12 is present
        as a key; months without data map to an empty frame with the same columns.

        Raises:
            ValueError: if ``rdf_to_data`` has not populated ``self.data``.
            KeyError: if a requested feature is missing for some month.
        """
        if self.data is None:
            raise ValueError('rdf_to_data() must be called before select_feature_to_mdfs()')
        columns = sorted(feature_list)
        mdfs = {month: pd.DataFrame(columns=columns) for month in range(1, 13)}
        for month, fdata in self.data.items():
            # Build each frame in one go instead of filling an empty frame column by column.
            mdfs[month] = pd.DataFrame({feature: fdata[feature] for feature in columns})
        return mdfs

    def chunk_examples(self, mdfs, chunk_size, target='PM2.5'):
        """Slice every month frame into sliding-window examples.

        Each example is ``chunk_size`` consecutive rows flattened row by row; its
        label is the ``target`` value of the row right after the window. Windows
        never span two months because each frame is processed separately.

        Args:
            mdfs: mapping of month -> DataFrame, as returned by select_feature_to_mdfs.
            chunk_size: number of consecutive rows per example (must be positive).
            target: column whose next value is predicted.

        Returns:
            ``(X, y)`` where ``y`` has shape ``(n_examples, 1)``; both are also stored
            on ``self.X`` and ``self.y``.

        Raises:
            ValueError: if ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError('chunk_size must be a positive integer')
        examples = []
        targets = []
        for month, mdf in mdfs.items():
            values = mdf.values
            target_values = mdf[target].values
            # Frames with no more than chunk_size rows (e.g. empty months) yield nothing.
            for start in range(len(mdf) - chunk_size):
                stop = start + chunk_size
                examples.append(values[start:stop].flatten())
                targets.append(target_values[stop])
        self.X = np.array(examples)
        self.y = np.array(targets).reshape((-1, 1))
        return self.X, self.y

In [2]:
def read_training_data(file_name, encoding='big5'):
    """Read the training CSV, treating 'NR' as missing, and fill every gap with 0."""
    raw = pd.read_csv(file_name, na_values=['NR'], encoding=encoding)
    return raw.fillna(0)

In [3]:
def preprocess_training_data(df):
    """Name the three leading columns, parse the dates and drop the location.

    The input frame is left untouched; a new frame is returned whose first
    columns are ``date`` (as datetimes) and ``feature``.
    """
    new_names = {
        df.columns[0]: 'date',
        df.columns[1]: 'location',
        df.columns[2]: 'feature'
    }
    renamed = df.rename(columns=new_names)
    renamed['date'] = pd.to_datetime(renamed['date'])
    return renamed.drop(columns=['location'])
    

In [53]:
def split_df_by_month(df):
    """Split ``df`` into ``(month, frame)`` pairs, ordered by month.

    Each returned frame has the ``date`` column removed and is indexed by the
    month number. The caller's frame is not modified.

    Grouping uses the index level rather than a one-element list, so the keys are
    plain month numbers on every pandas version (a one-element list grouper yields
    1-tuples on pandas >= 2.0).
    """
    by_month = df.drop(columns=['date'])
    # Re-index the copy, not the caller's frame.
    by_month.index = df['date'].dt.month
    return list(by_month.groupby(level=0))

In [54]:
def split_df_by_feature(df):
    """Split ``df`` into ``(feature, frame)`` pairs, ordered by feature name.

    Each returned frame has the ``feature`` column removed and is indexed by the
    feature name. The caller's frame is not modified.

    Grouping uses the index level rather than a one-element list, so the keys are
    plain feature names on every pandas version (a one-element list grouper yields
    1-tuples on pandas >= 2.0).
    """
    by_feature = df.drop(columns=['feature'])
    # Re-index the copy, not the caller's frame.
    by_feature.index = df['feature']
    return list(by_feature.groupby(level=0))

In [55]:
def flatten_df_by_month_df(df, features=('PM2.5', 'PM10')):
    """Return one frame per month with a flattened series for each selected feature.

    Args:
        df: frame with ``date`` and ``feature`` columns plus the value columns.
        features: feature names to keep, in output column order.

    Returns:
        A list of DataFrames in month order; each has one column per entry of
        ``features`` holding that feature's values flattened row by row.

    Raises:
        KeyError: if a requested feature is absent from a month.
    """
    selected = list(features)
    new_mdfs = []
    for _month, month_df in split_df_by_month(df):
        series_by_feature = {}
        for key, feature_df in split_df_by_feature(month_df):
            # Group keys may arrive as 1-tuples (pandas >= 2.0 with a list grouper).
            feature = key[0] if isinstance(key, tuple) else key
            if feature in selected:
                series_by_feature[feature] = feature_df.values.flatten()
        new_mdfs.append(pd.DataFrame(series_by_feature)[selected])

    return new_mdfs

# Load and tidy the training data in this cell so it does not depend on a `df`
# left behind by an earlier kernel session.
df = preprocess_training_data(read_training_data('./train.csv'))
mdfs = flatten_df_by_month_df(df)

In [56]:
def chunk_training_examples(mdfs, chunk_size, target='PM2.5'):
    """Turn per-month frames into sliding-window training examples.

    Each example is ``chunk_size`` consecutive rows of one frame, flattened row by
    row; its label is the ``target`` value of the row right after the window.
    Rows are addressed by position, so the frames may carry any index.

    Args:
        mdfs: iterable of DataFrames (one per month).
        chunk_size: number of consecutive rows per example.
        target: name of the column to predict.

    Returns:
        ``(X, y)`` as numpy arrays; ``y`` has shape ``(n_examples, 1)``.
    """
    X = []
    y = []
    for mdf in mdfs:
        values = mdf.values
        target_values = mdf[target].values
        for start in range(len(mdf) - chunk_size):
            stop = start + chunk_size
            X.append(values[start:stop].flatten())
            y.append(target_values[stop])

    X = np.array(X)
    y = np.array(y).reshape((-1, 1))
    return X, y

# Each example is 9 consecutive rows of a month frame, flattened; the label is the
# 'PM2.5' value of the row that follows the window.
X, y = chunk_training_examples(mdfs, chunk_size=9)

In [57]:
def read_testing_data(file_name):
    """Read the header-less test CSV; 'NR' counts as missing and gaps become 0.

    The first two columns are named ``id`` and ``feature``; the remaining
    columns keep their positional integer labels.
    """
    column_names = {0: 'id', 1: 'feature'}
    raw = pd.read_csv(file_name, na_values=['NR'], header=None)
    return raw.rename(columns=column_names).fillna(0)
# Load the test set from ./test.csv (path is relative to the working directory).
tdf = read_testing_data('./test.csv')

In [58]:
def split_df_by_id(df):
    """Split ``df`` into ``(id, frame)`` pairs in order of first appearance.

    Each returned frame has the ``id`` column removed and is indexed by the id.
    The caller's frame is not modified.

    Groups are deliberately NOT sorted: keeping first-appearance order makes the
    result line up with ``df.id.unique()``, which also reports ids in order of
    appearance. Sorting string ids would place e.g. 'id_10' before 'id_2'.
    """
    by_id = df.drop(columns=['id'])
    # Re-index the copy, not the caller's frame.
    by_id.index = df['id']
    return list(by_id.groupby(level=0, sort=False))

In [59]:
def flatten_df_by_id(df, features=('PM2.5', 'PM10')):
    """Return one frame per test id with a flattened series for each selected feature.

    Args:
        df: frame with ``id`` and ``feature`` columns plus the value columns.
        features: feature names to keep, in output column order.

    Returns:
        A list of DataFrames, one per id in the order ``split_df_by_id`` yields
        them; each has one column per entry of ``features``.

    Raises:
        KeyError: if a requested feature is absent for an id.
    """
    selected = list(features)
    new_iddfs = []
    for _id, id_df in split_df_by_id(df):
        series_by_feature = {}
        for key, feature_df in split_df_by_feature(id_df):
            # Group keys may arrive as 1-tuples (pandas >= 2.0 with a list grouper).
            feature = key[0] if isinstance(key, tuple) else key
            if feature in selected:
                series_by_feature[feature] = feature_df.values.flatten()
        new_iddfs.append(pd.DataFrame(series_by_feature)[selected])
    return new_iddfs

# One frame per test id holding the PM2.5 and PM10 columns.
iddfs = flatten_df_by_id(tdf)

In [60]:
def normalize_feature(X):
    """Standardise every column of ``X`` to zero mean and unit variance.

    Columns with zero standard deviation (constant columns) are divided by 1
    instead of 0, so they come out as all zeros rather than NaN/inf.

    Returns:
        ``(X_normalized, mu, sigma)`` where ``mu`` and ``sigma`` are the per-column
        mean and the divisor actually used, so new inputs can be scaled the same way.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    if np.any(sigma == 0):
        # Avoid division by zero for constant columns.
        sigma = np.where(sigma == 0, 1.0, sigma)
    X_normalized = (X - mu) / sigma
    return X_normalized, mu, sigma

# Standardise the training matrix; mu and sigma are reused by predict() to scale new inputs.
X_normalized, mu, sigma = normalize_feature(X)

In [61]:
# Rebuild from X, mu and sigma rather than from X_normalized itself, so that
# re-running this cell does not stack extra bias columns.
m, n = X.shape

theta = np.zeros((n + 1, 1))  # one weight per feature plus the bias term
X_normalized = np.insert((X - mu) / sigma, obj=0, values=1, axis=1)  # bias column of ones first


In [62]:
# Sanity check: (number of examples, number of columns including the bias column).
X_normalized.shape

(5652, 19)

In [63]:
def compute_cost(theta, X, y):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    The number of examples is taken from ``X`` itself, so the function gives the
    right value for any data set passed in, not only the one a global describes.

    Args:
        theta: weight column vector, shape ``(n_features, 1)``.
        X: design matrix, shape ``(n_examples, n_features)``.
        y: target column vector, shape ``(n_examples, 1)``.
    """
    n_examples = X.shape[0]
    residual = np.dot(X, theta) - y
    return np.sum(residual ** 2) / (2 * n_examples)

# compute_cost(np.zeros((n+1, 1)), )

In [66]:

eta = 0.001  # default learning rate
number_of_iterations = 20000  # default number of update steps


def gradient_descent(theta, features=None, targets=None, learning_rate=None, iterations=None):
    """Run batch gradient descent for linear regression and return the final weights.

    Called with only ``theta`` it behaves as before, reading the module-level
    ``X_normalized``, ``y``, ``eta`` and ``number_of_iterations``. Any of them can
    be overridden through the optional arguments.

    Args:
        theta: initial weight column vector, shape ``(n_features, 1)``.
        features: design matrix; defaults to the global ``X_normalized``.
        targets: target column vector; defaults to the global ``y``.
        learning_rate: step size; defaults to the global ``eta``.
        iterations: number of update steps; defaults to ``number_of_iterations``.
    """
    if features is None:
        features = X_normalized
    if targets is None:
        targets = y
    if learning_rate is None:
        learning_rate = eta
    if iterations is None:
        iterations = number_of_iterations

    # Use the row count of the data actually being fitted.
    n_examples = features.shape[0]
    for _ in range(iterations):
        residual = np.dot(features, theta) - targets
        theta = theta - (learning_rate / n_examples) * np.dot(features.T, residual)

    return theta
    

In [67]:
# Train starting from the current theta; re-running this cell continues from the previous result.
theta = gradient_descent(theta)

In [38]:
def predict(instance, theta=None, mean=None, scale=None):
    """Predict the target for one flattened feature vector.

    The instance is scaled with ``mean`` and ``scale``, a leading 1 is added for
    the bias term, and the result is multiplied by ``theta``.

    Called with only ``instance`` it behaves as before, reading the module-level
    ``eq_theta``, ``mu`` and ``sigma``, which must exist by the time it is called.

    Args:
        instance: 1-D array of raw feature values.
        theta: weight column vector; defaults to the global ``eq_theta``.
        mean: per-feature mean; defaults to the global ``mu``.
        scale: per-feature divisor; defaults to the global ``sigma``.

    Returns:
        Array of shape ``(1, k)`` where ``k`` is the number of columns of ``theta``.
    """
    if theta is None:
        theta = eq_theta
    if mean is None:
        mean = mu
    if scale is None:
        scale = sigma

    row = ((instance - mean) / scale).reshape((1, -1))
    row = np.insert(row, obj=0, values=1, axis=1)
    return np.dot(row, theta)

In [39]:
# Collect one prediction per test frame; each frame is flattened row by row into
# a single feature vector before being passed to predict().
towrite = []
for iddf in iddfs:
    towrite.append(predict(iddf.values.flatten()))

In [40]:
# Stack the collected predictions into a single array.
ans = np.array(towrite)

In [41]:
# Collapse to one dimension so the predictions fit a single output column.
ans = ans.flatten()

In [42]:
# Empty output frame with the two columns that are written to the CSV.
ansdf = pd.DataFrame(columns=['id', 'value'])

In [43]:
# Fill both output columns; ids are taken in order of first appearance in tdf.
ansdf['id'] = tdf['id'].unique()
ansdf['value'] = ans

In [46]:
# Write the predictions to qwerty.csv without the row index.
ansdf.to_csv('qwerty.csv', index=False)

In [34]:
def normal_eqn(X, y):
    """Compute the closed-form solution to linear regression.

    Solves the normal equations ``theta = pinv(X^T X) X^T y``. The pseudo-inverse
    is used so the call also works when ``X^T X`` is singular.

    Args:
        X: design matrix, shape ``(n_examples, n_features)``.
        y: targets, shape ``(n_examples, 1)`` or ``(n_examples,)``.

    Returns:
        The fitted weights, with the same trailing shape as ``y``.
    """
    gram = np.dot(X.T, X)
    return np.dot(np.dot(np.linalg.pinv(gram), X.T), y)

In [35]:
# Closed-form weights on the normalised design matrix; predict() multiplies by eq_theta.
eq_theta = normal_eqn(X_normalized, y)

In [3]:
# Create the manager and load ./train.csv into dm.rdf.
dm = DataManager()
dm.read_training_data('./train.csv')

In [4]:
# Populate dm.data (month -> feature -> flat array) from dm.rdf.
dm.rdf_to_data()

In [5]:
# Month-keyed frames restricted to the PM2.5 and PM10 columns.
d = dm.select_feature_to_mdfs(['PM2.5', 'PM10'])

In [6]:
# Display the month -> DataFrame mapping returned by select_feature_to_mdfs.
d

{1:       PM10  PM2.5
 0     91.0   58.0
 1    104.0   59.0
 2     87.0   57.0
 3     91.0   59.0
 4     83.0   64.0
 5     76.0   64.0
 6     73.0   60.0
 7     66.0   55.0
 8     88.0   54.0
 9     97.0   61.0
 10   101.0   67.0
 11    88.0   72.0
 12    75.0   58.0
 13    62.0   48.0
 14    74.0   36.0
 15    72.0   38.0
 16    79.0   40.0
 17    79.0   42.0
 18    92.0   46.0
 19   120.0   57.0
 20   116.0   76.0
 21   113.0   74.0
 22    84.0   71.0
 23    68.0   50.0
 24    72.0   41.0
 25    71.0   45.0
 26    77.0   48.0
 27    54.0   43.0
 28    43.0   19.0
 29    40.0    9.0
 ..     ...    ...
 450  143.0   80.0
 451  148.0   85.0
 452  151.0   95.0
 453  159.0   99.0
 454  155.0  109.0
 455  153.0  105.0
 456  137.0  104.0
 457  136.0   95.0
 458  133.0   88.0
 459  124.0   81.0
 460  118.0   78.0
 461  107.0   75.0
 462  108.0   72.0
 463   99.0   60.0
 464  105.0   50.0
 465    0.0    0.0
 466    0.0   51.0
 467  133.0   69.0
 468  117.0   85.0
 469  111.0   76.0
 470  101

In [7]:
# Display the raw training frame stored by read_training_data.
dm.rdf

Unnamed: 0,date,feature,0,1,2,3,4,5,6,7,...,14,15,16,17,18,19,20,21,22,23
0,2014-01-01,AMB_TEMP,14.00,13.00,13.00,12.00,12.00,12.00,12.00,12.00,...,24.00,23.00,22.00,21.00,20.00,18.00,18.00,17.00,17.00,17.00
1,2014-01-01,CH4,1.90,1.90,2.10,2.10,2.20,2.20,2.20,2.10,...,1.80,1.70,1.80,1.80,1.90,1.90,1.90,1.90,1.80,1.90
2,2014-01-01,CO,1.16,1.00,1.09,1.18,1.01,1.00,1.15,1.33,...,0.52,0.50,0.59,0.84,1.35,1.45,1.23,1.19,1.09,0.82
3,2014-01-01,NMHC,0.70,0.59,0.66,0.65,0.52,0.50,0.58,0.66,...,0.16,0.15,0.19,0.32,0.56,0.68,0.51,0.85,0.46,0.34
4,2014-01-01,NO,17.00,8.80,14.00,33.00,26.00,21.00,34.00,38.00,...,1.90,1.10,1.10,0.90,5.70,16.00,16.00,6.20,5.00,1.30
5,2014-01-01,NO2,31.00,29.00,26.00,25.00,23.00,21.00,20.00,22.00,...,21.00,20.00,25.00,35.00,55.00,55.00,50.00,41.00,37.00,28.00
6,2014-01-01,NOx,47.00,38.00,41.00,57.00,49.00,41.00,54.00,61.00,...,23.00,21.00,26.00,36.00,60.00,72.00,66.00,47.00,42.00,29.00
7,2014-01-01,O3,2.40,3.00,2.60,2.30,2.10,2.30,2.50,3.40,...,64.00,63.00,53.00,38.00,14.00,3.30,2.90,6.40,4.90,12.00
8,2014-01-01,PM10,91.00,104.00,87.00,91.00,83.00,76.00,73.00,66.00,...,74.00,72.00,79.00,79.00,92.00,120.00,116.00,113.00,84.00,68.00
9,2014-01-01,PM2.5,58.00,59.00,57.00,59.00,64.00,64.00,60.00,55.00,...,36.00,38.00,40.00,42.00,46.00,57.00,76.00,74.00,71.00,50.00


In [11]:
# NOTE(review): select_feature_to_mdfs currently returns a dict keyed by month, so this
# loop would print month numbers; the recorded output (feature names) appears to come
# from an earlier version of that method — confirm and re-run.
for f in d:
    print(f)

PM2.5
PM10


In [13]:
# NOTE(review): with the month-keyed dict returned by select_feature_to_mdfs, 'PM2.5' is
# not a key of d and this lookup would raise KeyError; the recorded output looks stale — confirm.
d['PM2.5'].keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])