In [48]:
import numpy as np
import pandas as pd

In [49]:
def read_training_data(file_name, encoding='big5'):
    """Read the training CSV and replace missing readings with zero.

    Args:
        file_name: Path (or file-like object) handed to ``pd.read_csv``.
        encoding: Text encoding of the file; defaults to ``'big5'``.

    Returns:
        A DataFrame in which every ``'NR'`` cell and every other missing
        value has been replaced by ``0``.
    """
    # 'NR' is parsed as NaN first so that one fillna covers both cases.
    return pd.read_csv(
        file_name,
        encoding=encoding,
        na_values=['NR'],
    ).fillna(0)

In [50]:
def preprocess_training_data(df):
    """Give the first three columns stable names and drop the location.

    The first three columns (whatever their original headers are) become
    ``'date'``, ``'location'`` and ``'feature'``. ``'date'`` is parsed into
    datetimes and ``'location'`` is removed.

    Args:
        df: Raw training frame as returned by ``read_training_data``.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    positional_names = ('date', 'location', 'feature')
    renames = {
        df.columns[position]: label
        for position, label in enumerate(positional_names)
    }
    renamed = df.rename(columns=renames)
    renamed['date'] = pd.to_datetime(renamed['date'])
    return renamed.drop(columns=['location'])
    

In [51]:
# Load the raw training table; the path is relative to the notebook's working directory.
df = read_training_data(file_name='./train.csv')

In [52]:
# NOTE: this rebinds `df` to its own preprocessed version, so the cell is not
# idempotent. Re-run the loading cell before re-running this one.
df = preprocess_training_data(df)

In [53]:
def split_df_by_month(df):
    """Split a frame into one sub-frame per calendar month of ``df.date``.

    Args:
        df: DataFrame with a datetime ``'date'`` column.

    Returns:
        A list of ``(month, sub_frame)`` tuples sorted by month number.
        Each ``sub_frame`` has the ``'date'`` column removed and is indexed
        by month number. ``month`` is a plain scalar.

    The input frame is left untouched (the index is set on a copy).
    """
    by_month = df.drop(columns=['date'])
    by_month.index = df['date'].dt.month
    # Grouping on the index level (not a one-element list of keys) keeps the
    # group keys scalar on every pandas version; a length-1 list grouper
    # yields 1-tuples on pandas >= 2.0.
    return list(by_month.groupby(level=0))

In [54]:
def split_df_by_feature(df):
    """Split a frame into one sub-frame per value of the ``'feature'`` column.

    Args:
        df: DataFrame with a ``'feature'`` column.

    Returns:
        A list of ``(feature, sub_frame)`` tuples sorted by feature name.
        Each ``sub_frame`` has the ``'feature'`` column removed and is
        indexed by feature name. ``feature`` is a plain scalar, so it can
        be used directly as a column label by callers.

    The input frame is left untouched (the index is set on a copy).
    """
    by_feature = df.drop(columns=['feature'])
    by_feature.index = df['feature']
    # Grouping on the index level (not a one-element list of keys) keeps the
    # group keys scalar on every pandas version; a length-1 list grouper
    # yields 1-tuples on pandas >= 2.0.
    return list(by_feature.groupby(level=0))

In [55]:
def flatten_df_by_month_df(df):
    """Build one PM2.5/PM10 frame per month group of ``df``.

    For every month group, the rows belonging to each feature are flattened
    row-major into a single column. Only the ``'PM2.5'`` and ``'PM10'``
    columns are kept, in that order.

    Args:
        df: Preprocessed training frame with ``'date'`` and ``'feature'``
            columns (presumably one row per day and feature with hourly
            value columns -- confirm against the input file).

    Returns:
        A list with one two-column DataFrame per month group.
    """
    monthly_frames = []
    for _month, month_df in split_df_by_month(df):
        feature_groups = split_df_by_feature(month_df)
        # Start from an empty frame that already has one column per feature.
        wide = pd.DataFrame(columns=month_df.feature.unique())
        for feature_name, feature_rows in feature_groups:
            wide[feature_name] = feature_rows.values.flatten()
        monthly_frames.append(wide[['PM2.5', 'PM10']])

    return monthly_frames

# One frame per month group, each holding only the PM2.5 and PM10 columns.
mdfs = flatten_df_by_month_df(df)

In [56]:
def chunk_training_examples(mdfs, chunk_size):
    """Slice each frame into sliding-window training examples.

    For every frame, each run of ``chunk_size`` consecutive rows becomes one
    flattened feature vector and the ``'PM2.5'`` value of the row right
    after the window becomes its target. Windows never span two frames.

    Args:
        mdfs: Iterable of DataFrames that each contain a ``'PM2.5'`` column.
        chunk_size: Number of consecutive rows per feature window.

    Returns:
        ``(X, y)`` where ``X`` has one flattened window per row and ``y`` is
        a column vector of the matching targets.
    """
    X = []
    y = []
    for mdf in mdfs:
        # Work on positional arrays so rows and targets are addressed the
        # same way whatever index labels the frame carries (the previous
        # mix of .iloc and .loc only worked for a 0..n-1 index).
        window_source = mdf.values
        targets = mdf['PM2.5'].values
        for start in range(len(mdf) - chunk_size):
            stop = start + chunk_size
            X.append(window_source[start:stop].flatten())
            y.append(targets[stop])

    X = np.array(X)
    y = np.array(y).reshape((-1, 1))
    return X, y

# Each example: 9 consecutive rows (flattened) as features, the next row's PM2.5 as target.
X, y = chunk_training_examples(mdfs, chunk_size=9)

In [57]:
def read_testing_data(file_name):
    """Read the header-less test CSV and replace missing readings with zero.

    The first two columns are named ``'id'`` and ``'feature'``; the
    remaining columns keep their integer positions as labels.

    Args:
        file_name: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        A DataFrame in which every ``'NR'`` cell and every other missing
        value has been replaced by ``0``.
    """
    column_names = {0: 'id', 1: 'feature'}
    raw = pd.read_csv(file_name, header=None, na_values=['NR'])
    return raw.rename(columns=column_names).fillna(0)
# Load the test table; the path is relative to the notebook's working directory.
tdf = read_testing_data('./test.csv')

In [58]:
def split_df_by_id(df):
    """Split a frame into one sub-frame per value of the ``'id'`` column.

    Args:
        df: DataFrame with an ``'id'`` column.

    Returns:
        A list of ``(id, sub_frame)`` tuples in order of first appearance of
        each id. Each ``sub_frame`` has the ``'id'`` column removed and is
        indexed by id.

    The input frame is left untouched (the index is set on a copy).
    """
    by_id = df.drop(columns=['id'])
    by_id.index = df['id']
    # sort=False keeps groups in first-appearance order, i.e. the same order
    # as `df.id.unique()`, which is what the predictions are later paired
    # with. A sorted groupby orders string ids lexicographically
    # ('id_10' before 'id_2') and would misalign predictions and ids.
    return list(by_id.groupby(level=0, sort=False))

In [59]:
def flatten_df_by_id(df):
    """Build one PM2.5/PM10 frame per id group of ``df``.

    For every id group, the rows belonging to each feature are flattened
    row-major into a single column. Only the ``'PM2.5'`` and ``'PM10'``
    columns are kept, in that order.

    Args:
        df: Test frame with ``'id'`` and ``'feature'`` columns.

    Returns:
        A list with one two-column DataFrame per id group, in the order
        produced by ``split_df_by_id``.
    """
    per_id_frames = []
    for _sample_id, sample_df in split_df_by_id(df):
        feature_groups = split_df_by_feature(sample_df)
        # Start from an empty frame that already has one column per feature.
        wide = pd.DataFrame(columns=sample_df.feature.unique())
        for feature_name, feature_rows in feature_groups:
            wide[feature_name] = feature_rows.values.flatten()
        per_id_frames.append(wide[['PM2.5', 'PM10']])
    return per_id_frames

# One frame per test id, each holding only the PM2.5 and PM10 columns.
iddfs = flatten_df_by_id(tdf)

In [60]:
def normalize_feature(X):
    """Standardise every column of ``X`` to zero mean and unit variance.

    Args:
        X: Array of shape ``(n_samples, n_features)``.

    Returns:
        ``(X_normalized, mu, sigma)`` where ``mu`` and ``sigma`` are the
        per-column mean and (population) standard deviation used for the
        scaling. For a column with zero variance, ``sigma`` is reported as
        ``1`` so that the column normalises to all zeros instead of NaN and
        so that later ``(x - mu) / sigma`` calls stay finite.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # Guard against division by zero for constant columns.
    sigma = np.where(sigma == 0, 1.0, sigma)
    X_normalized = (X - mu) / sigma
    return X_normalized, mu, sigma

# mu and sigma are kept because predict() reuses them to normalise test instances.
X_normalized, mu, sigma = normalize_feature(X)

In [61]:
# Everything here is derived from the raw matrix `X` (plus the `mu`/`sigma`
# returned by normalize_feature), so re-running this cell is idempotent.
# Previously it read and overwrote `X_normalized`, which prepended one more
# bias column and grew `n` on every re-run.
m, n = X.shape

# One weight per feature plus one for the bias term.
theta = np.zeros((n + 1, 1))

# Prepend a column of ones so the first weight acts as the intercept.
X_normalized = np.insert((X - mu) / sigma, obj=0, values=1, axis=1)


In [62]:
X_normalized.shape

(5652, 19)

In [63]:
def compute_cost(theta, X, y):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    Args:
        theta: Weight column vector of shape ``(n_features, 1)``.
        X: Design matrix of shape ``(n_samples, n_features)``.
        y: Target column vector of shape ``(n_samples, 1)``.

    Returns:
        ``sum((X @ theta - y) ** 2) / (2 * n_samples)`` as a scalar.
    """
    # Use the row count of the X that was actually passed in, not the
    # notebook-level `m`, so the cost is right for any data set.
    sample_count = X.shape[0]
    residual = np.dot(X, theta) - y
    return np.sum(residual ** 2) / (2 * sample_count)

# compute_cost(np.zeros((n+1, 1)), )

In [66]:

eta = 0.001
number_of_iterations = 20000


def gradient_descent(theta, features=None, targets=None,
                     learning_rate=None, iterations=None):
    """Run batch gradient descent for linear least squares.

    Args:
        theta: Initial weight column vector; it is not modified in place.
        features: Design matrix including the bias column. Defaults to the
            notebook-level ``X_normalized``.
        targets: Target column vector. Defaults to the notebook-level ``y``.
        learning_rate: Step size. Defaults to the notebook-level ``eta``.
        iterations: Number of update steps. Defaults to the notebook-level
            ``number_of_iterations``.

    Returns:
        The weight vector after ``iterations`` update steps.
    """
    # Fall back to the notebook globals so `gradient_descent(theta)` keeps
    # working exactly as before.
    if features is None:
        features = X_normalized
    if targets is None:
        targets = y
    if learning_rate is None:
        learning_rate = eta
    if iterations is None:
        iterations = number_of_iterations

    # The step scale is loop-invariant; the sample count comes from the
    # data actually used rather than from a separate global.
    step = learning_rate / features.shape[0]
    for _ in range(iterations):
        residual = np.dot(features, theta) - targets
        theta = theta - step * np.dot(features.T, residual)

    return theta
    

In [67]:
# NOTE: `theta` is both input and output, so re-running this cell continues
# from the previous result instead of restarting from the zero vector.
# predict() does not read this `theta`; it uses `eq_theta`.
theta = gradient_descent(theta)

In [38]:
def predict(instance, weights=None, mean=None, scale=None):
    """Predict the target for a single raw (un-normalised) feature vector.

    Args:
        instance: 1-D array of raw feature values.
        weights: Weight column vector including the bias weight. Defaults
            to the notebook-level ``eq_theta``.
        mean: Per-feature mean used for normalisation. Defaults to the
            notebook-level ``mu``.
        scale: Per-feature standard deviation used for normalisation.
            Defaults to the notebook-level ``sigma``.

    Returns:
        The prediction as the result of ``[1, normalised instance] @ weights``.

    Note:
        When the defaults are used, ``eq_theta``, ``mu`` and ``sigma`` must
        already exist. ``eq_theta`` is assigned in a cell that appears
        further down the notebook, so that cell has to be executed first.
    """
    if weights is None:
        weights = eq_theta
    if mean is None:
        mean = mu
    if scale is None:
        scale = sigma

    normalized_row = ((instance - mean) / scale).reshape((1, -1))
    # Prepend the constant 1 that multiplies the bias weight.
    with_bias = np.insert(normalized_row, obj=0, values=1, axis=1)
    return np.dot(with_bias, weights)

In [39]:
# One prediction per test frame, in the same order as `iddfs`.
towrite = [predict(iddf.values.flatten()) for iddf in iddfs]

In [40]:
# Stack the per-id prediction arrays into a single ndarray.
ans = np.array(towrite)

In [41]:
# Collapse to 1-D so the predictions can be assigned as a single column.
ans = ans.flatten()

In [42]:
# Empty two-column frame; the columns are filled in by the following cell.
ansdf = pd.DataFrame(columns=['id', 'value'])

In [43]:
# `unique()` keeps ids in first-appearance order, so `ans` must hold the
# predictions in that same order for ids and values to line up row by row.
ansdf.id = tdf.id.unique()
ansdf.value = ans

In [46]:
# Write the id/value table to CSV without the DataFrame index column.
ansdf.to_csv('qwerty.csv', index=False)

In [34]:
def normal_eqn(X, y):
    """Solve linear regression in closed form with the normal equations.

    Computes ``pinv(X.T @ X) @ X.T @ y``. The pseudo-inverse is used so a
    singular ``X.T @ X`` (e.g. perfectly collinear columns) still yields a
    solution.

    Args:
        X: Design matrix of shape ``(n_samples, n_features)``, including the
            bias column if an intercept is wanted.
        y: Targets of shape ``(n_samples, 1)`` or ``(n_samples,)``.

    Returns:
        The least-squares weights, with the same trailing shape as ``y``.
    """
    gram = np.dot(X.T, X)
    return np.dot(np.dot(np.linalg.pinv(gram), X.T), y)

In [35]:
# predict() reads `eq_theta`, so this cell (and the normal_eqn definition it
# needs) must be executed before the prediction cells are run.
eq_theta = normal_eqn(X_normalized, y)