In [1]:
import pandas as pd
import numpy as np

In [2]:
def feature_normalize(X):
    #   FEATURENORMALIZE Normalizes the features in X 
    #   FEATURENORMALIZE(X) returns a normalized version of X where
    #   the mean value of each feature is 0 and the standard deviation
    #   is 1. This is often a good preprocessing step to do when
    #   working with learning algorithms.

    # You need to set these values correctly
    X_norm = X
    mu     = np.zeros(X.shape[1])
    sigma  = np.zeros(X.shape[1])
    
    # ====================== YOUR CODE HERE ======================
    # Instructions: First, for each feature dimension, compute the mean
    #               of the feature and subtract it from the dataset,
    #               storing the mean value in mu. Next, compute the 
    #               standard deviation of each feature and divide
    #               each feature by it's standard deviation, storing
    #               the standard deviation in sigma. 
    #
    #               Note that X is a matrix where each column is a 
    #               feature and each row is an example. You need 
    #               to perform the normalization separately for 
    #               each feature. 
    #
    # Hint: You might find the 'np.mean' and 'np.std' functions useful.
    #  
    

    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    
    X_norm = (X - mu) / sigma
    
    
    # ============================================================
    
    return X_norm, mu, sigma

In [3]:
def normal_eqn(X, y):
    #NORMALEQN Computes the closed-form solution to linear regression 
    #   NORMALEQN(X,y) computes the closed-form solution to linear 
    #   regression using the normal equations.

    theta = np.zeros(X.shape[1])

    # ====================== YOUR CODE HERE ======================
    # Instructions: Complete the code to compute the closed form solution
    #               to linear regression and put the result in theta.
    #


    theta = np.dot(np.dot(np.linalg.pinv(np.dot(X.T, X)), X.T), y)
    # ============================================================
    return theta

In [4]:
def read_training_data(file_name, encoding='big5'):
    df = pd.read_csv(file_name, encoding=encoding, na_values=['NR'])
    df = df.fillna(0)
    return df

df = read_training_data('./train.csv')

In [5]:
def preprocess_training_data(df):
    df = df.rename(
        columns={
            df.columns[0]: 'date',
            df.columns[1]: 'location',
            df.columns[2]: 'feature'
    })
        
    df.date = pd.to_datetime(df.date)
    
    df = df.drop(columns=['location'])

    
    return df

df = preprocess_training_data(df)

In [6]:
data2 = df[df.feature == 'PM2.5']

In [7]:
data10 = df[df.feature == 'PM10']

In [8]:
flat_data2 = []
for name, group in data2.groupby(data2.date.dt.month):
    group = group.drop(columns=['date', 'feature'])
    flat_data2.append(group.values.flatten())
    
flat_data2 = np.array(flat_data2)

In [9]:
flat_data10 = []
for name, group in data10.groupby(data10.date.dt.month):
    group = group.drop(columns=['date', 'feature'])
    flat_data10.append(group.values.flatten())
    
flat_data10 = np.array(flat_data10)

In [10]:
X = []
y = []
for m2, m10 in zip(flat_data2, flat_data10):
#     print(m.shape)
    for i in range(m2.size-9):
        X.append(np.append(m2[i:i+9],m10[i:i+9]))
        y.append(m2[i+9])
X = np.array(X)
y = np.array(y).reshape((-1, 1))

In [11]:
X_n, mu, sigma = feature_normalize(X)

In [12]:
def read_testing_data(file_name):
    df = pd.read_csv(file_name, header=None, na_values=['NR'])
    df = df.rename(columns={0: 'id', 1: 'feature'})
    df = df.fillna(0)
    return df
tdf = read_testing_data('./test.csv')

In [13]:
tdf2 = tdf[tdf.feature == 'PM2.5']
tdf2 = tdf2.drop(columns=['id', 'feature'])
t2 = tdf2.values 

In [14]:
tdf10 = tdf[tdf.feature == 'PM10']
tdf10 = tdf10.drop(columns=['id', 'feature'])
t10 = tdf10.values

In [15]:
# tdf.index = tdf.id

In [16]:
# tdf = tdf.drop(columns=['id', 'feature'])

In [17]:
# test = tdf.values

In [18]:
theta = normal_eqn(np.insert(X_n, 0, 1, 1), y)

In [20]:
ans = pd.DataFrame(columns=['id', 'value'])

In [25]:
pred = []
for y2, y10 in zip(t2, t10):
#     print(y2, y10)
    t = (np.append(y2, y10)-mu)/sigma
    print(np.append(y2, y10))
    pred.append(np.dot(np.insert(t, 0, 1, 0), theta))

pred = np.array(pred)

[39. 43. 29. 23. 25. 27. 32. 26. 40. 42. 42. 35. 37. 34. 41. 41. 49. 51.]
[18. 13. 22. 18. 14. 10. 13. 11. 14. 25. 24. 19. 19. 25. 25. 28. 29. 28.]
[20. 17. 17. 24. 20. 22. 10.  8.  5. 44. 55. 51. 47. 36. 37. 32. 35. 28.]
[11.  4.  8. 12. 13. 15. 25. 28. 20. 28. 26. 28. 31. 37. 47. 51. 47. 43.]
[34. 24. 19. 17. 19. 16. 21. 27. 27. 61. 53. 57. 56. 49. 56. 70. 77. 72.]
[118. 122. 102.  76.  70.  83. 106. 116. 117. 131. 168. 159. 143. 109.
 117. 118. 144. 150.]
[ 81.  76.  62.  53.  43.  36.  32.  32.  28. 120. 110.  88.  71.  62.
  64.  60.  50.  42.]
[32. 29. 24. 28. 24. 23. 21. 28. 34. 59. 60. 67. 67. 65. 65. 73. 84. 84.]
[ 53.  61.  61.  47.  36.  34.  41.  47.  43.  69.  84.  93. 110.  95.
  71.  42.  39.  49.]
[34. 24. 16. 14. 16. 18. 14. 16. 17. 60. 54. 54. 56. 50. 49. 46. 56. 54.]
[25. 27. 33. 39. 39. 34. 27. 32. 23. 52. 55. 57. 70. 68. 64. 52. 52. 60.]
[25. 23. 21. 25. 25. 31. 34. 42. 42. 36. 44. 49. 44. 41. 52. 59. 55. 60.]
[43. 34. 32. 30. 34. 29. 34. 38. 43. 93. 89. 91. 88. 90

In [22]:
ans.id = tdf.id.unique()

In [23]:
ans.value = pred.flatten()

In [38]:
ans.to_csv('final.csv', index=False)

In [22]:
def compute_cost_multi(theta, X, y):
    m, n = X.shape
    h_theta = np.dot(X, theta)
    square_error = (h_theta - y)**2
    J = 1 / (2 * m) * np.sum(square_error)
    return J

In [23]:
def gradient_descent_multi(X, y, theta, alpha, num_iters):
    # GRADIENTDESCENT Performs gradient descent to learn theta
    # theta = GRADIENTDESCENT(X, y, theta, alpha, num_iters) updates theta by 
    # taking num_iters gradient steps with learning rate alpha
    m, n = X.shape
    # Initialize
    J_history = np.zeros((num_iters, 1))
#     T_history = np.zeros((num_iters,X.shape[1], 1))
    
    for i in range(num_iters):
#         T_history[i] = theta

        ### ========= YOUR CODE HERE ============
        # Instructions: Perform a single gradient step on the parameter vector theta.
        
        htheta = np.dot(X, theta)
        theta = theta - (alpha / m) * np.dot(X.T, htheta-y)
        
        ### =====================================
        
        J_history[i] = compute_cost_multi(theta, X, y)
    return theta, J_history#, T_history


In [29]:
g_theta, J_his = gradient_descent_multi(np.insert(X_n, 0, 1, 1), y, theta, 0.01, 20000)

In [25]:
X[147, :]

array([51., 45., 46., 38., 39., 36., 47., 55., 60., 73., 77., 62., 63.,
       70., 79., 93., 87., 96.])

In [26]:
y

array([[61.],
       [67.],
       [72.],
       ...,
       [51.],
       [48.],
       [49.]])

In [19]:
theta

array([[28.39755839],
       [-4.45330872],
       [ 2.12965279],
       [-3.32096469],
       [ 2.43711245],
       [ 9.89636874],
       [-1.27055567],
       [-0.03792362],
       [-3.23876269],
       [25.09174963],
       [ 2.41332511],
       [-0.51619257],
       [-1.28303912],
       [ 0.88566275],
       [-1.82602789],
       [-0.25125377],
       [-1.35480615],
       [ 3.62710105],
       [ 3.06946685]])

In [24]:
ans

Unnamed: 0,id,value
0,id_0,35.397057
1,id_1,13.717201
2,id_2,10.139937
3,id_3,18.867396
4,id_4,29.325118
5,id_5,96.602333
6,id_6,27.406502
7,id_7,36.332116
8,id_8,32.218736
9,id_9,19.616862
