In [1]:
import time
import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
def parseFile(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like or file-like
        Location of the CSV; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_path)


def train_test_split(mat, split_idx=2800):
    """Split a 2-D array into leading (train) and trailing (test) rows.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D array indexed as ``mat[rows, cols]``.
    split_idx : int, optional
        Number of leading rows assigned to the training set (default 2800).

    Returns
    -------
    tuple
        ``(train_mat, test_mat)`` where ``train_mat`` is rows
        ``[0, split_idx)`` and ``test_mat`` is rows ``[split_idx, end)``.
    """
    train_mat = mat[:split_idx, :]
    # Start the test set exactly where training stops; the previous start of
    # 2801 silently dropped row 2800 from both sets.
    test_mat = mat[split_idx:, :]

    return train_mat, test_mat


def train_test_split_lab(lab, split_idx=2800):
    """Split a 1-D label sequence into leading (train) and trailing (test) parts.

    Parameters
    ----------
    lab : sequence or numpy.ndarray
        Labels, sliced along their first axis.
    split_idx : int, optional
        Number of leading labels assigned to the training set (default 2800).

    Returns
    -------
    tuple
        ``(train_labels, test_labels)`` covering ``[0, split_idx)`` and
        ``[split_idx, end)`` respectively.
    """
    train_mat = lab[:split_idx]
    # Start the test set exactly where training stops; the previous start of
    # 2801 silently dropped the label at index 2800.
    test_mat = lab[split_idx:]

    return train_mat, test_mat

In [3]:
def filter_full_feature(mat, label1, label2):
    """Keep only the rows of ``mat`` that contain no NaN, plus their labels.

    A row is discarded when any of its entries is NaN after conversion to
    float64. Both label arrays are then indexed with the positions of the
    surviving rows, so they stay aligned with the filtered matrix.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D feature array whose entries are convertible to float64.
    label1, label2 : numpy.ndarray
        Label arrays indexed by the same row positions as ``mat``.

    Returns
    -------
    tuple
        ``(mat, label1, label2)`` restricted to the complete rows.
    """
    # One vectorized pass: True for every row holding at least one NaN.
    has_missing = np.isnan(mat.astype(np.float64)).any(axis=1)
    # Positional indices (not a boolean mask) so the label arrays are only
    # required to cover the kept positions.
    complete_rows = np.flatnonzero(~has_missing)

    return mat[complete_rows, :], label1[complete_rows], label2[complete_rows]

In [4]:
def model_train(feature_mat, lab_mat):
    """Fit a scikit-learn ``LinearRegression`` and return it with in-sample predictions.

    Parameters
    ----------
    feature_mat : array-like with ``.astype``
        Training features; cast to float64 before fitting.
    lab_mat : array-like with ``.astype``
        Training targets; cast to float64 before fitting.

    Returns
    -------
    tuple
        ``(model, predicted_lab)`` where ``model`` is the fitted estimator and
        ``predicted_lab`` is ``model.predict`` evaluated on the training features.
    """
    model = linear_model.LinearRegression()
    # The ``np.float`` alias was removed in NumPy 1.24 (AttributeError);
    # ``np.float64`` is the dtype it used to resolve to.
    feature_mat = feature_mat.astype(np.float64)
    lab = lab_mat.astype(np.float64)

    print('Model training - Started!')
    # perf_counter is monotonic and high-resolution, so the reported duration
    # cannot be skewed by wall-clock adjustments.
    time_start = time.perf_counter()
    model.fit(feature_mat, lab)

    time_end = time.perf_counter()
    print('Model training - Completed! Training time: ' + str(time_end - time_start) + 's')

    predicted_lab = model.predict(feature_mat)

    return model, predicted_lab


def linear_reg(X, y):
    """Fit a statsmodels OLS regression of ``y`` on ``X`` plus an intercept.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Regressors; a constant column is added via ``sm.add_constant``.
    y : array-like with ``.astype``
        Response values; cast to float before fitting.

    Returns
    -------
    The object returned by ``sm.OLS(...).fit()``.
    """
    exog = sm.add_constant(X)
    endog = y.astype(float)
    fitted = sm.OLS(endog, exog.astype(float)).fit()
    return fitted

In [6]:
def _save_rate_comparison(actual, predicted, actual_label, out_path):
    """Plot an observed rate series against model predictions and save it to ``out_path``."""
    plt.plot(actual, label=actual_label)
    plt.plot(predicted, label='predicted_rate')
    plt.legend()
    plt.savefig(out_path)
    plt.close()  # release the figure so the next plot starts from a clean canvas


def main():
    """Regress two unemployment-rate series on sentiment features and plot the fits.

    Steps:
      1. Load the calculated rate (column 1 of ``daily_unemployment.csv``) and
         the Gallup table (``gallup_daily.csv``).
      2. Drop every row whose sentiment features contain NaN.
      3. Fit a ``LinearRegression`` per rate series on the training split.
      4. Print statsmodels OLS summaries for both series.
      5. Save observed-vs-predicted plots to ``cal_rate_compare.png`` and
         ``gallup_rate_compare.png``.
    """
    # Columns 3..8 of the Gallup table hold the sentiment features (6 columns).
    feature_cols = slice(3, 9)

    cal_rate_df = parseFile('CleanedData/daily_unemployment.csv')
    cal_rate = np.array(cal_rate_df)[:, 1]  # unemployment rate from our calculation

    gallup_data_df = parseFile('CleanedData/gallup_daily.csv')
    gallup_values = np.array(gallup_data_df)  # convert once, slice several times
    # Unemployment rate derived as the complement of column 2
    # (presumably an employment rate -- TODO confirm against the CSV schema).
    gallup_rate = 1 - gallup_values[:, 2]

    feature_names = list(gallup_data_df.columns.values)[feature_cols]  # features' names
    senti_features = gallup_values[:, feature_cols]

    senti_features, gallup_rate, cal_rate = filter_full_feature(senti_features, gallup_rate, cal_rate)

    # Split train/test data; only the training halves are used below.
    cal_rate_train, _ = train_test_split_lab(cal_rate)
    gallup_rate_train, _ = train_test_split_lab(gallup_rate)
    sent_train, _ = train_test_split(senti_features)

    # Train one scikit-learn model per target series.
    model_cal, _ = model_train(sent_train, cal_rate_train)
    model_gallup, _ = model_train(sent_train, gallup_rate_train)

    # Fit the same regressions with statsmodels to get the inference tables.
    senti_features_df = pd.DataFrame(sent_train, columns=feature_names)
    for train_labels in (cal_rate_train, gallup_rate_train):
        print(linear_reg(senti_features_df, pd.DataFrame(train_labels)).summary())

    # Compare each observed series with its model's predictions over all
    # complete rows (train and test together).
    _save_rate_comparison(cal_rate, model_cal.predict(senti_features),
                          'calculated_rate', 'cal_rate_compare.png')
    # The observed Gallup series was previously labeled 'calculated_rate'
    # (copy-paste slip); label it for what it is.
    _save_rate_comparison(gallup_rate, model_gallup.predict(senti_features),
                          'gallup_rate', 'gallup_rate_compare.png')


# Run the pipeline only when this code is executed directly (not when imported as a module).
if __name__ == '__main__':
    main()

Model training - Started!
Model training - Completed! Training time: 0.0020520687103271484s
Model training - Started!
Model training - Completed! Training time: 0.0029325485229492188s
                            OLS Regression Results                            
Dep. Variable:                      0   R-squared:                       0.216
Model:                            OLS   Adj. R-squared:                  0.215
Method:                 Least Squares   F-statistic:                     128.6
Date:                Fri, 20 Apr 2018   Prob (F-statistic):          5.39e-144
Time:                        21:47:58   Log-Likelihood:                 5117.5
No. Observations:                2800   AIC:                        -1.022e+04
Df Residuals:                    2793   BIC:                        -1.018e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        co