In [75]:
%matplotlib inline

# data manipulation and modeling
import numpy as np
import pandas as pd
import statsmodels.api as sm

# graphix
import matplotlib.pyplot as plt
import prettyplotlib as pplt
import seaborn as sns
import statsmodels.graphics.tsaplots as tsaplots
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error as MSE
# utility
import os

In [8]:
# Default figure size (width, height in inches) for every plot in this notebook.
plt.rcParams['figure.figsize'] = [20, 10]
# Show up to 40 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 40)

In [42]:
def generate_year_list(start, stop=None):
    """
    Build the column labels used in the data frame for one or more years.

    Each year ``Y`` is rendered as ``"Y [YRY]"``.

    start: either a list of years (used as given, ``stop`` is ignored)
           or the first year of a range
    stop:  last year of the range, inclusive; when it is falsy
           (``None`` or ``0``) only ``start`` itself is used
    """
    if isinstance(start, list):
        years = start
    elif stop:
        # +1 because the stop year is part of the requested range
        years = range(start, stop + 1)
    else:
        years = [start]

    return ["{0} [YR{0}]".format(year) for year in years]


def plot_rows(data, ids=None, linestyle="-", legend=True):
    """
    Plot rows of ``data`` as time series on the current matplotlib figure.

    data:      data frame with one row per series; the first four characters
               of every column label are parsed as the date for the x axis
    ids:       iterable of index labels to plot; when None every row of
               ``data`` is plotted
    linestyle: line style forwarded to ``plt.plot``
    legend:    when true, a legend is added to the figure

    The legend labels are read from the module-level ``train`` data frame
    (columns "Country Name" and "Series Name"), so ``train`` must be loaded
    and must contain every plotted index label.
    """
    # get some colors for the lines
    bmap = pplt.brewer2mpl.get_map('Set3','Qualitative', 10)
    colors = bmap.mpl_colors

    # Identity check against None: calling a method on ``ids`` here would fail
    # for the default ids=None and for plain Python lists.
    rows = data.index.values if ids is None else ids

    # The x axis is the same for every row, so build it once.
    just_years = [y[:4] for y in data.columns]
    X = pd.DatetimeIndex(just_years)

    for i, r in enumerate(rows):
        # cycle through the palette so more rows than colors does not fail
        color = colors[i % len(colors)]

        # get the time series values
        time_data = data.loc[r]

        # get time series info for labeling
        country, descrip = train[["Country Name", "Series Name"]].loc[r]

        # plot the series; the scatter gets an empty label so that each
        # series appears only once in the legend
        plt.plot(X, time_data, c=color,
                 label="{} - {}".format(country, descrip), ls=linestyle)
        plt.scatter(X, time_data, alpha=0.8, c=color, label = '')

    if legend:
        plt.legend(loc=0)
    plt.title("Progress Towards Subset of MDGs")
    
def ARIMA_prediction(data, P,D,Q):
    """
    Fit an ARIMA model of order (P, D, Q) to ``data`` and return its forecast.

    The model is fitted with ``disp=0`` and the value returned is element 0
    of ``forecast()`` called with its default arguments.
    NOTE(review): with the legacy ``statsmodels.tsa.arima_model`` API this is
    presumably the array of point forecasts (one step ahead) - confirm against
    the installed statsmodels version.
    """
    fitted = ARIMA(data, order=(P, D, Q)).fit(disp=0)
    return fitted.forecast()[0]

def plot_random_rows(data, seed, n_rows=10):
    """
    Plot a reproducible random sample of rows from ``data``.

    data:   data frame whose index labels are sampled
    seed:   value used to seed NumPy's global random generator, so the same
            seed always selects the same rows
    n_rows: number of index labels to draw (default 10)

    Labels are drawn with ``np.random.choice`` using its default sampling
    mode, so the same row may be selected more than once.
    """
    np.random.seed(seed)
    rand_rows = np.random.choice(data.index.values, size=n_rows)
    plot_rows(data, ids=rand_rows)
    plt.show()

In [2]:
# The data folder can be overridden with the UNMDG_DATA_DIR environment
# variable; the default is the original local Windows location.
train_csv = os.path.join(os.environ.get("UNMDG_DATA_DIR", "D:\\Datasets\\UNMDG"), "TrainingSet.csv")
train = pd.read_csv(train_csv, index_col = 0)

In [3]:
# The data folder can be overridden with the UNMDG_DATA_DIR environment
# variable; the default is the original local Windows location.
submission_csv = os.path.join(os.environ.get("UNMDG_DATA_DIR", "D:\\Datasets\\UNMDG"), "SubmissionRows.csv")
submission_labels = pd.read_csv(submission_csv, index_col=0)

In [6]:
# Keep only the rows listed in the submission file, restricted to the
# 1972-2007 year columns.
prediction_rows = train.loc[submission_labels.index, generate_year_list(1972, 2007)]
prediction_rows.head()

Unnamed: 0,1972 [YR1972],1973 [YR1973],1974 [YR1974],1975 [YR1975],1976 [YR1976],1977 [YR1977],1978 [YR1978],1979 [YR1979],1980 [YR1980],1981 [YR1981],...,1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007]
559,,,,,,,,,,,...,0.152,0.187,0.221,0.256,0.291,0.325,0.36,0.395,0.43,0.465
618,,,,,,,,,,,...,,,,4.7e-05,4.6e-05,0.000879,0.001058,0.012241,0.021071,0.019
753,0.296,0.2909,0.2852,0.2798,0.2742,0.2683,0.2624,0.2565,0.2503,0.2439,...,0.1391,0.1366,0.1339,0.131,0.1277,0.1244,0.121,0.1177,0.1145,0.1115
1030,,,,,,,,,,,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
1896,,,,,,,,,,,...,0.965,0.965,0.965,0.964,0.964,0.963,0.963,0.962,0.962,0.961


#### Hint
Use `data1.corr(data2.shift(lag))` to find the cross-correlation between two different series at a given lag. <br>
Also try a mutual information score. TODO: figure out how to compute it.

In [12]:
# Hold out 2007 as the validation year: fit only on the 1972-2006 columns.
trainset = prediction_rows.loc[:, generate_year_list(1972, 2006)]

In [36]:
# Keep only the series that have a value for every year in the training window.
completed_trainset = trainset.dropna(axis=0, how="any")

In [68]:
# Spot check: the actual 2007 value of row 753 as a plain Python scalar.
prediction_rows.loc[753, generate_year_list(2007)].item()

0.1115

In [72]:
# Forecast 2007 for every complete series with an ARIMA(1, 0, 0) model and
# collect [row id, forecast, actual 2007 value] for evaluation.
# The 2007 column is selected once up front instead of on every iteration.
actual_2007 = prediction_rows[generate_year_list(2007)]
yhat_2007 = []
for i in completed_trainset.index:
    yhat = ARIMA_prediction(list(completed_trainset.loc[i]), 1,0,0)[0]
    yhat_2007.append([i, yhat, actual_2007.loc[i].item()])

  R_mat, T_mat)
  (1+np.exp(-params))).copy()
  (1+np.exp(-params))).copy()
  large = s > cutoff


In [73]:
# Convert the collected results to a 2-D array for column slicing.
# Columns: 0 = row id, 1 = forecast for 2007, 2 = actual 2007 value.
yhat_2007 = np.asarray(yhat_2007)

In [79]:
# Mean squared error on the 2007 hold-out: column 2 holds the actual values,
# column 1 the forecasts.
float(MSE(y_true=yhat_2007[:, 2], y_pred=yhat_2007[:, 1]))

1.1536041732494047e-05