In [20]:
import gdelt
import pandas as pd
import numpy as  np
import seaborn as sns
import math
import tqdm
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,OneHotEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer,mean_squared_error,r2_score

In [4]:
gd = gdelt.gdelt()

df=gd.Search(['2018 01 01','2018 02 28'],table='events')
df = df[df.EventRootCode != '--']
df.index = df.SQLDATE
df.sort_index(inplace=True)
df.index = pd.to_datetime(df.index, format='%Y%m%d')
df.drop(['SQLDATE','QuadClass'], axis=1, inplace=True)



In [19]:
#df.shape
#df.to_pickle('/home/chiru/Desktop/GDELT/data.pickle')
df=pd.read_pickle('/home/chiru/Desktop/GDELT/data.pickle')

In [21]:
def get_data(df,country,code):
    scaler = MinMaxScaler(feature_range=(0,50))
    df.NumMentions = scaler.fit_transform(df['NumMentions'].reshape(-1,1))
    df = df[df.ActionGeo_CountryCode == country]
    indices = df.index.unique()
    aggregated_df = pd.DataFrame(columns=['ActionGeo_CountryCode', 'EventRootCode', 'NumMentions','GoldsteinScale', 'AvgTone'], index= indices)
    aggregated_df.ActionGeo_CountryCode = country
    aggregated_df.EventRootCode = code
    for i in range(len(indices)):
        aggregated_df.NumMentions[indices[i]] = df.NumMentions[indices[i]].sum()
        aggregated_df.GoldsteinScale[indices[i]] = df.GoldsteinScale[indices[i]].max()
        aggregated_df.AvgTone[indices[i]] = df.AvgTone[indices[i]].mean()
    
    return aggregated_df

In [22]:
def create_data(df,lag_days,all_data):
    dataX, dataY = [], []
    y_df=all_data.NumMentions#get_data(all_data,'US',14).NumMentions
    #print y_df.NumMentions
    print len(range(len(df)-lag_days-1))
    print len(y_df)
    
    for i in range(len(df)-lag_days-1):
        a = df[i:(i+lag_days)]
        dataX.append(np.array(a))
        dataY.append(y_df.iloc[i+lag_days])
    
    return np.array(dataX), np.array(dataY) 

In [23]:
def specificCountry_orderedEvents(data,country,days,plot=False):
    z = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
    df = pd.DataFrame(columns=z)
    l = []
    print 'loading started'
    for i in tqdm(z):
        l.append(get_data(data,country, i).NumMentions)
    for i in tqdm(z):
        df[i] = l[i-1]
    plot_list = []
    df = df.fillna(0)
    df = df[:(len(df)-len(df)%days)]
    for i in range(int( df.shape[0]/days )):
        plot_list.append(df[days*i:days*(i+1)].sum())
    if(plot):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set_yticks([25*i for i in range(11)])
        ax.set_xticks(z)
        plt.bar(z,plot_list[52])
    print 'Done'
    return df

In [24]:
def load_data(df):
    
    
    #''''''''''''''''''''''''''''''''''''''
    # data for protest for all country    '
    #''''''''''''''''''''''''''''''''''''''
    print 'Loding Started'
    core_df =df
    core_df.ActionGeo_CountryCode.dropna(inplace=True)
    country_list = core_df.ActionGeo_CountryCode.unique()
    code = 14
    i=0
    all_country_timeseries = {}
    print 'Country for loop'
    print 'Total:',len(country_list)
    for country in country_list:
        all_country_timeseries[country] = get_data(core_df,country,code).NumMentions
        print("For "+str(i) + country+": Done")
    
    df = pd.DataFrame(index=all_country_timeseries['US'].index, columns=country_list)
    print 'Country for loop'
    for country in country_list:
        df[country] = all_country_timeseries[country]
        
    #'''''''''''''''''''''''''''''''
    # data for US for all event    '
    #'''''''''''''''''''''''''''''''
    print 'order_events'
    df_allEvent = specificCountry_orderedEvents(core_df,'US', 500)
    str_column = df_allEvent.columns.astype(str)
    df_allEvent.columns = str_column
    print 'Loading Done'
    return df, df_allEvent, country_list, str_column

In [25]:
import datetime
import scipy.cluster.hierarchy


from random import randint, seed
from fastdtw import fastdtw

from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import normalize

def dtw(x, y):
    dist, _ = fastdtw(x, y, dist=euclidean)
    return dist

def calculate_dtw_distances(dataframe):
    n = len(list(dataframe))

    dtw_distances = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            
            # ADD PROVISION FOR NUMBER OF DAYS ( 500)
            dtw_distances[i,j] = dtw(dataframe.ix[-500:,i].fillna(0).values, dataframe.ix[-500:,j].fillna(0).values)
    
        print("Done [",i,"]")
    return dtw_distances    


# ADD PARAMETER FOR THE TYPE OF CLUSTERING 
def Cluster(number_of_clusters, dtw_distances ):
    #NUMBER OF CLUSTERS
    k = number_of_clusters

    #K-means
    y_pred_km_dtw = KMeans(n_clusters=k).fit_predict(dtw_distances)
    
    #Hierarchical
    y_pred_hac_dtw = AgglomerativeClustering(n_clusters=k).fit_predict(dtw_distances)  
    
    return y_pred_hac_dtw, y_pred_km_dtw

In [26]:
def cluster_two(df, country_list):
    try:
        distance_matrix = np.load('dtw_distance.npy')
    except:
        distance_matrix = calculate_dtw_distances(df)
        np.save('dtw_distance', distance_matrix)
    
    #print('Distance matrix calculated')
    hac_all_country, kmeans_all_country = Cluster(6, distance_matrix)
    country_cluster = {}
    i = 0
    for country in country_list:
        country_cluster[country] = hac_all_country[i]
        i += 1
    #print("Cluster of countries: ", country_cluster)
    
    return country_cluster

In [27]:
def similar_country_data(data,cl,df):
    #df_allCountry, df_allEvent, country_list, cameoCodes = load_data(data)
    df_allCountry, country_list=data,cl
    print 'Data Divided'
    df_allCountry.fillna(0, inplace=True)
    #df_allEvent.fillna(0, inplace=True)
    print 'Cluster two started'
    country_cluster = cluster_two(df_allCountry, country_list)

    #get similar country
    print 'Cluster two finished'
    similar_country = []
    CountryClusterValue = country_cluster[country]
    print 'Country_list loop'
    for cntry in country_list:
        try:
            if country_cluster[cntry] == CountryClusterValue:
                similar_country.append(cntry)
        except:
            pass

    #get data for similar countries
    #Fill missing date values
    print()
    print("Preparing dataset for similar countries: ", similar_country)
    allSimilarCountryData = []
    for cntry in similar_country:
        df1 = get_data(df,cntry, code)
        df1 = df.NumMentions
        allSimilarCountryData.append(df1)
    print 'Finished'
    return allSimilarCountryData

In [28]:
def allSimilarCountryLSTM(allSimilarCountryData,df_allData,plot=True):
    allSimilarCountryData = np.array(allSimilarCountryData)
    allSimilarCountryData = allSimilarCountryData.transpose()
    allSimilarCountryData = allSimilarCountryData.tolist()
    
    df = allSimilarCountryData
    total_days = len(df)
    
    train_len = int(0.75*total_days)
    train = df[:train_len]
    test = df[train_len:]
    
    lag_days = 6
    X_train, y_train = create_data(train, lag_days, df_allData)
    X_test, y_test = create_data(test, lag_days, df_allData)
    
    batch_size = 4
    epochs = 1
    
    model = Sequential()
    model.add(LSTM(256, batch_input_shape=(None, lag_days, X_train.shape[2])))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.compile(loss= 'logcosh', optimizer= 'Adagrad')
    history = model.fit(X_train,y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(X_test, y_test))
    
    trainPredict = model.predict(X_train)
    testPredict = model.predict(X_test)
    
    if plot==True:
        
        plt.figure(figsize=(12,7))
        plt.plot(history.history['loss'], c='r', label='train_loss')
        plt.plot(history.history['val_loss'], c='g', label='validation_loss')
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.title('Epoch vs Loss curve')
        plt.legend(loc='best')
        
        
        plt.figure(figsize=(12,7))
        plt.plot(testPredict,  c='r', label="testPredict")
        plt.plot(y_test, label="test")
        plt.title('Actual vs Predicted test set')
        plt.legend(loc='best')
        
        
        plt.figure(figsize=(12,7))
        plt.plot(trainPredict, c='r',label="trainPredict")
        plt.plot(y_train, label="train")
        plt.title('Actual vs Predicted on training set')
        plt.legend(loc='best')
        
        
        train_NumMentions = df
        X_actual_data, y_actual_data = create_data(df, lag_days, df_allData)
        predicted_whole_series = model.predict(X_actual_data)
        
        plt.figure(figsize=(12,7))
        plt.plot(predicted_whole_series, c='g', label="Predicted whole series")
        plt.plot(train_NumMentions, c='r', label="Training series")
        plt.title('Actual training vs whole series as prediction')
        plt.legend(loc='best')
    
    y_test = y_test.reshape(y_test.shape[0],1)
    y_train = y_train.reshape(y_train.shape[0],1)

    print("r-sq(train): ",r2_score(y_train, model.predict(X_train)))
    print("r-sq(test): ",r2_score(y_test, testPredict))
    print("rmse(train) :", mean_squared_error(y_train, model.predict(X_train)))
    print("rmse(test) :", mean_squared_error(y_test, testPredict))
    print()
    print()
    
    return testPredict


In [29]:
country='US'
code=14

#df_allCountry, df_allEvent, country_list, cameoCodes = load_data(df)
#allSimilarCountryLSTM(similar_country_data(df))
core_df =df
core_df.ActionGeo_CountryCode.dropna(inplace=True)
country_list = core_df.ActionGeo_CountryCode.unique()

In [30]:
df_allCountry=pd.read_pickle('/home/chiru/Desktop/GDELT/df_allcountry.pickle')

#similar_country_data(df_allCountry,country_list)


In [31]:
allSimilarCountryData=similar_country_data(df_allCountry,country_list,core_df)

Data Divided
Cluster two started
Cluster two finished
Country_list loop
()
('Preparing dataset for similar countries: ', ['US'])


  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Finished


In [32]:

predicted=allSimilarCountryLSTM(allSimilarCountryData,core_df, plot=True)

67917
90566
22635
90566
Train on 67917 samples, validate on 22635 samples
Epoch 1/1
90559
90566
('r-sq(train): ', 0.0094544674087746472)
('r-sq(test): ', -0.038837246664110037)
('rmse(train) :', 1.2525948239313607)
('rmse(test) :', 1.12725584480619)
()
()


In [51]:
final=[]
for pred in predicted:
    final.append(round(pred[0],3))
    

(0.696, array([ 0.69583899], dtype=float32))

In [59]:
from highcharts import Highstock
from highcharts.highstock.highstock_helper import jsonp_loader
H = Highstock()

data_url = 'http://www.highcharts.com/samples/data/jsonp.php?filename=aapl-c.json&callback=?'
data = time1 # to remove the comment in json doc from the url

options = {
    'rangeSelector' : {
            'selected' : 4
        },

    'title' : {
        'text' : 'Similar Country'
    },
}

H.add_data_set(time1, 'line', 'Actual', tooltip = {
    'valueDecimals': 2
    }
)


H.add_data_set(time2, 'line', 'Predicted', tooltip = {
    'valueDecimals': 2
    }
)

H.add_data_set(final)

H.set_dict_options(options)

H
