## Multi Modal Learning

### Information

 - Information in the real world comes as different modalities
 - Ex. Images are associated with tags and text explanations
 - Ex. Texts contain images to express an idea clearly
 - Thus different modalities
 - Different modalities are characterised by different statistical properties
 - Ex. Images are represented as Pixel Intensities
 - Ex. Texts are represented as discrete word count vectors
 - Thus they have distinct statistical properties
 - IT IS IMPORTANT TO DISCOVER THE RELATIONSHIP BETWEEN DIFFERENT MODALITIES

In [5]:
import pandas as pd
import numpy as np
import time
from datetime import date
from pandas import Series, DataFrame

In [2]:
import logging

In [3]:
# Configure the root logger so each record is rendered as
# "<timestamp>: <level name>: <message>".
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')

In [4]:
# Extend sys.path so that the project-local `common` package can be imported.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
IMG_HELP_PATH = '/Users/shankar/dev/code/ds/studies/data_science/trading/honchar'
import os
import sys
sys.path.append(os.path.abspath(IMG_HELP_PATH))
# Project helper used by load_ts_csv; presumably it lives under IMG_HELP_PATH.
from common.preprocessing import data_2_percentage_change

In [5]:
def load_text_csv(filename='Combined_News_DJIA.csv', date_split=date(2014, 12, 31)):
    '''
    Load the combined-news CSV and split it chronologically into train and test.

    Columns 2..26 of each row (the headline columns) are collapsed into a
    single 'Combined' string, which is the string form of the row's value
    array (brackets and quotes included).

    @filename: path to a CSV that has at least 'Date' and 'Label' columns
    @date_split: last date (inclusive) that belongs to the train split; a
                 datetime.date, datetime, Timestamp or date string

    Returns (train, test): two DataFrames with the columns
    ['Label', 'Combined']; train holds rows dated on or before @date_split,
    test holds the rows dated after it.
    '''
    df = pd.read_csv(filename)
    # str() already yields one string, so no further joining is needed.
    df['Combined'] = df.iloc[:, 2:27].apply(lambda row: str(row.values), axis=1)

    # Parse the dates once and compare against a Timestamp: ordering
    # comparisons between a datetime64 column and a plain datetime.date are
    # rejected by current pandas releases.
    dates = pd.to_datetime(df['Date'])
    cutoff = pd.Timestamp(date_split)

    wanted = ['Label', 'Combined']
    train = df.loc[dates <= cutoff, wanted]
    test = df.loc[dates > cutoff, wanted]

    return train, test

In [6]:
# Load the news headlines and split them at load_text_csv's default date_split.
train, test = load_text_csv(filename='../datasets/stocknews/Combined_News_DJIA.csv')

In [7]:
train.head()

Unnamed: 0,Label,Combined
0,0,"['b""Georgia \'downs two Russian warplanes\' as..."
1,1,"[""b'Why wont America and Nato help us? If they..."
2,0,"[""b'Remember that adorable 9-year-old who sang..."
3,0,"[""b' U.S. refuses Israel weapons to attack Ira..."
4,1,"[""b'All the experts admit that we should legal..."


In [8]:
test.head()

Unnamed: 0,Label,Combined
1611,1,['Most cases of cancer are the result of sheer...
1612,0,['Moscow-&gt;Beijing high speed train will red...
1613,0,"['US oil falls below $50 a barrel'\n ""Toyota g..."
1614,1,"[""'Shots fired' at French magazine HQ""\n '90% ..."
1615,1,['New Charlie Hebdo issue to come out next wee...


In [9]:
df = pd.read_csv('../datasets/stocknews/Combined_News_DJIA.csv')
df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."


In [17]:
def load_ts_csv(filename='DJIA_table.csv', date_split=date(2014, 12, 31)):
    '''
    Load the price table and build train/test matrices of transformed columns.

    The rows are reversed after loading (the sample file lists the newest
    day first), split at @date_split, and each of the Open, High, Low, Close
    and Volume columns is passed through data_2_percentage_change separately
    for the train and the test rows.

    @filename: path to a CSV with 'Date', 'Open', 'High', 'Low', 'Close'
               and 'Volume' columns
    @date_split: last date (inclusive) that belongs to the train split; a
                 datetime.date, datetime, Timestamp or date string

    Returns (train, test): two 2-D numpy arrays whose columns are, in order,
    the transformed Open, High, Low, Close and Volume series.
    '''
    data = pd.read_csv(filename)[::-1]

    # Parse the dates once and compare against a Timestamp: ordering
    # comparisons between a datetime64 column and a plain datetime.date are
    # rejected by current pandas releases.
    dates = pd.to_datetime(data['Date'])
    cutoff = pd.Timestamp(date_split)
    train_rows = data.loc[dates <= cutoff]
    test_rows = data.loc[dates > cutoff]

    train_columns, test_columns = [], []
    for column in ('Open', 'High', 'Low', 'Close', 'Volume'):
        # data_2_percentage_change is a project helper; presumably it turns a
        # price series into day-over-day relative changes -- TODO confirm.
        train_columns.append(data_2_percentage_change(train_rows.loc[:, column]))
        test_columns.append(data_2_percentage_change(test_rows.loc[:, column]))

    train = np.column_stack(train_columns)
    test = np.column_stack(test_columns)

    return train, test

In [18]:
df = pd.read_csv('../datasets/stocknews/DJIA_table.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [19]:
# NOTE(review): this rebinds the notebook-global names `train` and `test`,
# which an earlier cell bound to the text DataFrames from load_text_csv.
# From here on they hold the numeric arrays returned by load_ts_csv.
train, test = load_ts_csv(filename='../datasets/stocknews/DJIA_table.csv')
# Show the first ten rows of the train matrix.
train[:10]

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 2.60302431e-02,  9.11145883e-03,  2.52449265e-02,
         4.09306170e-03, -1.39266081e-01],
       [ 4.43578322e-03, -7.14249219e-03, -6.33896194e-03,
        -1.18719854e-02, -5.24046072e-02],
       [-1.26374481e-02, -1.26094829e-02, -1.27724378e-02,
        -9.40605976e-03,  5.16158765e-02],
       [-8.65992496e-03,  7.26333127e-03, -2.13928516e-04,
         7.19413986e-03, -1.24678170e-01],
       [ 6.86257080e-03, -7.16028701e-04,  1.29981884e-02,
         3.78537958e-03,  3.45766318e-01],
       [ 4.17186755e-03, -1.66183974e-03, -1.42770862e-02,
        -1.54813281e-02, -2.73204985e-01],
       [-1.55716973e-02, -1.81567121e-02, -1.01118508e-02,
        -1.13978048e-02,  9.78309553e-02],
       [-1.15131887e-02, -2.09262724e-03, -2.46675107e-03,
         6.06948766e-03, -1.55612542e-01],
       [ 6.10703356e-03,  1.92590190e-03,  2.21337025e-03,
         1.11936516e-03

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [8]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following
    1. Tokenizes on runs of word characters, which drops punctuation
    2. Lower-cases the tokens and removes English stopwords
    3. Stems every remaining token with the Porter stemmer
    4. Removes the first stand-alone 'b' token, if there is one
    5. Returns the cleaned tokens joined by single spaces

    Null input (None / NaN) yields an empty string, so the return value is
    always a str.
    '''
    if pd.isnull(text):
        return ""

    # Tokenize
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Removing any stopwords; the stopword list is loaded once per call and
    # turned into a set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.lower() not in english_stopwords]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # NOTE(review): only the FIRST 'b' token is removed. This looks aimed at
    # the b'...' prefix seen on the raw headlines, in which case every
    # occurrence may need to go -- confirm the intent before changing it.
    try:
        tokens.remove('b')
    except ValueError:
        # No 'b' token present; nothing to strip.
        pass

    return " ".join(tokens)
    
    
    

In [22]:
# Demo cell: run text_process on a sample sentence, then replay the same
# pipeline step by step and print each intermediate result.
sample_text = "Takes in a string's of text, then performs between the ourselves following the who that take"
text = text_process(sample_text)
print(text)
# Step 1: split into runs of word characters (punctuation is dropped).
tokenizer = RegexpTokenizer(r'\w+')
text_processed = tokenizer.tokenize(sample_text)
print(text_processed)
# Step 2: lower-case and drop English stopwords.
text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
print(text_processed)
# Step 3: Porter-stem every remaining token.
porterStemmer = PorterStemmer()
text_processed = [porterStemmer.stem(word) for word in text_processed]
print(text_processed)


take string text perform follow take
['Takes', 'in', 'a', 'string', 's', 'of', 'text', 'then', 'performs', 'between', 'the', 'ourselves', 'following', 'the', 'who', 'that', 'take']
['takes', 'string', 'text', 'performs', 'following', 'take']
['take', 'string', 'text', 'perform', 'follow', 'take']


In [23]:
def transform_text_2_sentences(train, test, save_train='train_text.p', save_text='test_text.p'):
    '''
    Transforming raw text into cleaned sentences by running text_process over
    the 'Combined' column of @train and @test.

    @train, @test: objects indexable by 'Combined' that yield raw text values
    @save_train: path for the pickled train sentences, or None to skip saving
    @save_text: path for the pickled test sentences, or None to skip saving

    Returns (train_text, test_text): two lists holding one text_process
    result per input row.
    '''
    # Imported locally so the function is self-contained; `pickle` exists on
    # both Python 2 and Python 3, whereas `cPickle` is Python 2 only.
    import pickle

    train_text = [text_process(each) for each in train['Combined']]
    test_text = [text_process(each) for each in test['Combined']]

    if save_train is not None:
        with open(save_train, 'wb') as handle:
            pickle.dump(train_text, handle)

    if save_text is not None:
        with open(save_text, 'wb') as handle:
            pickle.dump(test_text, handle)

    return train_text, test_text

In [25]:
from gensim.models import Word2Vec

In [26]:
def transform_text_into_vectors(train_text, test_text, embedding_size=100, model_path='../output/word2vec10.model'):
    '''
    Transforms each sentence into ONE vector: the mean of the word2vec
    vectors of its space-separated tokens.

    A Word2Vec model is fitted on the tokens of @train_text and @test_text
    together, saved to @model_path and reloaded from there before use.

    @train_text, @test_text: lists of sentences (space-separated tokens)
    @embedding_size: dimensionality of the word vectors
    @model_path: file the trained model is written to

    Returns (train_text_vectors, test_text_vectors, model): two lists of
    mean vectors (one per sentence) and the reloaded word2vec model.
    '''
    train_tokens = [sentence.split(' ') for sentence in train_text]
    test_tokens = [sentence.split(' ') for sentence in test_text]

    # NOTE(review): `size=` and `model[word]` are the pre-4.0 gensim spellings;
    # gensim 4 uses `vector_size=` and `model.wv[word]` -- confirm the
    # installed version before upgrading.
    model = Word2Vec(train_tokens + test_tokens, size=embedding_size, window=5, min_count=1, workers=4)
    model.save(model_path)
    model = Word2Vec.load(model_path)

    # min_count=1 keeps every token in the vocabulary, so each lookup below
    # refers to a word the model was trained on.
    train_text_vectors = [
        np.mean([model[word] for word in tokens], axis=0) for tokens in train_tokens
    ]
    test_text_vectors = [
        np.mean([model[word] for word in tokens], axis=0) for tokens in test_tokens
    ]

    return train_text_vectors, test_text_vectors, model
    

In [None]:
def split_into_XY(data_chng_train, train_text_vectors, step, window, forecast):
    '''
    Splits textual and time series data into train or test dataset for hybrid model;
    Objective y_i is percentage change of price movement for next day
    '''
    X_train, X_train_text, Y_train, Y_train2 = [], [], [], []
    
    for i in range(0, len(data_chng_train), step):
        try:
            x_i =data_chng_train[i:i+WINDOW]
            y_i = np.std(data_chng_train[i:i+WINDOW+forecast][3])
            
            text_average = train_text_vectors[i:i+WINDOW]
            last_close = x_i[-1]
            
            y_i2 = None
            if(data_chng_train[i+WINDOW+forecast][3] > 0.0):
                y_i2 = 1.
            else:
                y_i2 = 0.
                
        except Exception as e:
            print('KEK', e)
            break