In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
#from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
stop = set(stopwords.words('english'))


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score

### Importing Dataset

In [2]:
market_train_orig = pd.read_csv("C:\\Users\\FAKHRE\\Music\\Data\\market.csv")
news_train_orig = pd.read_csv("C:\\Users\\FAKHRE\\Music\\Data\\news.csv")


### Making Copy of Dataset

In [3]:
market_train_df = market_train_orig.copy()
news_train_df = news_train_orig.copy()
print('Market train shape: ',market_train_df.shape)
print('News train shape: ', news_train_df.shape)

Market train shape:  (15000, 16)
News train shape:  (30000, 35)


### Converting time variable into year, month, date, hour and in minutes formate

In [4]:
#market_train_df['date'] =  pd.to_datetime(market_train_df['time'], format='%Y%m%d:%H:%M:%S.%f')
market_train_df['time'] = pd.to_datetime(market_train_df['time'], format='%Y-%m-%d %H:%M')
news_train_df['time'] = pd.to_datetime(news_train_df['time'], format='%Y-%m-%d %H:%M')


### Exploring Market data

### Preprocessing of Market Data

##### Fill nulls - Market values: All null data comes from market adjusted columns. We fill them up with the raw values in the same row

In [5]:
### Seprating values in two part column market and column raw
##Market values: All null data comes from market adjusted columns. We fill them up with the raw values in the same row
column_market = ['returnsClosePrevMktres1','returnsOpenPrevMktres1','returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
column_raw = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1','returnsClosePrevRaw10', 'returnsOpenPrevRaw10']

for i in range(len(column_raw)):
    market_train_df[column_market[i]] = market_train_df[column_market[i]].fillna(market_train_df[column_raw[i]])


### Outliers-Returns: Return should not exceed 50% or falls below 50%. If it does, it is either noise, or extreme data that will confuse our prediction later on. We remove these extreme data.

In [6]:
### Removing outlier
print('Removing outliers ...')
column_return = column_market + column_raw + ['returnsOpenNextMktres10']
orig_len = market_train_df.shape[0]
for column in column_return:
    market_train_df = market_train_df.loc[market_train_df[column]>=-2]
    market_train_df = market_train_df.loc[market_train_df[column]<=2]
new_len = market_train_df.shape[0]
rmv_len = np.abs(orig_len-new_len)
print('There were %i lines removed' %rmv_len)

Removing outliers ...
There were 4 lines removed


### Remove strange data: Here we remove data with unknown asset name or asset codes with strange behavior.

In [7]:
#### Removing Starnge data
print('Removing strange data ...')
orig_len = market_train_df.shape[0]
market_train_df = market_train_df[~market_train_df['assetCode'].isin(['PGN.N','EBRYY.OB'])]
#market_train_df = market_train_df[~market_train_df['assetName'].isin(['Unknown'])]
new_len = market_train_df.shape[0]
rmv_len = np.abs(orig_len-new_len)
print('There were %i lines removed' %rmv_len)

Removing strange data ...
There were 0 lines removed


### News data

Remove outliers: apply a clip filter to reduce too extreme data

In [8]:
# Function to remove outliers
def remove_outliers(data_frame, column_list, low=0.02, high=0.98):
    for column in column_list:
        this_column = data_frame[column]
        quant_df = this_column.quantile([low,high])
        low_limit = quant_df[low]
        high_limit = quant_df[high]
        data_frame[column] = data_frame[column].clip(lower=low_limit, upper=high_limit)
    return data_frame

In [9]:
# Remove outlier
columns_outlier = ['takeSequence', 'bodySize', 'sentenceCount', 'wordCount', 'sentimentWordCount', 'firstMentionSentence','noveltyCount12H',\
                  'noveltyCount24H', 'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H', 'volumeCounts24H',\
                  'volumeCounts3D','volumeCounts5D','volumeCounts7D']
print('Clipping news outliers ...')
news_train_df = remove_outliers(news_train_df, columns_outlier)

Clipping news outliers ...


### Features engineering
#### Data processing function
Here we make a function process both market and news data, then merge them.

In [10]:
asset_code_dict = {k: v for v, k in enumerate(market_train_df['assetCode'].unique())}
drop_columns = [col for col in news_train_df.columns if col not in ['sourceTimestamp', 'urgency', 'takeSequence', 'bodySize', 'companyCount', 
               'sentenceCount', 'firstMentionSentence', 'relevance','firstCreated', 'assetCodes']]
columns_news = ['firstCreated','relevance','sentimentClass','sentimentNegative','sentimentNeutral',
               'sentimentPositive','noveltyCount24H','noveltyCount7D','volumeCounts24H','volumeCounts7D','assetCodes','sourceTimestamp',
               'assetName','audiences', 'urgency', 'takeSequence', 'bodySize', 'companyCount', 
               'sentenceCount', 'firstMentionSentence','time']

In [11]:
# Data processing function
def data_prep(market_df,news_df):
    market_df['date'] =pd.to_datetime(market_train_df['time'], format='%Y-%m-%d %H:%M')
    market_df['close_to_open'] = market_df['close'] / market_df['open']
    market_df.drop(['time'], axis=1, inplace=True)
    
    news_df = news_df[columns_news]
    news_df['sourceTimestamp']= pd.to_datetime(news_df['sourceTimestamp'], format='%Y-%m-%d %H:%M')
    news_df['firstCreated'] = pd.to_datetime(news_df['firstCreated'], format='%Y-%m-%d %H:%M')
    news_df['assetCodesLen'] = news_df['assetCodes'].map(lambda x: len(eval(x)))
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda x: list(eval(x))[0])
    news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    news_df['len_audiences'] = news_train_df['audiences'].map(lambda x: len(eval(x)))
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean()
    market_df = pd.merge(market_df, news_df, how='left', left_on=['date', 'assetCode'], 
                            right_on=['firstCreated', 'assetCodes'])
    del news_df
    market_df['assetCodeT'] = market_df['assetCode'].map(asset_code_dict)
    market_df = market_df.drop(columns = ['firstCreated','assetCodes','assetName']).fillna(0) 
    return market_df

In [12]:
print('Merging data ...')
market_train_df = data_prep(market_train_df, news_train_df)
market_train_df.head()

Merging data ...




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

Unnamed: 0,assetCode,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,...,urgency,takeSequence,bodySize,companyCount,sentenceCount,firstMentionSentence,assetCodesLen,asset_sentiment_count,len_audiences,assetCodeT
0,A.N,2729240,31.3,31.39,0.007403,0.011276,-0.002344,0.010532,0.063179,0.065874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,AAI.N,3436803,5.18,5.26,-0.007663,0.011538,-0.027851,0.021131,0.005825,0.005736,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,AAP.N,1701655,40.38,40.7,-0.00247,-0.009973,-0.012789,-0.008617,-0.008378,0.002677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,AAPL.O,17633150,214.01,213.5,0.015555,0.00183,-0.001329,0.009182,0.115449,0.095602,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,AAV.N,833228,6.83,6.61,0.047546,-0.003017,0.028141,-0.000661,0.057276,0.037677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [13]:
### making dependent variables
market_train_df['returnsOpenNextMktres10'] = np.where(market_train_df.returnsOpenNextMktres10 <= 0,0,1)
market_train_df.returnsOpenNextMktres10.value_counts()

1    8364
0    6632
Name: returnsOpenNextMktres10, dtype: int64

In [14]:
### counting 0 and 1 
c = market_train_df['returnsOpenNextMktres10'].value_counts()
c

1    8364
0    6632
Name: returnsOpenNextMktres10, dtype: int64

### balancing zero and once. As we can see that ones is more than zero (oversampled), so I am trying to blance the dataset

In [15]:

import random
random.seed(10)
a = market_train_df[market_train_df['returnsOpenNextMktres10']==1].sample(n = c[0])
b = market_train_df[market_train_df['returnsOpenNextMktres10']==0]
market_train_df = b.append(a,ignore_index=True)


In [16]:
c = market_train_df['returnsOpenNextMktres10'].value_counts()
c

1    6632
0    6632
Name: returnsOpenNextMktres10, dtype: int64

In [17]:
#### imputing missing value 
market_train_df['returnsOpenNextMktres10'] = np.where(market_train_df.returnsOpenNextMktres10.isna==True,0,market_train_df.returnsOpenNextMktres10)


In [18]:
### cross verify is that missing value present or not 
market_train_df['returnsOpenNextMktres10'].isna().value_counts()

False    13264
Name: returnsOpenNextMktres10, dtype: int64

### One-Hot Encoding

For categorical variables where no such ordinal relationship exists, the integer encoding is not enough.

In fact, using this encoding and allowing the model to assume a natural ordering between categories may result in poor performance or unexpected results (predictions halfway between categories).

In this case, a one-hot encoding can be applied to the integer representation. This is where the integer encoded variable is removed and a new binary variable is added for each unique integer value.

In the “color” variable example, there are 3 categories and therefore 3 binary variables are needed. A “1” value is placed in the binary variable for the color and “0” values for the other colors.

In [19]:
# one hot-incoding with categorical variables 
columns = ['assetCode']

one_hot1 = pd.get_dummies(market_train_df[['assetCode']])

market_train_df = market_train_df.drop(columns,axis = 1)

market_train_df=market_train_df.join(one_hot1)

## Dealing With News Headlines Data 

#### Preprocessing of News headlines Data

Our first step should be cleaning the data in order to obtain better features. We will achieve this by doing some of the basic pre-processing steps on our news headline data.

So, let’s get into it.

In [20]:
## making copy of news_train_orig data
news_df = news_train_orig.copy()

### Lower case
The first pre-processing step which we will do is transform news headline into lower case. This avoids having multiple copies of the same words. For example, while calculating the word count, ‘Analytics’ and ‘analytics’ will be taken as different words.

In [21]:
### convering all text data into Lower case
news_df['headline'] = news_df['headline'].apply(lambda x: " ".join(x.lower() for x in x.split()))
news_df['headline'].head()

0    china's daqing pumps 43.41 mln tonnes of oil i...
1            feature-in kidnapping, finesse works best
2           press digest - wall street journal - jan 1
3                press digest - new york times - jan 1
4                press digest - new york times - jan 1
Name: headline, dtype: object

### Removing Punctuation
The next step is to remove punctuation, as it doesn’t add any extra information while treating text data. Therefore removing all instances of it will help us reduce the size of the data

In [22]:
## Removing Panctuation
news_df['headline'] = news_df['headline'].str.replace('[^\w\s]','')
news_df['headline'].head()

0    chinas daqing pumps 4341 mln tonnes of oil in 06
1             featurein kidnapping finesse works best
2            press digest  wall street journal  jan 1
3                 press digest  new york times  jan 1
4                 press digest  new york times  jan 1
Name: headline, dtype: object

### Removal of Stop Words
As we discussed earlier, stop words (or commonly occurring words) should be removed from the text data. For this purpose, we can either create a list of stopwords ourselves or we can use predefined libraries.

In [23]:
### Removing Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')
news_df['headline'] = news_df['headline'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
news_df['headline'].head()

0    chinas daqing pumps 4341 mln tonnes oil 06
1       featurein kidnapping finesse works best
2        press digest wall street journal jan 1
3             press digest new york times jan 1
4             press digest new york times jan 1
Name: headline, dtype: object

### Removal of Numeric Word

In [24]:
## Removing numerical Value from text
news_df['headline'] = news_df['headline'].str.replace('\d+', ' ')
news_df['headline'].head()

0     chinas daqing pumps   mln tonnes oil  
1    featurein kidnapping finesse works best
2     press digest wall street journal jan  
3          press digest new york times jan  
4          press digest new york times jan  
Name: headline, dtype: object

### Spelling correction

Spelling correction is a useful pre-processing step because this also will help us in reducing multiple copies of words. For example, “Analytics” and “analytcs” will be treated as different words even if they are used in the same sense.

To achieve this we will use the textblob library. If you are not familiar with it, you can check my previous article on ‘NLP for beginners using textblob’.

In [None]:
### Spelling Correction
from textblob import TextBlob
news_df['headline'] = news_df['headline'].apply(lambda x: str(TextBlob(x).correct()))

### Stemming
Stemming refers to the removal of suffices, like “ing”, “ly”, “s”, etc. by a simple rule-based approach. For this purpose, we will use PorterStemmer from the NLTK library.

In [25]:
### Streaming text data with help of portar Stemmer library from nltk
from nltk.stem import PorterStemmer
st = PorterStemmer()
news_df['headline'] = news_df['headline'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

### N-grams
N-grams are the combination of multiple words used together. Ngrams with N=1 are called unigrams. Similarly, bigrams (N=2), trigrams (N=3) and so on can also be used.

Unigrams do not usually contain as much information as compared to bigrams and trigrams. The basic principle behind n-grams is that they capture the language structure, like what letter or word is likely to follow the given one. The longer the n-gram (the higher the n), the more context you have to work with. Optimum length really depends on the application – if your n-grams are too short, you may fail to capture important differences. On the other hand, if they are too long, you may fail to capture the “general knowledge” and only stick to particular cases.

So, let’s quickly extract bigrams from our news headline using the ngrams function of the textblob library.

### Term frequency
Term frequency is simply the ratio of the count of a word present in a sentence, to the length of the sentence.

Therefore, we can generalize term frequency as:

TF = (Number of times term T appears in the particular row) / (number of terms in that row)

### Inverse Document Frequency
The intuition behind inverse document frequency (IDF) is that a word is not of much use to us if it’s appearing in all the documents.

Therefore, the IDF of each word is the log of the ratio of the total number of rows to the number of rows in which that word is present.

IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present.

### Term Frequency – Inverse Document Frequency (TF-IDF)
TF-IDF is the multiplication of the TF and IDF which we calculated above

In [26]:
### calulate tfidf and making n grams
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
stop_words= 'english',ngram_range=(2,2))
train_vect = tfidf.fit_transform(news_df['headline'])

In [79]:
#train_vect1 = pd.DataFrame(train_vect)

### Singular Value Decomposition
Dimensionality reduction using truncated SVD (aka LSA).

This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. This means it can work with scipy.sparse matrices efficiently.

In particular, truncated SVD works on term count/tf-idf matrices as returned by the vectorizers in sklearn.feature_extraction.text. In that context, it is known as latent semantic analysis (LSA).

In [27]:
### importing svd(singular value dicomposition) library 
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
# from sklearn.random_projection import GaussianRandomProjection
# from sklearn.random_projection import SparseRandomProjection

In [28]:
### define svd function and pass argument 
svd = TruncatedSVD(n_components=240, n_iter=5)

In [29]:
### fitting tfidf values in svd function and This transformer performs linear dimensionality reduction
data1_svd = svd.fit_transform(train_vect)

In [30]:
### making data frame because svd function return values in array 
data1_svd = pd.DataFrame(data1_svd)

In [82]:
# data1_svd.head()

In [31]:
### join text data with main market data
market_train_df = pd.concat([market_train_df, data1_svd], axis=1)

In [32]:
##checking shape of the data
market_train_df.shape

(30000, 1804)

### Checking Missing value present or not

In [33]:
##checking missing values in data
market_train_df['returnsOpenNextMktres10'].isna().value_counts()

True     16736
False    13264
Name: returnsOpenNextMktres10, dtype: int64

### Imputing Missing Value

In [34]:
## imputing missing values 
market_train_df['returnsOpenNextMktres10'] = np.where(market_train_df.returnsOpenNextMktres10.isna()==True,0,market_train_df.returnsOpenNextMktres10)

### Devide data set into two part X and y(dependent and independent variable)

In [35]:
### defing X and Y 
columns = ['returnsOpenNextMktres10','date']
X = market_train_df.drop(columns, axis=1)
y = market_train_df['returnsOpenNextMktres10']
X.shape, y.shape

((30000, 1802), (30000,))

### Spliting data set into Train And Test

In [36]:
#### Spliting data set into train and test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [37]:
# from sklearn.model_selection import train_test_split
# X_train1,X_test1,y_train1,y_test1 = train_test_split(X,y,test_size=0.3)

### importing Library for modeling

In [38]:
import gc
import matplotlib
from sklearn.metrics import confusion_matrix
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
#from sklearn import cross_validation
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score
#import pandas as pd
#import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras import regularizers
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [39]:
##XGBOOST
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import get_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
import pickle

### XGBoost (Extreme Gradient Boosting)

Ever since its introduction in 2014, Extreme Gradient Boosting with XGBoost has been admired as the holy grail of machine learning competitions. From predicting ad click-through rates to classifying high energy physics events, XGBoost has proved its one of the best algorithms in terms of performance and speed. I always used to XGBoost as my first algorithm of building machine learning model because it saves time and gives better accuracy and provide better solution than other machine learning algorithm and it is optimized and distributed gradient boosting library. It uses gradient boosting framework at core.

XGBoost was created by Tianqi Chen, PhD Student, University of Washington. It is used for supervised ML problems. Features of XGBoost algorithm.

1.Parallel Computing: It is enabled with parallel processing (using OpenMP); i.e., when you run XGBoost, by default, it would use all the cores of your laptop/machine.

2.Regularization: I believe this is the biggest advantage of XGBoost. GBM has no provision for regularization. Regularization is a technique used to avoid overfitting in linear and tree-based models.

3.Enabled Cross Validation: In R, we usually use external packages such as caret and mlr to obtain CV results. But, XGBoost is enabled with internal CV function (we'll see below).

4.Missing Values: XGBoost is designed to handle missing values internally. The missing values are treated in such a manner that if there exists any trend in missing values, it is captured by the model.

5.Flexibility: In addition to regression, classification, and ranking problems, it supports user-defined objective functions also. An objective function is used to measure the performance of the model given a certain set of parameters. Furthermore, it supports user defined evaluation metrics as well.

6.Availability: Currently, it is available for programming languages such as R, Python, Java, Julia, and Scala.


### define xgboost classifier

In [40]:
 
clf = xgb.XGBClassifier()

### Parameter tuning code for prevent overfiiting 

In [41]:

param_grid = {
            'silent': [False],
            'max_depth': [5, 10],##15,20
            'learning_rate': [0.1, 0.2, 0.3],
            'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
            'gamma': [0, 0.25, 0.5, 1.0],
            'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0],  
            'n_estimators': [20]}
    
fit_params = {'eval_metric': 'logloss',
                 'early_stopping_rounds': 8, ##10
                  'eval_set': [(X_test, y_test)]}
    
rs_clf = RandomizedSearchCV(clf, param_grid, n_iter=10,
                                n_jobs=1, verbose=2, cv=2,
                                fit_params=fit_params,
                                scoring='neg_log_loss', random_state=50) #, refit=False

###  training the data with the help of xgboost algorithm

In [42]:

rs_clf.fit(X_train, y_train)


"fit_params" as a constructor argument was deprecated in version 0.19 and will be removed in version 0.21. Pass fit parameters to the "fit" method instead.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] subsample=0.9, silent=False, reg_lambda=10.0, n_estimators=20, min_child_weight=10.0, max_depth=10, learning_rate=0.1, gamma=0.5, colsample_bytree=0.9, colsample_bylevel=0.8 
[0]	validation_0-logloss:0.637164
Will train until validation_0-logloss hasn't improved in 8 rounds.
[1]	validation_0-logloss:0.591096
[2]	validation_0-logloss:0.552585
[3]	validation_0-logloss:0.519674
[4]	validation_0-logloss:0.491904
[5]	validation_0-logloss:0.467967
[6]	validation_0-logloss:0.447061
[7]	validation_0-logloss:0.429132
[8]	validation_0-logloss:0.41306
[9]	validation_0-logloss:0.398693
[10]	validation_0-logloss:0.386282
[11]	validation_0-logloss:0.37501
[12]	validation_0-logloss:0.364995
[13]	validation_0-logloss:0.356148
[14]	validation_0-logloss:0.348034
[15]	validation_0-logloss:0.341543
[16]	validation_0-logloss:0.33558
[17]	validation_0-logloss:0.330191
[18]	validation_0-logloss:0.324972
[19]	validation_0-logloss:0.320538
[CV] 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.4s remaining:    0.0s


[0]	validation_0-logloss:0.637115
Will train until validation_0-logloss hasn't improved in 8 rounds.
[1]	validation_0-logloss:0.593241
[2]	validation_0-logloss:0.554015
[3]	validation_0-logloss:0.522128
[4]	validation_0-logloss:0.493508
[5]	validation_0-logloss:0.468819
[6]	validation_0-logloss:0.447355
[7]	validation_0-logloss:0.428482
[8]	validation_0-logloss:0.412377
[9]	validation_0-logloss:0.39831
[10]	validation_0-logloss:0.385502
[11]	validation_0-logloss:0.374562
[12]	validation_0-logloss:0.364731
[13]	validation_0-logloss:0.356299
[14]	validation_0-logloss:0.348402
[15]	validation_0-logloss:0.34157
[16]	validation_0-logloss:0.335174
[17]	validation_0-logloss:0.329552
[18]	validation_0-logloss:0.324595
[19]	validation_0-logloss:0.320213
[CV]  subsample=0.9, silent=False, reg_lambda=10.0, n_estimators=20, min_child_weight=10.0, max_depth=10, learning_rate=0.1, gamma=0.5, colsample_bytree=0.9, colsample_bylevel=0.8, total=  50.0s
[CV] subsample=0.7, silent=False, reg_lambda=5.0, 

[7]	validation_0-logloss:0.312141
[8]	validation_0-logloss:0.305932
[9]	validation_0-logloss:0.301646
[10]	validation_0-logloss:0.298827
[11]	validation_0-logloss:0.297614
[12]	validation_0-logloss:0.295884
[13]	validation_0-logloss:0.295063
[14]	validation_0-logloss:0.29439
[15]	validation_0-logloss:0.294269
[16]	validation_0-logloss:0.294961
[17]	validation_0-logloss:0.294894
[18]	validation_0-logloss:0.29528
[19]	validation_0-logloss:0.295068
[CV]  subsample=0.5, silent=False, reg_lambda=10.0, n_estimators=20, min_child_weight=7.0, max_depth=10, learning_rate=0.3, gamma=0.5, colsample_bytree=1.0, colsample_bylevel=0.5, total=  35.2s
[CV] subsample=0.5, silent=False, reg_lambda=10.0, n_estimators=20, min_child_weight=7.0, max_depth=10, learning_rate=0.3, gamma=0.5, colsample_bytree=1.0, colsample_bylevel=0.5 
[0]	validation_0-logloss:0.542159
Will train until validation_0-logloss hasn't improved in 8 rounds.
[1]	validation_0-logloss:0.45945
[2]	validation_0-logloss:0.407861
[3]	valid

[16]	validation_0-logloss:0.286856
[17]	validation_0-logloss:0.284937
[18]	validation_0-logloss:0.283888
[19]	validation_0-logloss:0.28273
[CV]  subsample=0.9, silent=False, reg_lambda=10.0, n_estimators=20, min_child_weight=0.5, max_depth=5, learning_rate=0.2, gamma=1.0, colsample_bytree=0.8, colsample_bylevel=0.9, total=  29.7s
[CV] subsample=0.6, silent=False, reg_lambda=5.0, n_estimators=20, min_child_weight=10.0, max_depth=10, learning_rate=0.2, gamma=0, colsample_bytree=0.5, colsample_bylevel=1.0 
[0]	validation_0-logloss:0.588783
Will train until validation_0-logloss hasn't improved in 8 rounds.
[1]	validation_0-logloss:0.522234
[2]	validation_0-logloss:0.469519
[3]	validation_0-logloss:0.433066
[4]	validation_0-logloss:0.403584
[5]	validation_0-logloss:0.382888
[6]	validation_0-logloss:0.364828
[7]	validation_0-logloss:0.350381
[8]	validation_0-logloss:0.3376
[9]	validation_0-logloss:0.329223
[10]	validation_0-logloss:0.3219
[11]	validation_0-logloss:0.315225
[12]	validation_0-

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 12.7min finished


[0]	validation_0-logloss:0.586192
Will train until validation_0-logloss hasn't improved in 8 rounds.
[1]	validation_0-logloss:0.513609
[2]	validation_0-logloss:0.461814
[3]	validation_0-logloss:0.423407
[4]	validation_0-logloss:0.393382
[5]	validation_0-logloss:0.370723
[6]	validation_0-logloss:0.352658
[7]	validation_0-logloss:0.338085
[8]	validation_0-logloss:0.326855
[9]	validation_0-logloss:0.317892
[10]	validation_0-logloss:0.310224
[11]	validation_0-logloss:0.304052
[12]	validation_0-logloss:0.299217
[13]	validation_0-logloss:0.295189
[14]	validation_0-logloss:0.291533
[15]	validation_0-logloss:0.288895
[16]	validation_0-logloss:0.286756
[17]	validation_0-logloss:0.284952
[18]	validation_0-logloss:0.283178
[19]	validation_0-logloss:0.281957


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params={'eval_metric': 'logloss', 'early_stopping_rounds': 8, 'eval_set': [(           volume  close    open  returnsClosePrevRaw1  returnsOpenPrevRaw1  \
18811         NaN    NaN     NaN                   NaN                  NaN
26023         NaN    NaN     NaN                   NaN        ...015     0.0
23061    0.0
24750    0.0
Name: returnsOpenNextMktres10, Length: 6000, dtype: float64)]},
          iid='warn', n_iter=10, n_jobs=1,
          param_distributions={'silent': [False], 'max_depth': [5, 10], 'learn

### Defing function for calculting threshold

In [83]:
##Sensitivity = 1-specificity(roc curve, where sensitivity and specificity cut at same point = optimal threshold)
## The optimal cut off point would be where true positive rate is high and the false positive rate is low.
##Based on this logic


#### ROC Curve: 
This is a commonly used graph that summarizes the performance of a classifier over all possible thresholds. It is generated by plotting the True Positive Rate (y-axis) against the False Positive Rate (x-axis) as you vary the threshold for assigning observations to a given class

In [43]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
def Find_Optimal_Cutoff(target, predicted):
    fpr, tpr, threshold = roc_curve(target, predicted)
    i = np.arange(len(tpr))
    roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index=i), 'threshold' : pd.Series(threshold, index=i)})
    roc_t = roc.ix[(roc.tf-0).abs().argsort()[:1]]
    return list(roc_t['threshold'])

### Predicting Probability on Train Data

In [44]:
nn_train_pred = rs_clf.predict_proba(X_train)[:,1]
nn_train_pred

array([0.47086924, 0.43957728, 0.53854614, ..., 0.00905202, 0.39770377,
       0.00905202], dtype=float32)

### Find optimal probability threshold

In [45]:

threshold = Find_Optimal_Cutoff(y_train, nn_train_pred)
print(threshold)

[0.43181565403938293]




.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated



### Confusion Matrix
A confusion matrix is a table that is often used to describe the performance of a classification model (or "classifier") on a set of test data for which the true values are known.



### Building Confusion Matrix

In [46]:
from sklearn.metrics import confusion_matrix
nn_train_pred1 = np.where(nn_train_pred >=threshold,1,0)
cm1=confusion_matrix(y_train, nn_train_pred1)
cm1

array([[15987,  2688],
       [  765,  4560]], dtype=int64)

### Sensitivity, Specificity and Accuracy


            Predicted No    Predicted Yes
            
Actual No    TN= 15987                 FP=2688     Total of Actual No = 18675

Actual yes   FN= 765                   TP=4560     Total of Actual yes = 5325
            Total= 16752               Total= 7248
            
            
### Sensitivity or Recall
TPR - When it actual yes and how often does predict yes
TP/actual yes = 4560/765+4560= 0.856
### Specificity
TNR - When it actually no how often does it predict no
TN/actual no = 15987/15987+2688 = 0.856

### Accuracy
Overall, how often is the classifier correct..
(TP+TN)/Total = (15987+4560)/24000 = 0.856


### Calculating Sensitivity, Specificity andn Accuracy

In [47]:
specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
sensitivity= cm1[1,1]/(cm1[1,0]+cm1[1,1])
print(sensitivity)
print(specificity)
accuracy_score(y_train, nn_train_pred1)

0.856338028169014
0.8560642570281124


0.856125

### Predict Probability on Test Data

In [48]:
nn_test_pred =  rs_clf.predict_proba(X_test)[:,1]
nn_test_pred

array([0.00905202, 0.00905202, 0.23425336, ..., 0.29610324, 0.00905202,
       0.00905202], dtype=float32)

### Confusion Matrix

In [49]:
nn_test_pred1 = np.where(nn_test_pred >=threshold,1,0)
cm2=confusion_matrix(y_test, nn_test_pred1)

cm2

array([[3961,  732],
       [ 212, 1095]], dtype=int64)

### calculating Sensitivity Specificity and Accuracy

In [50]:
specificity = cm2[0,0]/(cm2[0,0]+cm2[0,1])
sensitivity= cm2[1,1]/(cm2[1,0]+cm2[1,1])
print(sensitivity)
print(specificity)
accuracy_score(y_test, nn_test_pred1)

0.837796480489671
0.8440230129980822


0.8426666666666667

### LGB

In [51]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import get_scorer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import lightgbm as lgb
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone

import pickle

In [52]:
# Set up decay learning rate
def learning_rate_power(current_round):
    base_learning_rate = 0.19000424246380565
    min_learning_rate = 0.01
    lr = base_learning_rate * np.power(0.995,current_round)
    return max(lr, min_learning_rate)

In [53]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# tune_params = {'n_estimators': [200,500,1000,2500,5000],
#               'max_depth': sp_randint(4,12),
#                'learning_rate' : 0.005,
#                'bagging_fraction' : 0.7,
#                'feature_fraction' : 0.5,
#                'bagging_frequency' : 6,
#                'bagging_seed' : 42,
#               'colsample_bytree':sp_uniform(loc=0.8, scale=0.15),
#               'min_child_samples':sp_randint(60,120),
#               'subsample': sp_uniform(loc=0.75, scale=0.25),
#               'reg_lambda':[1e-3, 1e-2, 1e-1, 1]}
tune_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'metric_freq': 1,
    'is_training_metric': True,
    'max_bin': 255,
    'learning_rate': 0.1,
    'num_leaves': 63,
    'tree_learner': 'serial',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 5,
    'is_enable_sparse': True,
    'use_two_round_loading': False,
    'is_save_binary_file': False,
    'output_model': 'LightGBM_model.txt',
    'num_machines': 1,
    'local_listen_port': 12400,
    'machine_list_file': 'mlist.txt',
    'verbose': 0,
    # parameters to keep the exactly the same
    'subsample_for_bin': 200000,
    'min_child_samples': 20,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'colsample_bytree': 1.0,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0
}

fit_params = {'early_stopping_rounds':40,
              'eval_metric': 'accuracy',
              'eval_set': [(X_train, y_train), (X_test, y_test)],
              'verbose': 20,
              'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_power)]}

In [54]:
lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='binary',random_state=1)
gs = RandomizedSearchCV(estimator=lgb_clf, 
                        param_distributions=tune_params, 
                        n_iter=10,
                        scoring='f1',
                        cv=5,
                        refit=True,
                        random_state=1,
                        verbose=True)

In [55]:
lgb_clf = lgb.LGBMClassifier(n_jobs=4,
                             objective='multiclass',
                            random_state=100)
opt_params = {'n_estimators':500,
              'boosting_type': 'dart',
              'objective': 'binary',
              'num_leaves':2452,
              'min_child_samples':212,
              'reg_lambda':0.01}
lgb_clf.set_params(**opt_params)
lgb_clf.fit(X_train, y_train,**fit_params)


Early stopping is not available in dart mode



[20]	training's binary_logloss: 0.579053	valid_1's binary_logloss: 0.581023
[40]	training's binary_logloss: 0.589611	valid_1's binary_logloss: 0.591549
[60]	training's binary_logloss: 0.595046	valid_1's binary_logloss: 0.596921
[80]	training's binary_logloss: 0.593917	valid_1's binary_logloss: 0.595834
[100]	training's binary_logloss: 0.58821	valid_1's binary_logloss: 0.590276
[120]	training's binary_logloss: 0.580101	valid_1's binary_logloss: 0.582348
[140]	training's binary_logloss: 0.570942	valid_1's binary_logloss: 0.573409
[160]	training's binary_logloss: 0.561592	valid_1's binary_logloss: 0.564298
[180]	training's binary_logloss: 0.552528	valid_1's binary_logloss: 0.555491
[200]	training's binary_logloss: 0.544012	valid_1's binary_logloss: 0.547225
[220]	training's binary_logloss: 0.536165	valid_1's binary_logloss: 0.539624
[240]	training's binary_logloss: 0.529034	valid_1's binary_logloss: 0.53272
[260]	training's binary_logloss: 0.522595	valid_1's binary_logloss: 0.526478
[280]

LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=212, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=4, num_leaves=2452, objective='binary',
        random_state=100, reg_alpha=0.0, reg_lambda=0.01, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [56]:
print('Training accuracy: ', accuracy_score(y_train, lgb_clf.predict(X_train)))
print('Test accuracy: ', accuracy_score(y_test, lgb_clf.predict(X_test)))

Training accuracy:  0.8557083333333333
Test accuracy:  0.8426666666666667


In [57]:
nn_train_pred = lgb_clf.predict_proba(X_train)[:,1]
nn_train_pred

array([0.48797667, 0.49387833, 0.52861003, ..., 0.29747057, 0.44331762,
       0.29747056])

In [58]:
# Find optimal probability threshold
    threshold = Find_Optimal_Cutoff(y_train, nn_train_pred)
    print(threshold)

[0.46296081481892276]




.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated



In [59]:
from sklearn.metrics import confusion_matrix
nn_train_pred1 = np.where(nn_train_pred >=threshold,1,0)
cm1=confusion_matrix(y_train, nn_train_pred1)
cm1

array([[15838,  2837],
       [  809,  4516]], dtype=int64)

In [60]:
specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
sensitivity= cm1[1,1]/(cm1[1,0]+cm1[1,1])
print(sensitivity)
print(specificity)
accuracy_score(y_train, nn_train_pred1)

0.848075117370892
0.8480856760374833


0.8480833333333333

In [61]:
nn_test_pred =  lgb_clf.predict_proba(X_test)[:,1]
nn_test_pred

array([0.29802105, 0.29747057, 0.42032811, ..., 0.40774574, 0.29747056,
       0.29747056])

In [62]:
nn_test_pred1 = np.where(nn_test_pred >=threshold,1,0)
cm2=confusion_matrix(y_test, nn_test_pred1)

cm2

array([[3936,  757],
       [ 225, 1082]], dtype=int64)

In [63]:
specificity = cm2[0,0]/(cm2[0,0]+cm2[0,1])
sensitivity= cm2[1,1]/(cm2[1,0]+cm2[1,1])
print(sensitivity)
print(specificity)
accuracy_score(y_test, nn_test_pred1)

0.827850038255547
0.8386959301086725


0.8363333333333334