# Financial Model Prediction

In [97]:
import pandas as pd
import numpy as np
import seaborn as sbn
import re

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

import lightgbm as lgb

%matplotlib inline

In [2]:
def add_shifts(df_, col_to_shift, new_col, shift):
    """
    Add, ticker by ticker, the change of a column over a look-back window.

    For every row the new column holds the value of ``col_to_shift`` minus its
    value ``shift`` rows earlier within the same ticker. Rows that have no
    value ``shift`` rows earlier in their ticker get NaN.

    The dataframe is modified in place and the same object is returned, so
    repeated calls on one frame accumulate columns.

    :param pd.DataFrame df_: Dataframe with financial data; needs a 'ticker' column.
    :param str col_to_shift: Column whose change is computed.
    :param str new_col: Name of the column that receives the change.
    :param int shift: Number of rows to look back within each ticker.
    :return pd.DataFrame: The same dataframe with ``new_col`` added.
    """

    # A single grouped diff replaces one full boolean-mask scan per ticker;
    # the result keeps the original row order, so it can be assigned directly.
    df_[new_col] = df_.groupby('ticker')[col_to_shift].diff(shift)

    return df_

In [3]:
def get_non_n_cols(df_, n):
    """
    Get the columns whose name ends in a number different from n.

    Columns whose name does not end in digits are never returned.

    :param pd.DataFrame df_: Dataframe with financial data.
    :param int n: Window (in days) to exclude.
    :return list: List with the name of the columns, in dataframe order.
    """

    trailing_number = re.compile(r'\d+$')
    selected = []
    for col in df_.columns:
        match = trailing_number.search(col)
        # Parse the whole trailing number rather than the last two characters:
        # 'close_shifted_3' would otherwise yield '_3' (ValueError) and
        # 'ADX 100' would be read as 0.
        if match is not None and int(match.group()) != n:
            selected.append(col)
    return selected

In [4]:
def interpolate_nan_values(df_, to_interpolate):
    """
    Interpolate and extrapolate nan values for numerical columns, ticker by ticker.

    Each listed column is linearly interpolated inside every ticker, with
    ``limit_direction='both'`` so leading and trailing gaps are filled too.
    The input dataframe is not modified. The result is grouped by ticker (in
    order of first appearance) and keeps the original index labels. Rows whose
    ticker is NaN belong to no group and are not part of the result.

    :param pd.DataFrame df_: Dataframe with financial data with NaN values.
    :param list to_interpolate: List with columns to interpolate.
    :return pd.DataFrame: Dataframe with the listed columns interpolated.
    """

    if df_.empty:
        # pd.concat raises on an empty list of frames; nothing to fill anyway.
        return df_.copy()

    frames = []
    # sort=False keeps tickers in order of first appearance.
    for _, ticker_frame in df_.groupby('ticker', sort=False):
        # Work on an explicit copy so the assignment below never targets a
        # slice of the caller's frame.
        ticker_frame = ticker_frame.copy()
        for col in to_interpolate:
            ticker_frame[col] = ticker_frame[col].interpolate(method='linear', limit_direction='both')
        frames.append(ticker_frame)

    if not frames:
        # Every ticker was NaN, so no group was produced.
        return df_.iloc[0:0].copy()
    return pd.concat(frames)

In [95]:
def categorize_each_difference(num_list, df_):
    """
    Categorize the 'close_shifted_<n>' columns as Weak, plain or Strong Bull / Bear.

    For every n in ``num_list`` a row is 'Bull' when its 'close_shifted_<n>'
    value is >= 0 and 'Bear' when it is < 0. Inside each
    (ticker, year, month, sign) group the value is compared with the group's
    quartiles:

    * Bull: <= median -> 'W. Bull', between median and p75 -> 'Bull', >= p75 -> 'S. Bull'.
    * Bear: >= median -> 'W. Bear', between p25 and median -> 'Bear', <= p25 -> 'S. Bear'.

    When a value satisfies both the weak and the strong rule (the quartiles
    coincide), the strong label wins.

    The input dataframe is left untouched. The result contains the original
    columns plus 'cat_close_shifted_<n>' and 'std_<n>' (the group's standard
    deviation) for every n. Rows with a NaN 'close_shifted_<n>' have no sign
    and are dropped by the inner merge; the merge also gives the result a
    fresh index.

    :param list num_list: List with days to categorize.
    :param pd.DataFrame df_: Dataframe to categorize; needs 'ticker', a datetime
        'date' and one 'close_shifted_<n>' column per entry of num_list.
    :return pd.DataFrame: Dataframe recalculated.
    """
    cols_to_keep = list(df_.columns)
    # assign() returns a new frame, so the helper columns created here never
    # leak into the caller's dataframe.
    df_ = df_.assign(year=df_['date'].dt.year, month=df_['date'].dt.month)

    for num_ in num_list:
        shifted_col = 'close_shifted_%i' % num_
        sign_col = 'sign_%i' % num_
        cat_col = 'cat_close_shifted_%i' % num_
        std_col = 'std_%i' % num_
        keys = ['ticker', 'year', 'month', sign_col]

        # NaN differences match neither mask and keep a NaN sign.
        df_.loc[df_[shifted_col] >= 0, sign_col] = 'Bull'
        df_.loc[df_[shifted_col] < 0, sign_col] = 'Bear'

        # Only the quartiles and the standard deviation are needed, so compute
        # just those instead of the full describe() summary.
        grouped = df_.groupby(keys)[shifted_col]
        stats = pd.concat(
            [
                grouped.quantile(0.25).rename('25%'),
                grouped.quantile(0.50).rename('50%'),
                grouped.quantile(0.75).rename('75%'),
                grouped.std().rename(std_col),
            ],
            axis='columns',
        ).reset_index()
        df_ = pd.merge(left=df_, right=stats, on=keys, how='inner')

        value = df_[shifted_col]
        p25, p50, p75 = df_['25%'], df_['50%'], df_['75%']
        is_bull = df_[sign_col] == 'Bull'
        is_bear = df_[sign_col] == 'Bear'

        # Applied in order: a later rule overwrites an earlier one, which is
        # how the strong label wins when the quartiles coincide.
        rules = [
            (is_bull & (value <= p50), 'W. Bull'),
            (is_bull & (value > p50) & (value < p75), 'Bull'),
            (is_bull & (value >= p75), 'S. Bull'),
            (is_bear & (value >= p50), 'W. Bear'),
            (is_bear & (value < p50) & (value > p25), 'Bear'),
            (is_bear & (value <= p25), 'S. Bear'),
        ]
        for mask, label in rules:
            df_.loc[mask, cat_col] = label

        # The quartile columns are reused by the next window, so remove them.
        df_ = df_.drop(['25%', '50%', '75%'], axis='columns')
        cols_to_keep.extend([cat_col, std_col])
    return df_[cols_to_keep]

In [5]:
# Daily financial series and per-ticker categorical attributes.
df_financial = pd.read_csv('../data/db_bsm_financial.csv')
df_categorical = pd.read_csv('../data/db_bsm_categorical.csv')

In [6]:
# Turn every 0 into NaN so zeros are counted (and later filled) as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_financial.replace(0, np.nan, inplace=True)
# Display cell: missing-value counts for price and volume.
df_financial.isnull().sum().to_frame('Null Values').loc[['close', 'volume']]

Unnamed: 0,Null Values
close,1081
volume,1996


In [7]:
# Fill the gaps in price and volume, ticker by ticker.
df_financial_not_nan = interpolate_nan_values(
    df_=df_financial,
    to_interpolate=['close', 'volume'],
)

In [8]:
# Display cell: confirm no missing price or volume remains.
df_financial_not_nan[['close', 'volume']].isnull().sum().to_frame('Null Values')

Unnamed: 0,Null Values
close,0
volume,0


In [9]:
# Look-back windows, in rows per ticker; each one is passed to add_shifts as
# the shift and names a 'close_shifted_<n>' column.
num_list = [3, 5, 7, 14, 21]

In [10]:
# Thread the frame through the loop so the result does not depend on
# add_shifts mutating its argument, and df_fin exists even for an empty list.
df_fin = df_financial_not_nan
for num_ in num_list:
    df_fin = add_shifts(df_fin, 'close', 'close_shifted_%i' % num_, num_)

In [11]:
# Rows without a value 21 rows earlier in their ticker have a NaN
# 'close_shifted_21'; drop them (21 is the largest window in num_list).
# NOTE: add_shifts returns the frame it was given, so df_fin is the same
# object as df_financial_not_nan and this in-place drop affects both names.
df_fin.dropna(subset=['close_shifted_21'], inplace=True)

In [12]:
# Fill the remaining gaps in every float column, ticker by ticker.
df_fin_not_nan = interpolate_nan_values(
    df_=df_fin,
    to_interpolate=df_fin.select_dtypes(float).columns.tolist(),
)

In [13]:
# Display cell: the three columns with the most missing values after
# interpolation, as a check that none is left.
df_fin_not_nan.isnull().sum().to_frame('Null Values').sort_values(by='Null Values', ascending=False).head(3)

Unnamed: 0,Null Values
ADX 14,0
RSI 21,0
ROCR 14,0


In [14]:
# Parse 'date' into datetimes; categorize_each_difference reads .dt.year and
# .dt.month from this column.
df_fin_not_nan['date'] = pd.to_datetime(df_fin_not_nan['date'])

In [91]:
# Label every look-back window in num_list; the %time magic reports how long
# the call takes.
%time df_final = categorize_each_difference(num_list, df_fin_not_nan)

Wall time: 4min 38s


In [103]:
# Keep only complete rows, then a single row (the first) per ticker.
df_categorical = (
    df_categorical
    .dropna()
    .drop_duplicates(subset=['ticker'], keep='first')
)

In [104]:
# Attach the categorical attributes; tickers missing on either side are dropped.
df_final = df_final.merge(df_categorical, on='ticker', how='inner')

In [None]:
# One-hot encode the sector. `columns` must be list-like: passing the bare
# string 'sector_gics' makes pd.get_dummies raise a TypeError.
df_final = pd.get_dummies(df_final, columns=['sector_gics'], prefix='sector_gics_')