# Financial Model Prediction

In [1]:
import pandas as pd
import numpy as np
import seaborn as sbn
import re

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

import lightgbm as lgb

%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
def add_shifts(df_, col_to_shift, new_col, shift):
    """
    Add a per-ticker difference column to the data.

    For every row, ``new_col`` receives the value of ``col_to_shift`` minus the
    value of the same column ``shift`` rows earlier within the same ticker.
    Rows that have no value ``shift`` positions earlier in their ticker get NaN.
    Rows are compared in their current order, so the frame is expected to be
    sorted chronologically within each ticker.

    The dataframe is modified in place and is also returned.

    :param pd.DataFrame df_: Dataframe with financial data; needs a 'ticker' column.
    :param str col_to_shift: Column the difference is computed on.
    :param str new_col: Name of the column that receives the difference.
    :param int shift: Number of rows to look back (one row per day is assumed).
    :return pd.DataFrame: The same dataframe with ``new_col`` added.
    """

    # A single grouped shift keeps the look-back inside each ticker and avoids
    # rebuilding a boolean mask over the whole frame once per ticker.
    previous = df_.groupby('ticker', sort=False)[col_to_shift].shift(shift)
    df_[new_col] = df_[col_to_shift] - previous

    return df_

In [3]:
def get_non_n_cols(df_, n):
    """
    Get the columns whose trailing window size is different from n.

    A column is considered only when its name ends in one or more digits
    (e.g. 'RSI 21', 'close_shifted_3'); that trailing number is read as the
    window size. Columns whose name does not end in a digit are never returned.

    :param pd.DataFrame df_: Dataframe with financial data.
    :param int n: Window size to exclude.
    :return list: Names of the columns whose trailing number is not n, in column order.
    """

    selected = []
    for col in df_.columns:
        match = re.search(r'\d+$', col)
        # Parse the full trailing number from the match itself rather than a
        # fixed two-character slice, so 'close_shifted_3' and 'SMA 100' work.
        if match is not None and int(match.group()) != n:
            selected.append(col)
    return selected

In [4]:
def interpolate_nan_values(df_, to_interpolate):
    """
    Interpolate and extrapolate NaN values of numerical columns, ticker by ticker.

    Each ticker is processed independently: NaNs inside the series are filled
    by linear interpolation and, because ``limit_direction='both'`` is used,
    leading and trailing NaNs are filled as well. A column that is entirely NaN
    for a ticker stays NaN. The input dataframe is not modified.

    The result is the concatenation of the per-ticker frames in order of first
    appearance of each ticker, so rows end up grouped by ticker (original index
    labels are kept). Rows whose ticker is NaN are not part of any group and
    are absent from the result.

    :param pd.DataFrame df_: Dataframe with financial data with NaN values; needs a 'ticker' column.
    :param list to_interpolate: List with columns to interpolate.
    :return pd.DataFrame: New dataframe with the listed columns interpolated.
    """

    frames = []
    for _, group in df_.groupby('ticker', sort=False):
        # Work on an explicit copy so the assignment below neither touches the
        # caller's frame nor writes into a slice of it.
        group = group.copy()
        for col in to_interpolate:
            group[col] = group[col].interpolate(method='linear', limit_direction='both')
        frames.append(group)

    if not frames:
        # pd.concat raises on an empty list; return an empty frame with the
        # same columns instead.
        return df_.iloc[0:0].copy()
    return pd.concat(frames)

In [5]:
# Load the two input tables from CSV. The paths are relative, so the notebook
# must be run from a directory that has '../data' next to it.
df_categorical = pd.read_csv('../data/db_bsm_categorical.csv')
df_financial = pd.read_csv('../data/db_bsm_financial.csv')

In [18]:
# Treat every 0 in the financial table as a missing value (applies to all
# columns, in place). `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0.
df_financial.replace(0, np.nan, inplace=True)
# Show how many values are now missing in the two columns interpolated next.
df_financial.isnull().sum().to_frame('Null Values').loc[['close', 'volume']]

Unnamed: 0,Null Values
close,1081
volume,1996


In [16]:
# Fill the missing 'close' and 'volume' values ticker by ticker
# (linear interpolation, filling in both directions).
df_financial_not_nan = interpolate_nan_values(df_financial, ['close', 'volume'])

In [19]:
# Sanity check: count the NaNs left in the two interpolated columns.
df_financial_not_nan[['close', 'volume']].isna().sum().to_frame('Null Values')

Unnamed: 0,Null Values
close,0
volume,0


In [21]:
# Look-back sizes; each one is passed as `shift` to add_shifts below and
# becomes the suffix of a 'close_shifted_<n>' column.
num_list = [3, 5, 7, 14, 21]

In [22]:
# Add one 'close_shifted_<n>' column per look-back size.
# NOTE: add_shifts modifies and returns the frame it is given, so `df_fin`
# is the same object as `df_financial_not_nan`, not a copy.
for num_ in num_list:
    df_fin = add_shifts(df_financial_not_nan, 'close', 'close_shifted_%i' % num_, num_)

In [32]:
# Drop the rows where the longest look-back (21) could not be computed, i.e.
# where add_shifts left NaN because the ticker has no row 21 positions earlier.
# This is in place, so it also affects `df_financial_not_nan` (same object).
df_fin.dropna(subset=['close_shifted_21'], inplace=True)

In [47]:
# Interpolate every float column, ticker by ticker, to remove remaining NaNs.
df_fin_not_nan = interpolate_nan_values(
    df_fin,
    df_fin.select_dtypes(float).columns.tolist(),
)

In [48]:
# Show the three columns with the most NaNs left; all zeros means none remain.
(
    df_fin_not_nan
    .isna()
    .sum()
    .to_frame('Null Values')
    .sort_values(by='Null Values', ascending=False)
    .head(3)
)

Unnamed: 0,Null Values
ADX 14,0
RSI 21,0
ROCR 14,0


In [55]:
# Convert 'date' to datetime so the `.dt` accessor can be used on it below.
df_fin_not_nan['date'] = pd.to_datetime(df_fin_not_nan['date'])

In [73]:
# Calendar parts of the date, used as grouping keys.
df_fin_not_nan['year'] = df_fin_not_nan['date'].dt.year
df_fin_not_nan['month'] = df_fin_not_nan['date'].dt.month
# Direction of the 3-row close difference: '+' when it is >= 0, '-' when < 0.
df_fin_not_nan.loc[df_fin_not_nan['close_shifted_3'].ge(0), 'sign'] = '+'
df_fin_not_nan.loc[df_fin_not_nan['close_shifted_3'].lt(0), 'sign'] = '-'

In [74]:
# Summary statistics of the 3-row close difference for every
# (ticker, year, month, sign) combination.
df_fin_not_nan.groupby(['ticker', 'year', 'month', 'sign'])[['close_shifted_3']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,close_shifted_3,close_shifted_3,close_shifted_3,close_shifted_3,close_shifted_3,close_shifted_3,close_shifted_3,close_shifted_3
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,std,min,25%,50%,75%,max
ticker,year,month,sign,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
AAL,2006,2,-,2.0,-0.925000,0.898026,-1.56,-1.2425,-0.925,-0.6075,-0.29
AAL,2006,3,+,13.0,1.859231,0.829623,0.85,1.3300,1.810,2.1200,4.17
AAL,2006,3,-,10.0,-0.893000,0.471641,-1.68,-1.1650,-0.855,-0.4600,-0.41
AAL,2006,4,+,12.0,1.955833,1.710654,0.17,0.6475,1.495,2.6650,5.26
AAL,2006,4,-,7.0,-1.160000,0.959896,-2.65,-1.8550,-0.600,-0.4850,-0.19
AAL,2006,5,+,12.0,2.161667,1.473998,0.37,1.0225,1.665,3.2875,4.54
AAL,2006,5,-,10.0,-1.881000,1.301430,-3.61,-3.0375,-1.550,-0.6925,-0.35
AAL,2006,6,+,14.0,2.178571,1.248765,0.03,1.2325,2.245,3.1650,3.98
AAL,2006,6,-,8.0,-2.778750,1.917233,-5.63,-3.8600,-2.380,-1.5200,-0.12
AAL,2006,7,+,10.0,3.041000,1.764051,0.28,2.3525,2.670,4.6125,5.39
