In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor

In [2]:
df = pd.read_csv("data_bucket/finance_final.csv")
df.head()

Unnamed: 0,ticker,quarter,quarter_end,Q_End_Price,prev_quarter,prev_quarter_end,Prev_Q_End_Price,CoQ #,CoQ %,Up/Down
0,NVR,Q4-24,31/12/2024,8178.899902,Q3-24,30/09/2024,9811.799805,-1632.899903,-17%,Down
1,NVR,Q2-24,30/06/2024,7588.560059,Q1-24,31/03/2024,8099.959961,-511.399902,-6%,Down
2,NVR,Q3-24,30/09/2024,9811.799805,Q2-24,30/06/2024,7588.560059,2223.239746,29%,Up
3,NVR,Q1-24,31/03/2024,8099.959961,Q4-23,31/12/2023,7000.450195,1099.509766,16%,Up
4,NVR,Q3-23,30/09/2023,5963.299805,Q2-23,30/06/2023,6350.620117,-387.320312,-6%,Down


In [3]:
df["Up/Down"].value_counts()

Up/Down
Up      6540
Down    4330
Name: count, dtype: int64

In [4]:
df['CoQ % Numeric'] = df['CoQ %'].str.replace('%', '', regex=False).astype(float)

In [5]:
df.shape

(10870, 11)

In [6]:
filtered_df = df[abs(df['CoQ % Numeric']) > 20]

In [7]:
#filtered_df.sort_values(by = ["CoQ % Numeric"], ascending = False).head(n=30)

In [8]:
df = df[["ticker", "quarter", "Up/Down"]]

In [9]:
df.head()

Unnamed: 0,ticker,quarter,Up/Down
0,NVR,Q4-24,Down
1,NVR,Q2-24,Down
2,NVR,Q3-24,Up
3,NVR,Q1-24,Up
4,NVR,Q3-23,Down


In [10]:
sectors = pd.read_csv("data_bucket/ticker_sector_industry.csv")
sectors.head()

Unnamed: 0,ticker,sector,industry
0,A,Healthcare,Diagnostics & Research
1,AAL,Industrials,Airlines
2,AAP,Consumer Cyclical,Auto Parts
3,AAPL,Technology,Consumer Electronics
4,ABBV,Healthcare,Drug Manufacturers - General


In [11]:
df = df.merge(sectors, how = "left", on = "ticker")

In [12]:
df['q_num'] = df['quarter'].str.extract(r'Q([1-4])').astype(str)

In [13]:
sentiment = pd.read_csv("cc_ignore.csv")
sentiment = sentiment[["ticker", "quarter_year", "net_sentiment"]]
sentiment = sentiment.rename(columns = {"quarter_year" : "quarter"})

In [14]:
sentiment = sentiment.drop_duplicates()

In [15]:
sentiment["quarter"].value_counts()

quarter
Q4-21    534
Q4-22    534
Q3-23    530
Q3-22    511
Q3-21    505
Q2-23    503
Q3-24    502
Q1-24    501
Q2-22    501
Q2-20    500
Q2-21    500
Q2-24    499
Q4-20    497
Q3-19    497
Q2-19    496
Q1-20    495
Q4-19    494
Q1-19    494
Q3-20    493
Q1-21    489
Q1-22    479
Q1-23    467
Q4-24     49
Name: count, dtype: int64

In [16]:
sentiment = sentiment.sort_values(by = "quarter", ascending = True)
sentiment[sentiment["quarter"] == "Q3-24"]

Unnamed: 0,ticker,quarter,net_sentiment
5991,STZ,Q3-24,
9598,ULTA,Q3-24,
6034,CLX,Q3-24,
9986,PYPL,Q3-24,
8823,LKQ,Q3-24,
...,...,...,...
6667,NUE,Q3-24,
10084,BKR,Q3-24,
7550,HON,Q3-24,
8281,PPL,Q3-24,


In [17]:
df = df.merge(sentiment, how = "left", on = ["quarter", "ticker"])

In [18]:
df = df.dropna(axis="index", subset=['net_sentiment'])

In [19]:
df.shape

(5075, 7)

In [20]:
df["sector"].value_counts()

sector
Technology                751
Industrials               714
Healthcare                669
Consumer Cyclical         653
Financial Services        581
Consumer Defensive        376
Real Estate               315
Utilities                 304
Communication Services    246
Basic Materials           238
Energy                    228
Name: count, dtype: int64

In [21]:
#df = df.sort_values(by = "quarter", ascending = True).reset_index(drop=True)

In [22]:
df.tail()

Unnamed: 0,ticker,quarter,Up/Down,sector,industry,q_num,net_sentiment
10967,F,Q1-21,Up,Consumer Cyclical,Auto Manufacturers,1,-0.026846
10968,NVDA,Q2-23,Up,Technology,Semiconductors,2,-0.141026
10975,NVDA,Q3-23,Up,Technology,Semiconductors,3,0.108434
10977,F,Q4-20,Up,Consumer Cyclical,Auto Manufacturers,4,-0.109344
10985,FTI,Q4-20,Up,Energy,Oil & Gas Equipment & Services,4,-0.086294


In [23]:
df["quarter"].value_counts()

quarter
Q3-23    483
Q4-22    467
Q2-23    450
Q3-22    449
Q4-21    440
Q2-22    431
Q2-21    420
Q1-22    410
Q1-21    406
Q1-23    403
Q4-20    362
Q3-21    354
Name: count, dtype: int64

In [24]:
feature_cols = ['net_sentiment','sector', 'q_num']
X = df[feature_cols]
y = df['Up/Down'].map({'Down': 0, 'Up': 1})

In [25]:
train_test_split(stratify/shuffle)

In [26]:
n = len(df)
train_end = int(n * 0.6)
val_end = int(n * 0.8)

X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
X_val, y_val = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]

In [47]:
X_train.q_num.value_counts()

q_num
2    812
3    795
1    729
4    709
Name: count, dtype: int64

In [27]:
X_train.shape, X_test.shape, X_val.shape, y_val.shape, y_train.shape, y_test.shape

((3045, 3), (1015, 3), (1015, 3), (1015,), (3045,), (1015,))

In [28]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3045 entries, 4 to 6234
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   net_sentiment  3045 non-null   float64
 1   sector         3045 non-null   object 
 2   q_num          3045 non-null   object 
dtypes: float64(1), object(2)
memory usage: 95.2+ KB


In [29]:
num_selector = make_column_selector(dtype_include=['float64'])
cat_selector = make_column_selector(dtype_include=['object'])

In [30]:
num_pipeline = make_pipeline(
    StandardScaler()
)

In [31]:
cat_pipeline = make_pipeline(
    OneHotEncoder(sparse_output = False)
)

In [32]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_selector),
        ('cat', cat_pipeline, cat_selector)
    ]
)

In [33]:
preprocessor

In [34]:
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        n_estimators=128,
        learning_rate = 0.02
    ))
])

In [35]:
X_test.dtypes

net_sentiment    float64
sector            object
q_num             object
dtype: object

In [36]:
import time

# Start timer
start_time = time.time()
pipeline.fit(X_train, y_train)
end_time = time.time()

In [37]:
#y_val_pred = pipeline.predict(X_val)
#y_test_pred = pipeline.predict(X_test)

In [38]:
end_time - start_time

0.11840271949768066

In [39]:
preprocessor.set_output(transform='pandas')

In [40]:
preprocessor.fit_transform(X_train)

Unnamed: 0,num__net_sentiment,cat__sector_Basic Materials,cat__sector_Communication Services,cat__sector_Consumer Cyclical,cat__sector_Consumer Defensive,cat__sector_Energy,cat__sector_Financial Services,cat__sector_Healthcare,cat__sector_Industrials,cat__sector_Real Estate,cat__sector_Technology,cat__sector_Utilities,cat__q_num_1,cat__q_num_2,cat__q_num_3,cat__q_num_4
4,-0.988770,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.041506,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,-1.432582,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.196674,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,-0.253311,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6223,-0.525451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6228,0.041506,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6229,-0.516860,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6233,1.747627,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [41]:
y_test_pred = pipeline.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
accuracy_score(y_test, y_test_pred)

0.6206896551724138

In [45]:
y.value_counts(normalize=True)

Up/Down
1    0.5667
0    0.4333
Name: proportion, dtype: float64

In [42]:
#binary model
def initialize_model():
    model = Sequential()
    model.add(Input(shape=(2,)))
    model.add(layers.Dense(20, activation='relu'))
    model.add(layers.Dense(40, activation='relu'))
    model.add(layers.Dense(20, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = initialize_model()

history = model.fit(X_train,
                    y_train,
                    epochs=1,
                    batch_size=16,
                    verbose=0)

NameError: name 'Sequential' is not defined

In [None]:
model = XGBClassifier(random_state=1, use_label_encoder=True, n_estimators=128)